From 80801aaeba2d29e3a375a01d782cf081e778dfaf Mon Sep 17 00:00:00 2001
From: Joanna Hulboj <joanna.hulboj@isode.com>
Date: Fri, 27 Jan 2017 10:03:51 +0000
Subject: Add checks to verify if the input data is a valid XML/XMPP

Test-Information:
Unit tests pass OK.

Change-Id: Ibd7897c1c1249af7ac60a6549afd4c2362fcf0c0

diff --git a/COPYING.dependencies b/COPYING.dependencies
index d233d9f..08823e8 100644
--- a/COPYING.dependencies
+++ b/COPYING.dependencies
@@ -12,6 +12,7 @@ Source and binary distributions of this project may also contain third-party lib
 - libnatpmp
 Binary distributions may contain these third-party libraries, whose licenses are also below.
 - OpenSSL
+- UTF-8 DFA Decoder
 
 ===============
 Google Breakpad
@@ -832,3 +833,16 @@ derivative of this code cannot be changed. i.e. this code cannot simply be
 copied and put under another distribution licence
 [including the GNU Public Licence.]
 
+=================
+UTF-8 DFA Decoder
+=================
+
+http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
+
+Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/Swiften/Base/String.cpp b/Swiften/Base/String.cpp
index 4a5e4c9..bbc3003 100644
--- a/Swiften/Base/String.cpp
+++ b/Swiften/Base/String.cpp
@@ -17,6 +17,68 @@
 #include <Swiften/Base/String.h>
 #include <Swiften/Base/ByteArray.h>
 
+namespace {
+const static std::uint32_t UTF8_ACCEPT = 0;
+// The reject state is absorbing: every transition out of it leads back to it.
+const static std::uint32_t UTF8_REJECT = 1;
+
+// Character-class table (first 256 entries) followed by the state-transition
+// table (16 entries per state) of Bjoern Hoehrmann's UTF-8 DFA decoder.
+const static std::uint8_t UTF8D[] = {
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
+    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
+    8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
+    0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
+    0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
+    0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
+    1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
+    1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
+    1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
+};
+
+//http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
+// Feeds one byte into the DFA, updating state and the partially decoded
+// codepoint, and returns the new state.
+std::uint32_t decode(std::uint32_t & state, std::uint32_t & codepoint, std::uint8_t byte) {
+    const auto type = UTF8D[byte];
+    codepoint = (state != UTF8_ACCEPT) ? (byte & 0x3fu) | (codepoint << 6) : (0xff >> type) & (byte);
+    state = UTF8D[256 + state * 16 + type];
+    return state;
+}
+
+// Decodes the UTF-8 sequence starting at begin. On success ok is true and
+// consumed holds the number of bytes of the sequence; on a malformed or
+// truncated sequence ok is false and consumed must not be relied upon.
+std::uint32_t getNextCodepoint(const char * begin, const char * end, std::size_t & consumed, bool & ok) {
+    consumed = 0;
+    ok = true;
+
+    std::uint32_t state = UTF8_ACCEPT;
+    std::uint32_t codepoint = 0;
+
+    for (auto it = begin; it != end; ++it) {
+        ++consumed;
+        const auto newState = decode(state, codepoint, static_cast<std::uint8_t>(*it));
+        if (newState == UTF8_ACCEPT) {
+            return codepoint;
+        }
+        if (newState == UTF8_REJECT) {
+            // Stop at the first bad byte instead of scanning to the end of the
+            // input, which made sanitizing malformed data quadratic.
+            break;
+        }
+    }
+    ok = (state == UTF8_ACCEPT);
+    return codepoint;
+}
+
+}
+
 namespace Swift {
 
 static inline size_t sequenceLength(char firstByte) {
@@ -86,6 +148,61 @@ void String::replaceAll(std::string& src, char c, const std::string& s) {
     }
 }
 
+// Returns true if codepoint is allowed by the XML 1.0 Char production and is
+// not one of the control or noncharacter codepoints XML 1.0 discourages.
+bool String::isValidXMPPCharacter(std::uint32_t codepoint) {
+    // Special accepted characters:
+    if (codepoint == '\t' || codepoint == '\r' || codepoint == '\n')
+        return true;
+    // Discouraged characters:
+    if (codepoint >= 0x7Fu && codepoint <= 0x84u)
+        return false;
+    if (codepoint >= 0x86u && codepoint <= 0x9Fu)
+        return false;
+    if (codepoint >= 0xFDD0u && codepoint <= 0xFDEFu)
+        return false;
+    // U+xFFFE and U+xFFFF are noncharacters in every plane.
+    if ((codepoint & 0xFFFEu) == 0xFFFEu)
+        return false;
+    // Other valid characters (after filtering for discouraged ones above)
+    if (codepoint >= 0x20u && codepoint <= 0xD7FFu)
+        return true;
+    if (codepoint >= 0xE000u && codepoint <= 0xFFFDu)
+        return true;
+    if (codepoint >= 0x10000u && codepoint <= 0x10FFFFu)
+        return true;
+    return false;
+}
+
+// Returns a copy of input with malformed UTF-8 bytes and codepoints rejected
+// by isValidXMPPCharacter removed.
+std::string String::sanitizeXMPPString(const std::string& input) {
+    std::string result;
+    result.reserve(input.length());
+
+    auto it = input.data();
+    const auto end = it + input.length();
+
+    std::size_t consumed = 0;
+    bool status = true;
+
+    while (it < end) {
+        const auto codepoint = getNextCodepoint(it, end, consumed, status);
+        if (status) {
+            if (isValidXMPPCharacter(codepoint)) {
+                result.append(it, consumed);
+            }
+            it += consumed;
+        }
+        else {
+            // Malformed sequence: drop one byte and resynchronise on the next.
+            ++it;
+        }
+    }
+    result.shrink_to_fit();
+    return result;
+}
+
 std::vector<std::string> String::split(const std::string& s, char c) {
     assert((c & 0x80) == 0);
     std::vector<std::string> result;
@@ -119,7 +236,6 @@ int String::convertHexStringToInt(const std::string& s) {
     return h;
 }
 
-
 #ifdef SWIFTEN_PLATFORM_WINDOWS
 std::string convertWStringToString(const std::wstring& s) {
     int utf8Size = WideCharToMultiByte(CP_UTF8, 0, s.c_str(), -1, NULL, 0, NULL, NULL);
diff --git a/Swiften/Base/String.h b/Swiften/Base/String.h
index f19dca5..3a7ca65 100644
--- a/Swiften/Base/String.h
+++ b/Swiften/Base/String.h
@@ -6,6 +6,7 @@
 
 #pragma once
 
+#include <cstdint>
 #include <sstream>
 #include <string>
 #include <vector>
@@ -21,6 +22,8 @@ namespace Swift {
         SWIFTEN_API std::pair<std::string, std::string> getSplittedAtFirst(const std::string&, char c);
         SWIFTEN_API std::vector<std::string> split(const std::string&, char c);
         SWIFTEN_API void replaceAll(std::string&, char c, const std::string& s);
+        SWIFTEN_API bool isValidXMPPCharacter(std::uint32_t codepoint);
+        SWIFTEN_API std::string sanitizeXMPPString(const std::string& input);
 
         inline bool beginsWith(const std::string& s, char c) {
             return s.size() > 0 && s[0] == c;
diff --git a/Swiften/Base/UnitTest/StringTest.cpp b/Swiften/Base/UnitTest/StringTest.cpp
index e2e1665..889c9c7 100644
--- a/Swiften/Base/UnitTest/StringTest.cpp
+++ b/Swiften/Base/UnitTest/StringTest.cpp
@@ -12,9 +12,11 @@
 #include <Swiften/Base/Platform.h>
 #include <Swiften/Base/String.h>
 
+#include <boost/format.hpp>
+
 using namespace Swift;
 
 class StringTest : public CppUnit::TestFixture {
     CPPUNIT_TEST_SUITE(StringTest);
     CPPUNIT_TEST(testGetUnicodeCodePoints);
     CPPUNIT_TEST(testGetSplittedAtFirst);
@@ -25,6 +27,8 @@ class StringTest : public CppUnit::TestFixture {
     CPPUNIT_TEST(testReplaceAll_LastChar);
     CPPUNIT_TEST(testReplaceAll_ConsecutiveChars);
     CPPUNIT_TEST(testReplaceAll_MatchingReplace);
+    CPPUNIT_TEST(testIsValidXMPPCharacter);
+    CPPUNIT_TEST(testSanitizeXMPPString);
     CPPUNIT_TEST(testSplit);
 #ifdef SWIFTEN_PLATFORM_WINDOWS
     CPPUNIT_TEST(testConvertWStringToString);
@@ -107,6 +111,47 @@ class StringTest : public CppUnit::TestFixture {
             CPPUNIT_ASSERT_EQUAL(std::string("abbbc"), testling);
         }
 
+        void testIsValidXMPPCharacter() {
+            const std::uint32_t testCharacters[] = {
+                0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
+                0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
+                0x20, 0x7F, 0x80, 0x84, 0x85, 0xFF,
+                0xD7FF, 0xD800, 0xFDD0, 0xFFEE, 0xFFFD, 0xFFFE, 0xFFFF, 0x1FFFE, 0x10FFFF, 0x110000 };
+            const auto testLength = sizeof(testCharacters) / sizeof(std::uint32_t);
+            const bool expectedValid[] = {
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                1, 0, 0, 0, 1, 1,
+                1, 0, 0, 1, 1, 0, 0, 0, 0, 0 };
+            static_assert(testLength == sizeof(expectedValid) / sizeof(expectedValid[0]), "size of test data must match");
+
+            for (std::size_t i = 0; i != testLength; ++i) {
+                const auto c = testCharacters[i];
+                CPPUNIT_ASSERT_EQUAL_MESSAGE(boost::str(boost::format("While testing at idx=%d: 0x%02x") % i % c), expectedValid[i], String::isValidXMPPCharacter(c));
+            }
+        }
+
+        void testSanitizeXMPPString() {
+            std::vector<std::pair<std::string, std::string>> testData = {
+                { "\0", "" },
+                { std::string("\0\t", 3), "\t" },
+                { "", "" },
+                { std::string("\0", 1) , std::string() },
+                { std::string("\0blah\0", 6) , std::string("blah", 4) },
+                { "z\xC3\x9F\xE6\xB0\xB4\xF0\x9D\x84\x8B" , "z\xC3\x9F\xE6\xB0\xB4\xF0\x9D\x84\x8B" }, // or in u8 notation: u8"z\u00df\u6c34\U0001d10b"
+                { "\x7FT\t\x0c\xff\xfeT", "T\tT" },
+                { "\x01Q\x0BW\x81T", "QWT" },
+                { "A\xF0\x9F\xBF\xBE" "B", "AB" }, // U+1FFFE is a noncharacter
+                { "\xF0\x9F\x98\x83" "ABC" "\xE2\xBE\xA6", "\xF0\x9F\x98\x83" "ABC" "\xE2\xBE\xA6" }
+            };
+
+            for (std::size_t i = 0; i != testData.size(); ++i) {
+                const auto & t = testData[i];
+                const auto actual = String::sanitizeXMPPString(t.first);
+                CPPUNIT_ASSERT_EQUAL_MESSAGE(boost::str(boost::format("While testing string idx=%d") % i), t.second, actual);
+            }
+        }
+
         void testSplit() {
             std::vector<std::string> result = String::split("abc def ghi", ' ');
 
diff --git a/Swiften/Serializer/StanzaSerializer.cpp b/Swiften/Serializer/StanzaSerializer.cpp
index f2f5529..3de706d 100644
--- a/Swiften/Serializer/StanzaSerializer.cpp
+++ b/Swiften/Serializer/StanzaSerializer.cpp
@@ -9,6 +9,7 @@
 #include <sstream>
 #include <typeinfo>
 
+#include <Swiften/Base/String.h>
 #include <Swiften/Base/Log.h>
 #include <Swiften/Elements/Stanza.h>
 #include <Swiften/Serializer/PayloadSerializer.h>
@@ -55,6 +56,7 @@ SafeByteArray StanzaSerializer::serialize(std::shared_ptr<ToplevelElement> eleme
             SWIFT_LOG(warning) << "Could not find serializer for " << typeid(*(payload.get())).name() << std::endl;
         }
     }
+    serializedPayloads = String::sanitizeXMPPString(serializedPayloads);
     if (!serializedPayloads.empty()) {
         stanzaElement.addNode(std::make_shared<XMLRawTextNode>(serializedPayloads));
     }
-- 
cgit v0.10.2-6-g49f6