diff options
Diffstat (limited to 'Swiften')
-rw-r--r-- | Swiften/Base/String.cpp | 98 | ||||
-rw-r--r-- | Swiften/Base/String.h | 3 | ||||
-rw-r--r-- | Swiften/Base/UnitTest/StringTest.cpp | 44 | ||||
-rw-r--r-- | Swiften/Serializer/StanzaSerializer.cpp | 2 |
4 files changed, 145 insertions, 2 deletions
diff --git a/Swiften/Base/String.cpp b/Swiften/Base/String.cpp
index 4a5e4c9..bbc3003 100644
--- a/Swiften/Base/String.cpp
+++ b/Swiften/Base/String.cpp
@@ -1,49 +1,114 @@
 /*
  * Copyright (c) 2010-2016 Isode Limited.
  * All rights reserved.
  * See the COPYING file for more information.
  */
 
 #include <Swiften/Base/Platform.h>
 
 #include <cassert>
 #include <algorithm>
 #include <sstream>
 #include <iomanip>
 #ifdef SWIFTEN_PLATFORM_WINDOWS
 #include <windows.h>
 #endif
 
 #include <Swiften/Base/String.h>
 #include <Swiften/Base/ByteArray.h>
 
+namespace {
+// States of the UTF-8 decoding automaton used by decode() below.
+const static std::uint32_t UTF8_ACCEPT = 0;
+const static std::uint32_t UTF8_REJECT = 1;
+
+// Table of Bjoern Hoehrmann's UTF-8 DFA: the first 256 entries map a byte to
+// its character class; the entries after that map (state * 16 + class) to the
+// next state.
+const static std::uint8_t UTF8D[] = {
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
+    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
+    8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
+    0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
+    0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
+    0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
+    1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
+    1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
+    1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
+};
+
+// Feeds one byte into the DFA, updating state and codepoint in place, and
+// returns the new state (UTF8_ACCEPT once a full code point is decoded).
+// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
+std::uint32_t decode(std::uint32_t & state, std::uint32_t & codepoint, std::uint8_t byte) {
+    const auto type = UTF8D[byte];
+    codepoint = (state != UTF8_ACCEPT) ? (byte & 0x3fu) | (codepoint << 6) : (0xff >> type) & (byte);
+    state = UTF8D[256 + state * 16 + type];
+    return state;
+}
+
+// Decodes the first code point of [begin, end). ok is set to false when the
+// sequence is malformed or the input ends in the middle of a sequence;
+// otherwise consumed is the number of bytes read for the returned code point.
+std::uint32_t getNextCodepoint(const char * begin, const char * end, std::size_t & consumed, bool & ok) {
+    consumed = 0;
+    ok = true;
+
+    std::uint32_t state = UTF8_ACCEPT;
+    std::uint32_t codepoint = 0;
+
+    for (auto it = begin; it != end; ++it) {
+        ++consumed;
+        const auto newState = decode(state, codepoint, static_cast<std::uint8_t>(*it));
+        if (newState == UTF8_ACCEPT) {
+            return codepoint;
+        }
+        if (newState == UTF8_REJECT) {
+            // The reject state is absorbing: stop here instead of scanning the
+            // rest of the input, which made malformed input quadratic.
+            break;
+        }
+    }
+    if (state != UTF8_ACCEPT) {
+        ok = false;
+    }
+    return codepoint;
+}
+
+}
+
 namespace Swift {
 
 static inline size_t sequenceLength(char firstByte) {
     if ((firstByte & 0x80) == 0) {
         return 1;
     }
     if ((firstByte & 0xE0) == 0xC0) {
         return 2;
     }
     if ((firstByte & 0xF0) == 0xE0) {
         return 3;
     }
     if ((firstByte & 0xF8) == 0xF0) {
         return 4;
     }
     if ((firstByte & 0xFC) == 0xF8) {
         return 5;
     }
     if ((firstByte & 0xFE) == 0xFC) {
         return 6;
     }
     assert(false);
     return 1;
 }
 
 std::vector<unsigned int> String::getUnicodeCodePoints(const std::string& s) {
     std::vector<unsigned int> result;
     for (size_t i = 0; i < s.size();) {
         unsigned int codePoint = 0;
         char firstChar = s[i];
@@ -59,94 +124,151 @@ std::vector<unsigned int> String::getUnicodeCodePoints(const std::string& s) {
         for (size_t j = 1; j < length; ++j) {
             codePoint = (codePoint<<6) | (s[i+j] & 0x3F);
         }
         result.push_back(codePoint);
         i += length;
     }
     return result;
 }
 
 std::pair<std::string,std::string> String::getSplittedAtFirst(const std::string& s, char c) {
     assert((c & 0x80) == 0);
     size_t firstMatch = s.find(c);
     if (firstMatch != s.npos) {
         return std::make_pair(s.substr(0,firstMatch),s.substr(firstMatch+1,s.npos));
     }
     else {
         return std::make_pair(s, "");
     }
 }
 
 void String::replaceAll(std::string& src, char c, const std::string& s) {
     size_t lastPos = 0;
     size_t matchingIndex = 0;
     while ((matchingIndex = src.find(c, lastPos)) != src.npos) {
         src.replace(matchingIndex, 1, s);
         lastPos = matchingIndex + s.size();
     }
 }
 
+// Returns whether codepoint may appear in XMPP stream content: true for tab,
+// CR, LF and the remaining XML character ranges, false for other control
+// characters, surrogates, values above U+10FFFF and the code points XML 1.0
+// lists as discouraged.
+bool String::isValidXMPPCharacter(std::uint32_t codepoint) {
+    // Special accepted characters:
+    if (codepoint == '\t' || codepoint == '\r' || codepoint == '\n')
+        return true;
+    // Discouraged characters:
+    if (codepoint >= 0x7Fu && codepoint <= 0x84u)
+        return false;
+    if (codepoint >= 0x86u && codepoint <= 0x9Fu)
+        return false;
+    if (codepoint >= 0xFDD0u && codepoint <= 0xFDEFu)
+        return false;
+    // U+FFFE, U+FFFF and the last two code points of every supplementary plane.
+    if ((codepoint & 0xFFFEu) == 0xFFFEu)
+        return false;
+    // Other valid characters (after filtering for discouraged ones above)
+    if (codepoint >= 0x20u && codepoint <= 0xD7FFu)
+        return true;
+    if (codepoint >= 0xE000u && codepoint <= 0xFFFDu)
+        return true;
+    if (codepoint >= 0x10000u && codepoint <= 0x10FFFFu)
+        return true;
+    return false;
+}
+
+// Returns a copy of input without the bytes that do not form well-formed
+// UTF-8 and without the code points rejected by isValidXMPPCharacter().
+std::string String::sanitizeXMPPString(const std::string& input) {
+    std::string result;
+    result.reserve(input.length());
+
+    auto it = input.data();
+    const auto end = it + input.length();
+
+    std::size_t consumed = 0;
+    bool ok = true;
+
+    while (it < end) {
+        const auto codepoint = getNextCodepoint(it, end, consumed, ok);
+        if (ok) {
+            if (isValidXMPPCharacter(codepoint)) {
+                result.append(it, consumed);
+            }
+            it += consumed;
+        }
+        else {
+            // Malformed sequence: drop one byte and try again from the next one.
+            ++it;
+        }
+    }
+    result.shrink_to_fit();
+    return result;
+}
+
 std::vector<std::string> String::split(const std::string& s, char c) {
     assert((c & 0x80) == 0);
     std::vector<std::string> result;
     std::string accumulator;
     for (char i : s) {
         if (i == c) {
             result.push_back(accumulator);
             accumulator = "";
         }
         else {
             accumulator += i;
         }
     }
     result.push_back(accumulator);
     return result;
 }
 
 std::string String::convertIntToHexString(int h) {
     std::stringstream ss;
     ss << std::setbase(16);
     ss << h;
     return ss.str();
 }
 
 int String::convertHexStringToInt(const std::string& s) {
     std::stringstream ss;
     int h;
     ss << std::setbase(16);
     ss << s;
     ss >> h;
     return h;
 }
 
-
 #ifdef SWIFTEN_PLATFORM_WINDOWS
 std::string convertWStringToString(const std::wstring& s) {
     int utf8Size = WideCharToMultiByte(CP_UTF8, 0, s.c_str(), -1, NULL, 0, NULL, NULL);
     if (utf8Size < 0) {
         throw std::runtime_error("Conversion error");
     }
     std::vector<char> utf8Data(utf8Size);
     int result = WideCharToMultiByte(
             CP_UTF8, 0, s.c_str(), -1,
             vecptr(utf8Data), utf8Data.size(), NULL, NULL);
     if (result < 0) {
         throw std::runtime_error("Conversion error");
     }
     return std::string(vecptr(utf8Data), utf8Size-1 /* trailing 0 character */);
 }
 
 std::wstring convertStringToWString(const std::string& s) {
     int utf16Size = MultiByteToWideChar(CP_UTF8, 0, s.c_str(), -1, NULL, 0);
     if (utf16Size < 0) {
         throw std::runtime_error("Conversion error");
     }
     std::vector<wchar_t> utf16Data(utf16Size);
     int result = MultiByteToWideChar(
             CP_UTF8, 0, s.c_str(), -1,
             vecptr(utf16Data), utf16Data.size());
     if (result < 0) {
         throw std::runtime_error("Conversion error");
     }
     return std::wstring(vecptr(utf16Data), utf16Size-1 /* trailing 0 character */);
 }
 #endif
diff --git a/Swiften/Base/String.h b/Swiften/Base/String.h
index f19dca5..3a7ca65 100644
--- a/Swiften/Base/String.h
+++ b/Swiften/Base/String.h
@@ -1,53 +1,56 @@
 /*
  * Copyright (c) 2010-2016 Isode Limited.
  * All rights reserved.
  * See the COPYING file for more information.
  */
 
 #pragma once
 
+#include <cstdint>
 #include <sstream>
 #include <string>
 #include <vector>
 
 #include <Swiften/Base/API.h>
 #include <Swiften/Base/Platform.h>
 
 #define SWIFTEN_STRING_TO_CFSTRING(a) \
     CFStringCreateWithBytes(NULL, reinterpret_cast<const UInt8*>(a.c_str()), a.size(), kCFStringEncodingUTF8, false)
 
 namespace Swift {
     namespace String {
         SWIFTEN_API std::vector<unsigned int> getUnicodeCodePoints(const std::string&);
         SWIFTEN_API std::pair<std::string, std::string> getSplittedAtFirst(const std::string&, char c);
         SWIFTEN_API std::vector<std::string> split(const std::string&, char c);
         SWIFTEN_API void replaceAll(std::string&, char c, const std::string& s);
+        SWIFTEN_API bool isValidXMPPCharacter(std::uint32_t codepoint); ///< True for tab, CR, LF and the other code points accepted as XMPP characters; false for remaining control characters and discouraged code points.
+        SWIFTEN_API std::string sanitizeXMPPString(const std::string& input); ///< Copy of input without malformed UTF-8 bytes and without code points rejected by isValidXMPPCharacter().
 
         inline bool beginsWith(const std::string& s, char c) {
             return s.size() > 0 && s[0] == c;
         }
 
         inline bool endsWith(const std::string& s, char c) {
             return s.size() > 0 && s[s.size()-1] == c;
         }
 
         std::string convertIntToHexString(int h);
         int convertHexStringToInt(const std::string& s);
 
     }
 
 #ifdef SWIFTEN_PLATFORM_WINDOWS
     SWIFTEN_API std::string convertWStringToString(const std::wstring& s);
     SWIFTEN_API std::wstring convertStringToWString(const std::string& s);
 #endif
 
     class SWIFTEN_API makeString {
         public:
              template <typename T> makeString& operator<<(T const& v) {
                 stream << v;
                 return *this;
             }
 
             operator std::string() const {
                 return stream.str();
             }
diff --git a/Swiften/Base/UnitTest/StringTest.cpp b/Swiften/Base/UnitTest/StringTest.cpp
index e2e1665..889c9c7 100644
--- a/Swiften/Base/UnitTest/StringTest.cpp
+++ b/Swiften/Base/UnitTest/StringTest.cpp
@@ -1,57 +1,61 @@
 /*
  * Copyright (c) 2010-2016 Isode Limited.
  * All rights reserved.
  * See the COPYING file for more information.
*/ #include <string> #include <cppunit/extensions/HelperMacros.h> #include <cppunit/extensions/TestFactoryRegistry.h> #include <Swiften/Base/Platform.h> #include <Swiften/Base/String.h> +#include <boost/format.hpp> + using namespace Swift; -class StringTest : public CppUnit::TestFixture { + class StringTest : public CppUnit::TestFixture { CPPUNIT_TEST_SUITE(StringTest); CPPUNIT_TEST(testGetUnicodeCodePoints); CPPUNIT_TEST(testGetSplittedAtFirst); CPPUNIT_TEST(testGetSplittedAtFirst_CharacterAtBegin); CPPUNIT_TEST(testGetSplittedAtFirst_CharacterAtEnd); CPPUNIT_TEST(testGetSplittedAtFirst_NoSuchCharacter); CPPUNIT_TEST(testReplaceAll); CPPUNIT_TEST(testReplaceAll_LastChar); CPPUNIT_TEST(testReplaceAll_ConsecutiveChars); CPPUNIT_TEST(testReplaceAll_MatchingReplace); + CPPUNIT_TEST(testIsValidXMPPCharacter); + CPPUNIT_TEST(testSanitizeXMPPString); CPPUNIT_TEST(testSplit); #ifdef SWIFTEN_PLATFORM_WINDOWS CPPUNIT_TEST(testConvertWStringToString); CPPUNIT_TEST(testConvertStringToWString); #endif CPPUNIT_TEST_SUITE_END(); public: void testGetUnicodeCodePoints() { std::string testling("$\xc2\xa2\xe2\x82\xac\xf4\x8a\xaf\x8d"); std::vector<unsigned int> points = String::getUnicodeCodePoints(testling); CPPUNIT_ASSERT_EQUAL(0x24U, points[0]); CPPUNIT_ASSERT_EQUAL(0xA2U, points[1]); CPPUNIT_ASSERT_EQUAL(0x20ACU, points[2]); CPPUNIT_ASSERT_EQUAL(0x10ABCDU, points[3]); } void testGetSplittedAtFirst() { std::string testling("ab@cd@ef"); std::pair<std::string,std::string> result = String::getSplittedAtFirst(testling, '@'); CPPUNIT_ASSERT_EQUAL(std::string("ab"), result.first); CPPUNIT_ASSERT_EQUAL(std::string("cd@ef"), result.second); } void testGetSplittedAtFirst_CharacterAtBegin() { std::string testling(" ab"); std::pair<std::string,std::string> result = String::getSplittedAtFirst(testling, ' '); @@ -80,51 +84,89 @@ class StringTest : public CppUnit::TestFixture { String::replaceAll(testling, 'b', "xyz"); CPPUNIT_ASSERT_EQUAL(std::string("axyzcxyzd"), testling); } void 
testReplaceAll_LastChar() { std::string testling("abc"); String::replaceAll(testling, 'c', "xyz"); CPPUNIT_ASSERT_EQUAL(std::string("abxyz"), testling); } void testReplaceAll_ConsecutiveChars() { std::string testling("abbc"); String::replaceAll(testling, 'b',"xyz"); CPPUNIT_ASSERT_EQUAL(std::string("axyzxyzc"), testling); } void testReplaceAll_MatchingReplace() { std::string testling("abc"); String::replaceAll(testling, 'b',"bbb"); CPPUNIT_ASSERT_EQUAL(std::string("abbbc"), testling); } + void testIsValidXMPPCharacter() { + const std::uint32_t testCharacters[] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, + 0x20, 0x7F, 0x80, 0x84, 0x85, 0xFF }; + const auto testLength = sizeof(testCharacters) / sizeof(std::uint32_t); + const bool expectedValid[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 1, 1 }; + static_assert(testLength == sizeof(expectedValid), "size of test data must match"); + + for (std::size_t i = 0; i != testLength; ++i) { + const auto c = testCharacters[i]; + CPPUNIT_ASSERT_EQUAL_MESSAGE(boost::str(boost::format("While testing at idx=%d: 0x%02x") % i % c), expectedValid[i], String::isValidXMPPCharacter(c)); + } + } + + void testSanitizeXMPPString() { + std::vector<std::pair<std::string, std::string>> testData = { + { "\0", "" }, + { std::string("\0\t", 3), "\t" }, + { "", "" }, + { std::string("\0", 1) , std::string() }, + { std::string("\0blah\0", 6) , std::string("blah", 4) }, + { "z\xC3\x9F\xE6\xB0\xB4\xF0\x9D\x84\x8B" , "z\xC3\x9F\xE6\xB0\xB4\xF0\x9D\x84\x8B" }, // or in u8 notation: u8"z\u00df\u6c34\U0001d10b" + { "\x7FT\t\x0c\xff\xfeT", "T\tT" }, + { "\x01Q\x0BW\x81T", "QWT" }, + { "\xF0\x9F\x98\x83" "ABC" "\xE2\xBE\xA6", "\xF0\x9F\x98\x83" "ABC" "\xE2\xBE\xA6" } + }; + + for (std::size_t i = 0; i != testData.size(); ++i) { + 
const auto & t = testData[i]; + const auto actual = String::sanitizeXMPPString(t.first); + CPPUNIT_ASSERT_EQUAL_MESSAGE(boost::str(boost::format("While testing string idx=%d") % i), std::string(actual), t.second); + } + } + void testSplit() { std::vector<std::string> result = String::split("abc def ghi", ' '); CPPUNIT_ASSERT_EQUAL(3, static_cast<int>(result.size())); CPPUNIT_ASSERT_EQUAL(std::string("abc"), result[0]); CPPUNIT_ASSERT_EQUAL(std::string("def"), result[1]); CPPUNIT_ASSERT_EQUAL(std::string("ghi"), result[2]); } #ifdef SWIFTEN_PLATFORM_WINDOWS void testConvertWStringToString() { CPPUNIT_ASSERT_EQUAL(std::string("tron\xc3\xa7on"), convertWStringToString(std::wstring(L"tron\xe7on"))); } void testConvertStringToWString() { CPPUNIT_ASSERT(std::wstring(L"tron\xe7on") == convertStringToWString(std::string("tron\xc3\xa7on"))); } #endif }; CPPUNIT_TEST_SUITE_REGISTRATION(StringTest); diff --git a/Swiften/Serializer/StanzaSerializer.cpp b/Swiften/Serializer/StanzaSerializer.cpp index f2f5529..3de706d 100644 --- a/Swiften/Serializer/StanzaSerializer.cpp +++ b/Swiften/Serializer/StanzaSerializer.cpp @@ -1,65 +1,67 @@ /* * Copyright (c) 2010-2016 Isode Limited. * All rights reserved. * See the COPYING file for more information. 
  */
 
 #include <Swiften/Serializer/StanzaSerializer.h>
 
 #include <sstream>
 #include <typeinfo>
 
+#include <Swiften/Base/String.h>
 #include <Swiften/Base/Log.h>
 #include <Swiften/Elements/Stanza.h>
 #include <Swiften/Serializer/PayloadSerializer.h>
 #include <Swiften/Serializer/PayloadSerializerCollection.h>
 #include <Swiften/Serializer/XML/XMLElement.h>
 #include <Swiften/Serializer/XML/XMLRawTextNode.h>
 
 namespace Swift {
 
 StanzaSerializer::StanzaSerializer(const std::string& tag, PayloadSerializerCollection* payloadSerializers, const boost::optional<std::string>& explicitNS) : tag_(tag), payloadSerializers_(payloadSerializers), explicitDefaultNS_(explicitNS) {
 }
 
 SafeByteArray StanzaSerializer::serialize(std::shared_ptr<ToplevelElement> element) const {
     if (explicitDefaultNS_) {
         return serialize(element, explicitDefaultNS_.get());
     }
     else {
         return serialize(element, "");
     }
 }
 
 SafeByteArray StanzaSerializer::serialize(std::shared_ptr<ToplevelElement> element, const std::string& xmlns) const {
     std::shared_ptr<Stanza> stanza(std::dynamic_pointer_cast<Stanza>(element));
 
     XMLElement stanzaElement(tag_, explicitDefaultNS_ ? explicitDefaultNS_.get() : xmlns);
     if (stanza->getFrom().isValid()) {
         stanzaElement.setAttribute("from", stanza->getFrom());
     }
     if (stanza->getTo().isValid()) {
         stanzaElement.setAttribute("to", stanza->getTo());
     }
     if (!stanza->getID().empty()) {
         stanzaElement.setAttribute("id", stanza->getID());
     }
     setStanzaSpecificAttributes(stanza, stanzaElement);
 
     std::string serializedPayloads;
     for (const auto& payload : stanza->getPayloads()) {
         PayloadSerializer* serializer = payloadSerializers_->getPayloadSerializer(payload);
         if (serializer) {
             serializedPayloads += serializer->serialize(payload);
         }
         else {
             SWIFT_LOG(warning) << "Could not find serializer for " << typeid(*(payload.get())).name() << std::endl;
         }
     }
+    serializedPayloads = String::sanitizeXMPPString(serializedPayloads); // Drop malformed UTF-8 and code points rejected by String::isValidXMPPCharacter() before the payloads are added as a raw text node.
     if (!serializedPayloads.empty()) {
         stanzaElement.addNode(std::make_shared<XMLRawTextNode>(serializedPayloads));
     }
 
     return createSafeByteArray(stanzaElement.serialize());
 }
 
 }
|