3 files changed, 143 insertions, 2 deletions
diff --git a/Swiften/Base/String.cpp b/Swiften/Base/String.cpp
index 4a5e4c9..bbc3003 100644
--- a/Swiften/Base/String.cpp
+++ b/Swiften/Base/String.cpp
@@ -1,49 +1,97 @@
 /*
  * Copyright (c) 2010-2016 Isode Limited.
  * All rights reserved.
  * See the COPYING file for more information.
  */
 
 #include <Swiften/Base/Platform.h>
 
 #include <cassert>
 #include <algorithm>
 #include <sstream>
 #include <iomanip>
 #ifdef SWIFTEN_PLATFORM_WINDOWS
 #include <windows.h>
 #endif
 
 #include <Swiften/Base/String.h>
 #include <Swiften/Base/ByteArray.h>
 
+namespace {
+const static std::uint32_t UTF8_ACCEPT = 0; // DFA start state; decode() returns to it after each complete, well-formed sequence
+// Lookup table of the UTF-8 decoding DFA that decode() below walks.
+const static std::uint8_t UTF8D[] = { // [0..255]: byte -> character class; [256 + state * 16 + class]: next state
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
+    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
+    8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
+    0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
+    0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
+    0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
+    1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
+    1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
+    1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
+};
+
+// Advances the UTF-8 DFA (http://bjoern.hoehrmann.de/utf-8/decoder/dfa/) by one byte, accumulating into codepoint; returns the new state.
+std::uint32_t decode(std::uint32_t & state, std::uint32_t & codepoint, std::uint8_t byte) {
+    const std::uint8_t charClass = UTF8D[byte];
+    codepoint = (state == UTF8_ACCEPT) ? ((0xffu >> charClass) & byte) : ((codepoint << 6) | (byte & 0x3fu)); // first byte: class-derived mask; later bytes: append low 6 bits
+    state = UTF8D[256 + state * 16 + charClass]; // transition rows follow the 256 class entries
+    return state;
+}
+
+// Decodes the single UTF-8 sequence that starts at begin, never reading at or beyond end.
+// Success: ok = true, consumed = number of bytes in the sequence, return value = its code point.
+// Malformed or truncated sequence: ok = false; consumed and the return value must then be ignored.
+std::uint32_t getNextCodepoint(const char * begin, const char * end, std::size_t & consumed, bool & ok) {
+    const std::uint32_t UTF8_REJECT = 1; // sink state: row s1 of UTF8D maps every class back to 1
+    std::uint32_t state = UTF8_ACCEPT;
+    std::uint32_t codepoint = 0;
+    consumed = 0;
+    ok = true;
+    for (auto it = begin; it != end && state != UTF8_REJECT; ++it) { // stop at reject instead of scanning the rest of the input
+        ++consumed;
+        if (decode(state, codepoint, static_cast<std::uint8_t>(*it)) == UTF8_ACCEPT)
+            return codepoint;
+    }
+    ok = (state == UTF8_ACCEPT); // false after a rejected byte or when the input ends mid-sequence
+    return codepoint;
+}
+
+}
+
 namespace Swift {
 
 static inline size_t sequenceLength(char firstByte) {
     if ((firstByte & 0x80) == 0) {
         return 1;
     }
     if ((firstByte & 0xE0) == 0xC0) {
         return 2;
     }
     if ((firstByte & 0xF0) == 0xE0) {
         return 3;
     }
     if ((firstByte & 0xF8) == 0xF0) {
         return 4;
     }
     if ((firstByte & 0xFC) == 0xF8) {
         return 5;
     }
     if ((firstByte & 0xFE) == 0xFC) {
         return 6;
     }
     assert(false);
     return 1;
 }
 
 std::vector<unsigned int> String::getUnicodeCodePoints(const std::string& s) {
     std::vector<unsigned int> result;
     for (size_t i = 0; i < s.size();) {
         unsigned int codePoint = 0;
         char firstChar = s[i];
@@ -59,94 +107,142 @@ std::vector<unsigned int> String::getUnicodeCodePoints(const std::string& s) {
         for (size_t j = 1; j < length; ++j) {
             codePoint = (codePoint<<6) | (s[i+j] & 0x3F);
         }
         result.push_back(codePoint);
         i += length;
     }
     return result;
 }
 
 
 std::pair<std::string,std::string> String::getSplittedAtFirst(const std::string& s, char c) {
     assert((c & 0x80) == 0);
     size_t firstMatch = s.find(c);
     if (firstMatch != s.npos) {
         return std::make_pair(s.substr(0,firstMatch),s.substr(firstMatch+1,s.npos));
     }
     else {
         return std::make_pair(s, "");
     }
 }
 
 void String::replaceAll(std::string& src, char c, const std::string& s) {
     size_t lastPos = 0;
     size_t matchingIndex = 0;
     while ((matchingIndex = src.find(c, lastPos)) != src.npos) {
         src.replace(matchingIndex, 1, s);
         lastPos = matchingIndex + s.size();
     }
 }
 
+/**
+ * Returns whether codepoint may be used in XMPP (XML 1.0) character data.
+ * Accepts the XML 1.0 "Char" production minus the ranges that specification
+ * lists as discouraged (DEL, C1 controls and Unicode noncharacters).
+ */
+bool String::isValidXMPPCharacter(std::uint32_t codepoint) {
+    // Whitespace controls that are explicitly allowed.
+    if (codepoint == '\t' || codepoint == '\r' || codepoint == '\n')
+        return true;
+    // Discouraged: DEL and the C1 controls, with the exception of U+0085.
+    if ((codepoint >= 0x7Fu && codepoint <= 0x84u) || (codepoint >= 0x86u && codepoint <= 0x9Fu))
+        return false;
+    // Discouraged: the noncharacter block U+FDD0..U+FDEF.
+    if (codepoint >= 0xFDD0u && codepoint <= 0xFDEFu)
+        return false;
+    // Discouraged: the last two code points of every plane (U+nFFFE and U+nFFFF); masking bit 0 covers both.
+    if ((codepoint & 0xFFFEu) == 0xFFFEu)
+        return false;
+    // What remains of "Char"; this excludes the other C0 controls, surrogates and values above U+10FFFF.
+    return (codepoint >= 0x20u && codepoint <= 0xD7FFu) || (codepoint >= 0xE000u && codepoint <= 0xFFFDu)
+        || (codepoint >= 0x10000u && codepoint <= 0x10FFFFu);
+}
+
+/**
+ * Returns a copy of input reduced to the well-formed UTF-8 sequences whose
+ * code point passes isValidXMPPCharacter(). A byte that does not begin a
+ * well-formed sequence is dropped and decoding restarts at the next byte.
+ */
+std::string String::sanitizeXMPPString(const std::string& input) {
+    std::string result;
+    result.reserve(input.length());
+
+    const char* it = input.data();
+    const char* const end = it + input.length();
+    while (it < end) {
+        std::size_t consumed = 0;
+        bool ok = false;
+        const std::uint32_t codepoint = getNextCodepoint(it, end, consumed, ok);
+        if (ok && isValidXMPPCharacter(codepoint)) {
+            result.append(it, consumed);
+        }
+        // A decoded character is skipped as a whole; after a decoding failure only one byte is skipped.
+        it += ok ? consumed : 1;
+    }
+
+    result.shrink_to_fit();
+    return result;
+}
+
 std::vector<std::string> String::split(const std::string& s, char c) {
     assert((c & 0x80) == 0);
     std::vector<std::string> result;
     std::string accumulator;
     for (char i : s) {
         if (i == c) {
             result.push_back(accumulator);
             accumulator = "";
         }
         else {
             accumulator += i;
         }
     }
     result.push_back(accumulator);
     return result;
 }
 
 std::string String::convertIntToHexString(int h) {
     std::stringstream ss;
     ss << std::setbase(16);
     ss << h;
     return ss.str();
 }
 
 int String::convertHexStringToInt(const std::string& s) {
     std::stringstream ss;
     int h;
     ss << std::setbase(16);
     ss << s;
     ss >> h;
     return h;
 }
 
-
 #ifdef SWIFTEN_PLATFORM_WINDOWS
 std::string convertWStringToString(const std::wstring& s) {
     int utf8Size = WideCharToMultiByte(CP_UTF8, 0, s.c_str(), -1, NULL, 0, NULL, NULL);
     if (utf8Size < 0) {
         throw std::runtime_error("Conversion error");
     }
     std::vector<char> utf8Data(utf8Size);
     int result = WideCharToMultiByte(
             CP_UTF8, 0, s.c_str(), -1, vecptr(utf8Data), utf8Data.size(), NULL, NULL);
     if (result < 0) {
         throw std::runtime_error("Conversion error");
     }
     return std::string(vecptr(utf8Data), utf8Size-1 /* trailing 0 character */);
 }
 
 std::wstring convertStringToWString(const std::string& s) {
     int utf16Size = MultiByteToWideChar(CP_UTF8, 0, s.c_str(), -1, NULL, 0);
     if (utf16Size < 0) {
         throw std::runtime_error("Conversion error");
     }
     std::vector<wchar_t> utf16Data(utf16Size);
     int result = MultiByteToWideChar(
             CP_UTF8, 0, s.c_str(), -1, vecptr(utf16Data), utf16Data.size());
     if (result < 0) {
         throw std::runtime_error("Conversion error");
     }
     return std::wstring(vecptr(utf16Data), utf16Size-1 /* trailing 0 character */);
 }
 #endif
 
diff --git a/Swiften/Base/String.h b/Swiften/Base/String.h
index f19dca5..3a7ca65 100644
--- a/Swiften/Base/String.h
+++ b/Swiften/Base/String.h
@@ -1,53 +1,56 @@
 /*
  * Copyright (c) 2010-2016 Isode Limited.
  * All rights reserved.
  * See the COPYING file for more information.
  */
 
 #pragma once
 
+#include <cstdint>
 #include <sstream>
 #include <string>
 #include <vector>
 
 #include <Swiften/Base/API.h>
 #include <Swiften/Base/Platform.h>
 #define SWIFTEN_STRING_TO_CFSTRING(a) \
     CFStringCreateWithBytes(NULL, reinterpret_cast<const UInt8*>(a.c_str()), a.size(), kCFStringEncodingUTF8, false)
 
 namespace Swift {
     namespace String {
             SWIFTEN_API std::vector<unsigned int> getUnicodeCodePoints(const std::string&);
             SWIFTEN_API std::pair<std::string, std::string> getSplittedAtFirst(const std::string&, char c);
             SWIFTEN_API std::vector<std::string> split(const std::string&, char c);
             SWIFTEN_API void replaceAll(std::string&, char c, const std::string& s);
+            SWIFTEN_API bool isValidXMPPCharacter(std::uint32_t codepoint); // whether the Unicode code point passes the character checks implemented in String.cpp
+            SWIFTEN_API std::string sanitizeXMPPString(const std::string& input); // copy of input without malformed UTF-8 bytes and without code points that fail isValidXMPPCharacter()
 
             inline bool beginsWith(const std::string& s, char c) {
                 return s.size() > 0 && s[0] == c;
             }
 
             inline bool endsWith(const std::string& s, char c) {
                 return s.size() > 0 && s[s.size()-1] == c;
             }
 
             std::string convertIntToHexString(int h);
             int convertHexStringToInt(const std::string& s);
 
     }
 
 #ifdef SWIFTEN_PLATFORM_WINDOWS
     SWIFTEN_API std::string convertWStringToString(const std::wstring& s);
     SWIFTEN_API std::wstring convertStringToWString(const std::string& s);
 #endif
 
     class SWIFTEN_API makeString {
         public:
              template <typename T> makeString& operator<<(T const& v) {
                     stream << v;
                     return *this;
              }
 
              operator std::string() const {
                     return stream.str();
              }
 
diff --git a/Swiften/Base/UnitTest/StringTest.cpp b/Swiften/Base/UnitTest/StringTest.cpp
index e2e1665..889c9c7 100644
--- a/Swiften/Base/UnitTest/StringTest.cpp
+++ b/Swiften/Base/UnitTest/StringTest.cpp
@@ -1,57 +1,61 @@
 /*
  * Copyright (c) 2010-2016 Isode Limited.
  * All rights reserved.
  * See the COPYING file for more information.
  */
 
 #include <string>
 
 #include <cppunit/extensions/HelperMacros.h>
 #include <cppunit/extensions/TestFactoryRegistry.h>
 
 #include <Swiften/Base/Platform.h>
 #include <Swiften/Base/String.h>
 
+#include <boost/format.hpp>
+
 using namespace Swift;
 
-class StringTest : public CppUnit::TestFixture {
+ class StringTest : public CppUnit::TestFixture {
         CPPUNIT_TEST_SUITE(StringTest);
         CPPUNIT_TEST(testGetUnicodeCodePoints);
         CPPUNIT_TEST(testGetSplittedAtFirst);
         CPPUNIT_TEST(testGetSplittedAtFirst_CharacterAtBegin);
         CPPUNIT_TEST(testGetSplittedAtFirst_CharacterAtEnd);
         CPPUNIT_TEST(testGetSplittedAtFirst_NoSuchCharacter);
         CPPUNIT_TEST(testReplaceAll);
         CPPUNIT_TEST(testReplaceAll_LastChar);
         CPPUNIT_TEST(testReplaceAll_ConsecutiveChars);
         CPPUNIT_TEST(testReplaceAll_MatchingReplace);
+        CPPUNIT_TEST(testIsValidXMPPCharacter);
+        CPPUNIT_TEST(testSanitizeXMPPString);
         CPPUNIT_TEST(testSplit);
 #ifdef SWIFTEN_PLATFORM_WINDOWS
         CPPUNIT_TEST(testConvertWStringToString);
         CPPUNIT_TEST(testConvertStringToWString);
 #endif
         CPPUNIT_TEST_SUITE_END();
 
     public:
         void testGetUnicodeCodePoints() {
             std::string testling("$\xc2\xa2\xe2\x82\xac\xf4\x8a\xaf\x8d");
             std::vector<unsigned int> points = String::getUnicodeCodePoints(testling);
 
             CPPUNIT_ASSERT_EQUAL(0x24U, points[0]);
             CPPUNIT_ASSERT_EQUAL(0xA2U, points[1]);
             CPPUNIT_ASSERT_EQUAL(0x20ACU, points[2]);
             CPPUNIT_ASSERT_EQUAL(0x10ABCDU, points[3]);
         }
 
         void testGetSplittedAtFirst() {
             std::string testling("ab@cd@ef");
 
             std::pair<std::string,std::string> result = String::getSplittedAtFirst(testling, '@');
             CPPUNIT_ASSERT_EQUAL(std::string("ab"), result.first);
             CPPUNIT_ASSERT_EQUAL(std::string("cd@ef"), result.second);
         }
 
         void testGetSplittedAtFirst_CharacterAtBegin() {
             std::string testling(" ab");
 
             std::pair<std::string,std::string> result = String::getSplittedAtFirst(testling, ' ');
@@ -80,51 +84,89 @@ class StringTest : public CppUnit::TestFixture {
 
             String::replaceAll(testling, 'b', "xyz");
 
             CPPUNIT_ASSERT_EQUAL(std::string("axyzcxyzd"), testling);
         }
 
         void testReplaceAll_LastChar() {
             std::string testling("abc");
 
             String::replaceAll(testling, 'c', "xyz");
 
             CPPUNIT_ASSERT_EQUAL(std::string("abxyz"), testling);
         }
 
         void testReplaceAll_ConsecutiveChars() {
             std::string testling("abbc");
 
             String::replaceAll(testling, 'b',"xyz");
 
             CPPUNIT_ASSERT_EQUAL(std::string("axyzxyzc"), testling);
         }
 
         void testReplaceAll_MatchingReplace() {
             std::string testling("abc");
 
             String::replaceAll(testling, 'b',"bbb");
 
             CPPUNIT_ASSERT_EQUAL(std::string("abbbc"), testling);
         }
 
+        // Table-driven check: expectedValid[i] is the required result of isValidXMPPCharacter(testCharacters[i]).
+        void testIsValidXMPPCharacter() {
+            const std::uint32_t testCharacters[] = {
+                0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
+                0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
+                0x20, 0x7F, 0x80, 0x84, 0x85, 0xFF, 0xD7FF, 0xD800, 0xDFFF, 0xE000, 0xFDD0, 0xFDEF, 0xFFFD, 0xFFFF, 0x10000, 0x10FFFF, 0x110000 };
+            const auto testLength = sizeof(testCharacters) / sizeof(testCharacters[0]);
+            const bool expectedValid[] = {
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0 };
+            static_assert(testLength == sizeof(expectedValid) / sizeof(expectedValid[0]), "size of test data must match"); // compare element counts, not bytes
+            for (std::size_t i = 0; i != testLength; ++i) {
+                const auto c = testCharacters[i];
+                CPPUNIT_ASSERT_EQUAL_MESSAGE(boost::str(boost::format("While testing at idx=%d: 0x%02x") % i % c), expectedValid[i], String::isValidXMPPCharacter(c));
+            }
+        }
+
+        // Table-driven check: each pair is (input, expected result of sanitizeXMPPString(input)).
+        void testSanitizeXMPPString() {
+            std::vector<std::pair<std::string, std::string>> testData = {
+                { "\0", "" }, // converted from const char*, so this is an empty input; the explicit-length cases below carry real NUL bytes
+                { std::string("\0\t", 3), "\t" },
+                { "", "" },
+                { std::string("\0", 1) , std::string() },
+                { std::string("\0blah\0", 6) , std::string("blah", 4) },
+                { "z\xC3\x9F\xE6\xB0\xB4\xF0\x9D\x84\x8B" , "z\xC3\x9F\xE6\xB0\xB4\xF0\x9D\x84\x8B" }, // or in u8 notation: u8"z\u00df\u6c34\U0001d10b"
+                { "\x7FT\t\x0c\xff\xfeT", "T\tT" },
+                { "\x01Q\x0BW\x81T", "QWT" },
+                { "\xF0\x9F\x98\x83" "ABC" "\xE2\xBE\xA6", "\xF0\x9F\x98\x83" "ABC" "\xE2\xBE\xA6" }
+            };
+            for (std::size_t i = 0; i != testData.size(); ++i) {
+                const auto & t = testData[i];
+                const auto actual = String::sanitizeXMPPString(t.first);
+                CPPUNIT_ASSERT_EQUAL_MESSAGE(boost::str(boost::format("While testing string idx=%d") % i), t.second, actual); // macro order is (message, expected, actual)
+            }
+        }
+
         void testSplit() {
             std::vector<std::string> result = String::split("abc def ghi", ' ');
 
             CPPUNIT_ASSERT_EQUAL(3, static_cast<int>(result.size()));
             CPPUNIT_ASSERT_EQUAL(std::string("abc"), result[0]);
             CPPUNIT_ASSERT_EQUAL(std::string("def"), result[1]);
             CPPUNIT_ASSERT_EQUAL(std::string("ghi"), result[2]);
         }
 
 #ifdef SWIFTEN_PLATFORM_WINDOWS
         void testConvertWStringToString() {
             CPPUNIT_ASSERT_EQUAL(std::string("tron\xc3\xa7on"), convertWStringToString(std::wstring(L"tron\xe7on")));
         }
 
         void testConvertStringToWString() {
             CPPUNIT_ASSERT(std::wstring(L"tron\xe7on") == convertStringToWString(std::string("tron\xc3\xa7on")));
         }
 #endif
 };
 
 CPPUNIT_TEST_SUITE_REGISTRATION(StringTest);