From 80801aaeba2d29e3a375a01d782cf081e778dfaf Mon Sep 17 00:00:00 2001
From: Joanna Hulboj <joanna.hulboj@isode.com>
Date: Fri, 27 Jan 2017 10:03:51 +0000
Subject: Add checks to verify that the input data is valid XML/XMPP

Test-Information:

Unit tests pass OK.

Change-Id: Ibd7897c1c1249af7ac60a6549afd4c2362fcf0c0

diff --git a/COPYING.dependencies b/COPYING.dependencies
index d233d9f..08823e8 100644
--- a/COPYING.dependencies
+++ b/COPYING.dependencies
@@ -12,6 +12,7 @@ Source and binary distributions of this project may also contain third-party lib
 - libnatpmp
 Binary distributions may contain these third-party libraries, whose licenses are also below.
 - OpenSSL
+- UTF-8 DFA Decoder
 
 ===============
 Google Breakpad
@@ -832,3 +833,16 @@ derivative of this code cannot be changed.  i.e. this code cannot simply be
 copied and put under another distribution licence
 [including the GNU Public Licence.]
 
+=================
+UTF-8 DFA Decoder
+=================
+
+http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
+
+Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/Swiften/Base/String.cpp b/Swiften/Base/String.cpp
index 4a5e4c9..bbc3003 100644
--- a/Swiften/Base/String.cpp
+++ b/Swiften/Base/String.cpp
@@ -17,6 +17,54 @@
 #include <Swiften/Base/String.h>
 #include <Swiften/Base/ByteArray.h>
 
+namespace {
+const static std::uint32_t UTF8_ACCEPT = 0; // DFA start state; reaching it again means a complete character was decoded
+// UTF8D[0..255] maps an input byte to its character class; UTF8D[256 + state * 16 + class] gives the next DFA state.
+const static std::uint8_t UTF8D[] = {
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
+    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
+    8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
+    0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
+    0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
+    0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
+    1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
+    1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
+    1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
+};
+
+// One step of the UTF-8 DFA from http://bjoern.hoehrmann.de/utf-8/decoder/dfa/: consumes byte, updates codepoint and state, returns the new state.
+std::uint32_t decode(std::uint32_t & state, std::uint32_t & codepoint, std::uint8_t byte) {
+    const std::uint32_t byteClass = UTF8D[byte];
+    const bool midSequence = (state != UTF8_ACCEPT);
+    codepoint = midSequence ? ((codepoint << 6) | (byte & 0x3fu)) : ((0xffu >> byteClass) & byte);
+    return state = UTF8D[256 + state * 16 + byteClass];
+}
+
+std::uint32_t getNextCodepoint(const char * begin, const char * end, std::size_t & consumed, bool & ok) {
+    // Decodes one UTF-8 sequence from [begin, end). ok reports whether it was well-formed; consumed is the number of bytes read (only meaningful when ok).
+    const std::uint32_t UTF8_REJECT = 1; // absorbing error state: every transition out of state 1 in UTF8D leads back to 1
+    consumed = 0;
+
+    std::uint32_t state = UTF8_ACCEPT;
+    std::uint32_t codepoint = 0;
+
+    for (auto it = begin; it != end; ++it) {
+        ++consumed;
+        const auto next = decode(state, codepoint, static_cast<std::uint8_t>(*it));
+        if (next == UTF8_ACCEPT || next == UTF8_REJECT)
+            break; // complete character, or an error that cannot recover: do not scan the rest of the input
+    }
+    ok = (state == UTF8_ACCEPT); // false for a rejected sequence and for one truncated by the end of the input
+    return codepoint;
+}
+
+}
+
 namespace Swift {
 
 static inline size_t sequenceLength(char firstByte) {
@@ -86,6 +134,55 @@ void String::replaceAll(std::string& src, char c, const std::string& s) {
     }
 }
 
+bool String::isValidXMPPCharacter(std::uint32_t codepoint) {
+    // Returns true when codepoint is allowed by the XML 1.0 Char production and is not a discouraged character. Tab, CR and LF are allowed:
+    if (codepoint == '\t' || codepoint == '\r' || codepoint == '\n')
+        return true;
+    // Discouraged characters (DEL and C1 controls other than U+0085, and the Unicode noncharacters):
+    if (codepoint >= 0x7Fu && codepoint <= 0x84u)
+        return false;
+    if (codepoint >= 0x86u && codepoint <= 0x9Fu)
+        return false;
+    if (codepoint >= 0xFDD0u && codepoint <= 0xFDEFu)
+        return false;
+    if ((codepoint & 0xFFFEu) == 0xFFFEu) // U+xFFFE and U+xFFFF in every plane; the low bit is masked off so one compare covers both
+        return false;
+    // Other valid characters (after filtering for discouraged ones above)
+    if (codepoint >= 0x20u && codepoint <= 0xD7FFu)
+        return true;
+    if (codepoint >= 0xE000u && codepoint <= 0xFFFDu)
+        return true;
+    if (codepoint >= 0x10000u && codepoint <= 0x10FFFFu)
+        return true;
+    return false; // remaining C0 controls, surrogates and values above U+10FFFF
+}
+
+std::string String::sanitizeXMPPString(const std::string& input) {
+    // Returns a copy of input without bytes that are not well-formed UTF-8 and without characters rejected by isValidXMPPCharacter().
+    std::string result;
+    result.reserve(input.length());
+
+    const char* cursor = input.data();
+    const char* const end = cursor + input.length();
+
+    while (cursor < end) {
+        std::size_t consumed = 0;
+        bool decoded = false;
+        const std::uint32_t codepoint = getNextCodepoint(cursor, end, consumed, decoded);
+        if (!decoded) {
+            // Not the start of a well-formed sequence: drop this single byte and retry from the next one.
+            ++cursor;
+            continue;
+        }
+        if (isValidXMPPCharacter(codepoint)) {
+            result.append(cursor, consumed); // keep the character's original byte sequence
+        }
+        cursor += consumed;
+    }
+    result.shrink_to_fit();
+    return result;
+}
+
 std::vector<std::string> String::split(const std::string& s, char c) {
     assert((c & 0x80) == 0);
     std::vector<std::string> result;
@@ -119,7 +216,6 @@ int String::convertHexStringToInt(const std::string& s) {
     return h;
 }
 
-
 #ifdef SWIFTEN_PLATFORM_WINDOWS
 std::string convertWStringToString(const std::wstring& s) {
     int utf8Size = WideCharToMultiByte(CP_UTF8, 0, s.c_str(), -1, NULL, 0, NULL, NULL);
diff --git a/Swiften/Base/String.h b/Swiften/Base/String.h
index f19dca5..3a7ca65 100644
--- a/Swiften/Base/String.h
+++ b/Swiften/Base/String.h
@@ -6,6 +6,7 @@
 
 #pragma once
 
+#include <cstdint>
 #include <sstream>
 #include <string>
 #include <vector>
@@ -21,6 +22,8 @@ namespace Swift {
             SWIFTEN_API std::pair<std::string, std::string> getSplittedAtFirst(const std::string&, char c);
             SWIFTEN_API std::vector<std::string> split(const std::string&, char c);
             SWIFTEN_API void replaceAll(std::string&, char c, const std::string& s);
+            SWIFTEN_API bool isValidXMPPCharacter(std::uint32_t codepoint);
+            SWIFTEN_API std::string sanitizeXMPPString(const std::string& input);
 
             inline bool beginsWith(const std::string& s, char c) {
                 return s.size() > 0 && s[0] == c;
diff --git a/Swiften/Base/UnitTest/StringTest.cpp b/Swiften/Base/UnitTest/StringTest.cpp
index e2e1665..889c9c7 100644
--- a/Swiften/Base/UnitTest/StringTest.cpp
+++ b/Swiften/Base/UnitTest/StringTest.cpp
@@ -12,9 +12,11 @@
 #include <Swiften/Base/Platform.h>
 #include <Swiften/Base/String.h>
 
+#include <boost/format.hpp>
+
 using namespace Swift;
 
-class StringTest : public CppUnit::TestFixture {
+ class StringTest : public CppUnit::TestFixture {
         CPPUNIT_TEST_SUITE(StringTest);
         CPPUNIT_TEST(testGetUnicodeCodePoints);
         CPPUNIT_TEST(testGetSplittedAtFirst);
@@ -25,6 +27,8 @@ class StringTest : public CppUnit::TestFixture {
         CPPUNIT_TEST(testReplaceAll_LastChar);
         CPPUNIT_TEST(testReplaceAll_ConsecutiveChars);
         CPPUNIT_TEST(testReplaceAll_MatchingReplace);
+        CPPUNIT_TEST(testIsValidXMPPCharacter);
+        CPPUNIT_TEST(testSanitizeXMPPString);
         CPPUNIT_TEST(testSplit);
 #ifdef SWIFTEN_PLATFORM_WINDOWS
         CPPUNIT_TEST(testConvertWStringToString);
@@ -107,6 +111,44 @@ class StringTest : public CppUnit::TestFixture {
             CPPUNIT_ASSERT_EQUAL(std::string("abbbc"), testling);
         }
 
+        void testIsValidXMPPCharacter() { // checks isValidXMPPCharacter() against a table of code points and expected verdicts
+            const std::uint32_t testCharacters[] = {
+                0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
+                0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
+                0x20, 0x7F, 0x80, 0x84, 0x85, 0xFF };
+            const auto testLength = sizeof(testCharacters) / sizeof(testCharacters[0]);
+            const bool expectedValid[] = {
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                1, 0, 0, 0, 1, 1 };
+            static_assert(testLength == sizeof(expectedValid) / sizeof(expectedValid[0]), "size of test data must match"); // compare element counts, not bytes: sizeof(bool) is implementation-defined
+
+            for (std::size_t i = 0; i != testLength; ++i) {
+                const auto c = testCharacters[i];
+                CPPUNIT_ASSERT_EQUAL_MESSAGE(boost::str(boost::format("While testing at idx=%d: 0x%02x") % i % c), expectedValid[i], String::isValidXMPPCharacter(c));
+            }
+        }
+
+        void testSanitizeXMPPString() { // each pair is { input, expected sanitizeXMPPString(input) }
+            std::vector<std::pair<std::string, std::string>> testData = {
+                { "\0", "" }, // a const char* literal ends at the first NUL, so this input is the empty string
+                { std::string("\0\t", 3), "\t" },
+                { "", "" },
+                { std::string("\0", 1) , std::string() },
+                { std::string("\0blah\0", 6) , std::string("blah", 4) },
+                { "z\xC3\x9F\xE6\xB0\xB4\xF0\x9D\x84\x8B" , "z\xC3\x9F\xE6\xB0\xB4\xF0\x9D\x84\x8B" }, // or in u8 notation: u8"z\u00df\u6c34\U0001d10b"
+                { "\x7FT\t\x0c\xff\xfeT", "T\tT" },
+                { "\x01Q\x0BW\x81T", "QWT" },
+                { "\xF0\x9F\x98\x83" "ABC" "\xE2\xBE\xA6", "\xF0\x9F\x98\x83" "ABC" "\xE2\xBE\xA6" }
+            };
+
+            for (std::size_t i = 0; i != testData.size(); ++i) {
+                const auto & t = testData[i];
+                const auto actual = String::sanitizeXMPPString(t.first);
+                CPPUNIT_ASSERT_EQUAL_MESSAGE(boost::str(boost::format("While testing string idx=%d") % i), t.second, actual); // macro order is (message, expected, actual)
+            }
+        }
+
         void testSplit() {
             std::vector<std::string> result = String::split("abc def ghi", ' ');
 
diff --git a/Swiften/Serializer/StanzaSerializer.cpp b/Swiften/Serializer/StanzaSerializer.cpp
index f2f5529..3de706d 100644
--- a/Swiften/Serializer/StanzaSerializer.cpp
+++ b/Swiften/Serializer/StanzaSerializer.cpp
@@ -9,6 +9,7 @@
 #include <sstream>
 #include <typeinfo>
 
+#include <Swiften/Base/String.h>
 #include <Swiften/Base/Log.h>
 #include <Swiften/Elements/Stanza.h>
 #include <Swiften/Serializer/PayloadSerializer.h>
@@ -55,6 +56,7 @@ SafeByteArray StanzaSerializer::serialize(std::shared_ptr<ToplevelElement> eleme
             SWIFT_LOG(warning) << "Could not find serializer for " << typeid(*(payload.get())).name() << std::endl;
         }
     }
+    serializedPayloads = String::sanitizeXMPPString(serializedPayloads);
     if (!serializedPayloads.empty()) {
         stanzaElement.addNode(std::make_shared<XMLRawTextNode>(serializedPayloads));
     }
-- 
cgit v0.10.2-6-g49f6