1 files changed, 200 insertions, 104 deletions
diff --git a/Swiften/Base/String.cpp b/Swiften/Base/String.cpp
index 40ea2e1..bbc3003 100644
--- a/Swiften/Base/String.cpp
+++ b/Swiften/Base/String.cpp
@@ -1,7 +1,7 @@
 /*
- * Copyright (c) 2010-2013 Remko Tronçon
- * Licensed under the GNU General Public License v3.
- * See Documentation/Licenses/GPLv3.txt for more information.
+ * Copyright (c) 2010-2016 Isode Limited.
+ * All rights reserved.
+ * See the COPYING file for more information.
  */
 
 #include <Swiften/Base/Platform.h>
@@ -17,136 +17,232 @@
 #include <Swiften/Base/String.h>
 #include <Swiften/Base/ByteArray.h>
 
+namespace {
+const static std::uint32_t UTF8_ACCEPT = 0;
+
+const static std::uint8_t UTF8D[] = {
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
+    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
+    8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
+    0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
+    0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
+    0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
+    1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
+    1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
+    1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
+};
+
+//http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
+std::uint32_t decode(std::uint32_t & state, std::uint32_t & codepoint, std::uint8_t byte) {
+    const auto type = UTF8D[byte];
+    codepoint = (state != UTF8_ACCEPT) ? (byte & 0x3fu) | (codepoint << 6) : (0xff >> type) & (byte);
+    state = UTF8D[256 + state * 16 + type];
+    return state;
+}
+
+std::uint32_t getNextCodepoint(const char * begin, const char * end, std::size_t & consumed, bool & ok) {
+    consumed = 0;
+    ok = true;
+
+    std::uint32_t state = 0;
+    std::uint32_t codepoint = 0;
+
+    for (auto it = begin; it != end; ++it) {
+        ++consumed;
+        if (!decode(state, codepoint, static_cast<std::uint8_t>(*it)))
+            return codepoint;
+    }
+    if (state != UTF8_ACCEPT) {
+        ok = false;
+    }
+    return codepoint;
+}
+
+}
+
 namespace Swift {
 
 static inline size_t sequenceLength(char firstByte) {
-	if ((firstByte & 0x80) == 0) {
-		return 1;
-	}
-	if ((firstByte & 0xE0) == 0xC0) {
-		return 2;
-	}
-	if ((firstByte & 0xF0) == 0xE0) {
-		return 3;
-	}
-	if ((firstByte & 0xF8) == 0xF0) {
-		return 4;
-	}
-	if ((firstByte & 0xFC) == 0xF8) {
-		return 5;
-	}
-	if ((firstByte & 0xFE) == 0xFC) {
-		return 6;
-	}
-	assert(false);
-	return 1;
+    if ((firstByte & 0x80) == 0) {
+        return 1;
+    }
+    if ((firstByte & 0xE0) == 0xC0) {
+        return 2;
+    }
+    if ((firstByte & 0xF0) == 0xE0) {
+        return 3;
+    }
+    if ((firstByte & 0xF8) == 0xF0) {
+        return 4;
+    }
+    if ((firstByte & 0xFC) == 0xF8) {
+        return 5;
+    }
+    if ((firstByte & 0xFE) == 0xFC) {
+        return 6;
+    }
+    assert(false);
+    return 1;
 }
 
 std::vector<unsigned int> String::getUnicodeCodePoints(const std::string& s) {
-	std::vector<unsigned int> result;
-	for (size_t i = 0; i < s.size();) {
-		unsigned int codePoint = 0;
-		char firstChar = s[i];
-		size_t length = sequenceLength(firstChar);
-
-		// First character is special
-		size_t firstCharBitSize = 7 - length;
-		if (length == 1) {
-			firstCharBitSize = 7;
-		}
-		codePoint = firstChar & ((1<<(firstCharBitSize+1)) - 1);
-
-		for (size_t j = 1; j < length; ++j) {
-			codePoint = (codePoint<<6) | (s[i+j] & 0x3F);
-		}
-		result.push_back(codePoint);
-		i += length;
-	}
-	return result;
+    std::vector<unsigned int> result;
+    for (size_t i = 0; i < s.size();) {
+        unsigned int codePoint = 0;
+        char firstChar = s[i];
+        size_t length = sequenceLength(firstChar);
+
+        // First character is special
+        size_t firstCharBitSize = 7 - length;
+        if (length == 1) {
+            firstCharBitSize = 7;
+        }
+        codePoint = firstChar & ((1<<(firstCharBitSize+1)) - 1);
+
+        for (size_t j = 1; j < length; ++j) {
+            codePoint = (codePoint<<6) | (s[i+j] & 0x3F);
+        }
+        result.push_back(codePoint);
+        i += length;
+    }
+    return result;
 }
 
 
 std::pair<std::string,std::string> String::getSplittedAtFirst(const std::string& s, char c) {
-	assert((c & 0x80) == 0);
-	size_t firstMatch = s.find(c);
-	if (firstMatch != s.npos) {
-		return std::make_pair(s.substr(0,firstMatch),s.substr(firstMatch+1,s.npos));
-	}
-	else {
-		return std::make_pair(s, "");
-	}
+    assert((c & 0x80) == 0);
+    size_t firstMatch = s.find(c);
+    if (firstMatch != s.npos) {
+        return std::make_pair(s.substr(0,firstMatch),s.substr(firstMatch+1,s.npos));
+    }
+    else {
+        return std::make_pair(s, "");
+    }
 }
 
 void String::replaceAll(std::string& src, char c, const std::string& s) {
-	size_t lastPos = 0;
-	size_t matchingIndex = 0;
-	while ((matchingIndex = src.find(c, lastPos)) != src.npos) {
-		src.replace(matchingIndex, 1, s);
-		lastPos = matchingIndex + s.size();
-	}
+    size_t lastPos = 0;
+    size_t matchingIndex = 0;
+    while ((matchingIndex = src.find(c, lastPos)) != src.npos) {
+        src.replace(matchingIndex, 1, s);
+        lastPos = matchingIndex + s.size();
+    }
+}
+
+bool String::isValidXMPPCharacter(std::uint32_t codepoint) {
+    // Special accepted characters:
+    if (codepoint == '\t' || codepoint == '\r' || codepoint == '\n')
+        return true;
+    // Discouraged characters:
+    if (codepoint >= 0x7Fu && codepoint <= 0x84u)
+        return false;
+    if (codepoint >= 0x86u && codepoint <= 0x9Fu)
+        return false;
+    if (codepoint >= 0xFDD0u && codepoint <= 0xFDEFu)
+        return false;
+    if (((codepoint & 0xFFFEu) == 0xFFEEu) || ((codepoint & 0xFFFFu) == 0xFFFFu))
+        return false;
+    // Other valid characters (after filtering for discouraged ones above)
+    if (codepoint >= 0x20u && codepoint <= 0xD7FFu)
+        return true;
+    if (codepoint >= 0xE000u && codepoint <= 0xFFFDu)
+        return true;
+    if (codepoint >= 0x10000u && codepoint <= 0x10FFFFu)
+        return true;
+    return false;
+}
+
+std::string String::sanitizeXMPPString(const std::string& input) {
+    std::string result;
+    result.reserve(input.length());
+
+    auto it = input.data();
+    const auto end = it + input.length();
+
+    std::size_t consumed;
+    bool status = UTF8_ACCEPT;
+
+    while (it < end) {
+        const auto codepoint = getNextCodepoint(it, end, consumed, status);
+        if (status) {
+            if (isValidXMPPCharacter(codepoint)) {
+                std::copy(it, it + consumed, std::back_inserter(result));
+            }
+            it += consumed;
+        }
+        else {
+            ++it;
+        }
+    }
+    result.shrink_to_fit();
+    return result;
 }
 
 std::vector<std::string> String::split(const std::string& s, char c) {
-	assert((c & 0x80) == 0);
-	std::vector<std::string> result;
-	std::string accumulator;
-	for (size_t i = 0; i < s.size(); ++i) {
-		if (s[i] == c) {
-			result.push_back(accumulator);
-			accumulator = "";
-		}
-		else {
-			accumulator += s[i];
-		}
-	}
-	result.push_back(accumulator);
-	return result;
+    assert((c & 0x80) == 0);
+    std::vector<std::string> result;
+    std::string accumulator;
+    for (char i : s) {
+        if (i == c) {
+            result.push_back(accumulator);
+            accumulator = "";
+        }
+        else {
+            accumulator += i;
+        }
+    }
+    result.push_back(accumulator);
+    return result;
 }
 
 std::string String::convertIntToHexString(int h) {
-	std::stringstream ss;
-	ss << std::setbase(16);
-	ss << h;
-	return ss.str();
+    std::stringstream ss;
+    ss << std::setbase(16);
+    ss << h;
+    return ss.str();
 }
 
 int String::convertHexStringToInt(const std::string& s) {
-	std::stringstream ss;
-	int h;
-	ss << std::setbase(16);
-	ss << s;
-	ss >> h;
-	return h;
+    std::stringstream ss;
+    int h;
+    ss << std::setbase(16);
+    ss << s;
+    ss >> h;
+    return h;
 }
 
-
 #ifdef SWIFTEN_PLATFORM_WINDOWS
 std::string convertWStringToString(const std::wstring& s) {
-	int utf8Size = WideCharToMultiByte(CP_UTF8, 0, s.c_str(), -1, NULL, 0, NULL, NULL);
-	if (utf8Size < 0) {
-		throw std::runtime_error("Conversion error");
-	}
-	std::vector<char> utf8Data(utf8Size);
-	int result = WideCharToMultiByte(
-			CP_UTF8, 0, s.c_str(), -1, vecptr(utf8Data), utf8Data.size(), NULL, NULL);
-	if (result < 0) {
-		throw std::runtime_error("Conversion error");
-	}
-	return std::string(vecptr(utf8Data), utf8Size-1 /* trailing 0 character */);
+    int utf8Size = WideCharToMultiByte(CP_UTF8, 0, s.c_str(), -1, NULL, 0, NULL, NULL);
+    if (utf8Size < 0) {
+        throw std::runtime_error("Conversion error");
+    }
+    std::vector<char> utf8Data(utf8Size);
+    int result = WideCharToMultiByte(
+            CP_UTF8, 0, s.c_str(), -1, vecptr(utf8Data), utf8Data.size(), NULL, NULL);
+    if (result < 0) {
+        throw std::runtime_error("Conversion error");
+    }
+    return std::string(vecptr(utf8Data), utf8Size-1 /* trailing 0 character */);
 }
 
 std::wstring convertStringToWString(const std::string& s) {
-	int utf16Size = MultiByteToWideChar(CP_UTF8, 0, s.c_str(), -1, NULL, 0);
-	if (utf16Size < 0) {
-		throw std::runtime_error("Conversion error");
-	}
-	std::vector<wchar_t> utf16Data(utf16Size);
-	int result = MultiByteToWideChar(
-			CP_UTF8, 0, s.c_str(), -1, vecptr(utf16Data), utf16Data.size());
-	if (result < 0) {
-		throw std::runtime_error("Conversion error");
-	}
-	return std::wstring(vecptr(utf16Data), utf16Size-1 /* trailing 0 character */);
+    int utf16Size = MultiByteToWideChar(CP_UTF8, 0, s.c_str(), -1, NULL, 0);
+    if (utf16Size < 0) {
+        throw std::runtime_error("Conversion error");
+    }
+    std::vector<wchar_t> utf16Data(utf16Size);
+    int result = MultiByteToWideChar(
+            CP_UTF8, 0, s.c_str(), -1, vecptr(utf16Data), utf16Data.size());
+    if (result < 0) {
+        throw std::runtime_error("Conversion error");
+    }
+    return std::wstring(vecptr(utf16Data), utf16Size-1 /* trailing 0 character */);
 }
 #endif