summary refs log tree commit diff stats
diff options
context:
space:
mode:
Diffstat (limited to 'Swiften/Base')
-rw-r--r-- Swiften/Base/String.cpp 98
-rw-r--r-- Swiften/Base/String.h 3
-rw-r--r-- Swiften/Base/UnitTest/StringTest.cpp 44
3 files changed, 143 insertions, 2 deletions
diff --git a/Swiften/Base/String.cpp b/Swiften/Base/String.cpp
index 4a5e4c9..bbc3003 100644
--- a/Swiften/Base/String.cpp
+++ b/Swiften/Base/String.cpp
@@ -17,6 +17,54 @@
#include <Swiften/Base/String.h>
#include <Swiften/Base/ByteArray.h>
+namespace {
+// States of the UTF-8 decoding automaton below. Any other state value means
+// "in the middle of a multi-byte sequence".
+const std::uint32_t UTF8_ACCEPT = 0;
+// Absorbing error state: the s1 row of the table maps every class back to 1.
+const std::uint32_t UTF8_REJECT = 1;
+
+// Lookup table of the decoder. The first 256 entries map a byte to its
+// character class; the remaining entries map (state * 16 + class) to the
+// next state.
+const std::uint8_t UTF8D[] = {
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
+    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
+    8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
+    0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
+    0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
+    0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
+    1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
+    1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
+    1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
+};
+
+// http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
+// Feeds a single byte into the automaton.
+// state     - in/out: current automaton state, updated to the next state.
+// codepoint - in/out: code point accumulated so far; restarted from the lead
+//             byte's payload bits whenever state is UTF8_ACCEPT on entry.
+// Returns the new state.
+std::uint32_t decode(std::uint32_t& state, std::uint32_t& codepoint, std::uint8_t byte) {
+    const std::uint32_t type = UTF8D[byte];
+    if (state != UTF8_ACCEPT) {
+        // Continuation byte: append its low six bits.
+        codepoint = (byte & 0x3fu) | (codepoint << 6);
+    }
+    else {
+        // First byte of a sequence: mask off the length marker bits.
+        codepoint = (0xffu >> type) & byte;
+    }
+    state = UTF8D[256 + state * 16 + type];
+    return state;
+}
+
+// Decodes the first code point of the byte range [begin, end).
+// consumed - out: number of bytes examined. On success this is the length of
+//            the decoded sequence; on failure it counts the bytes up to and
+//            including the one that made the sequence invalid (or all bytes
+//            of a truncated sequence).
+// ok       - out: false if the range starts with an invalid or truncated
+//            sequence, true otherwise.
+// Returns the decoded code point; the value is only meaningful when ok is
+// true. An empty range yields ok == true, consumed == 0 and the value 0.
+std::uint32_t getNextCodepoint(const char* begin, const char* end, std::size_t& consumed, bool& ok) {
+    consumed = 0;
+    ok = true;
+
+    std::uint32_t state = UTF8_ACCEPT;
+    std::uint32_t codepoint = 0;
+
+    for (const char* it = begin; it != end; ++it) {
+        ++consumed;
+        const std::uint32_t next = decode(state, codepoint, static_cast<std::uint8_t>(*it));
+        if (next == UTF8_ACCEPT) {
+            return codepoint;
+        }
+        if (next == UTF8_REJECT) {
+            // The reject state can never be left again, so scanning the rest
+            // of the input would only waste time (and makes callers that
+            // retry byte by byte quadratic).
+            break;
+        }
+    }
+    // Reached for an empty range, a rejected sequence or a truncated one.
+    ok = (state == UTF8_ACCEPT);
+    return codepoint;
+}
+
+}
+
namespace Swift {
static inline size_t sequenceLength(char firstByte) {
@@ -86,6 +134,55 @@ void String::replaceAll(std::string& src, char c, const std::string& s) {
}
}
+// Returns true if the given Unicode code point may appear in XMPP (XML)
+// character data and is not one of the characters the XML specification
+// discourages; returns false otherwise.
+bool String::isValidXMPPCharacter(std::uint32_t codepoint) {
+    // Control characters that are explicitly accepted.
+    if (codepoint == '\t' || codepoint == '\r' || codepoint == '\n') {
+        return true;
+    }
+    // Discouraged characters: DEL and the C1 controls, except 0x85.
+    if (codepoint >= 0x7Fu && codepoint <= 0x84u) {
+        return false;
+    }
+    if (codepoint >= 0x86u && codepoint <= 0x9Fu) {
+        return false;
+    }
+    // Discouraged characters: the noncharacter block U+FDD0..U+FDEF.
+    if (codepoint >= 0xFDD0u && codepoint <= 0xFDEFu) {
+        return false;
+    }
+    // Discouraged characters: the last two code points of every plane
+    // (U+nFFFE and U+nFFFF). Clearing the lowest bit folds both into one test.
+    if ((codepoint & 0xFFFEu) == 0xFFFEu) {
+        return false;
+    }
+    // Remaining valid ranges (after filtering the discouraged ones above);
+    // the gap between them excludes the surrogates U+D800..U+DFFF.
+    if (codepoint >= 0x20u && codepoint <= 0xD7FFu) {
+        return true;
+    }
+    if (codepoint >= 0xE000u && codepoint <= 0xFFFDu) {
+        return true;
+    }
+    if (codepoint >= 0x10000u && codepoint <= 0x10FFFFu) {
+        return true;
+    }
+    // Everything else: other C0 controls, surrogates, values above U+10FFFF.
+    return false;
+}
+
+
+// Returns a copy of input that keeps only well-formed UTF-8 sequences whose
+// code point passes isValidXMPPCharacter(). Bytes that do not start a
+// decodable sequence are dropped one at a time, so decoding resynchronises
+// at the next byte.
+std::string String::sanitizeXMPPString(const std::string& input) {
+    std::string result;
+    result.reserve(input.length());
+
+    const char* it = input.data();
+    const char* const end = it + input.length();
+
+    while (it < end) {
+        std::size_t consumed = 0;
+        bool decoded = false;
+        const std::uint32_t codepoint = getNextCodepoint(it, end, consumed, decoded);
+        if (!decoded) {
+            // Not the start of a valid sequence: skip this byte and retry.
+            ++it;
+            continue;
+        }
+        if (isValidXMPPCharacter(codepoint)) {
+            // Copy the original bytes of the sequence unchanged.
+            result.append(it, consumed);
+        }
+        it += consumed;
+    }
+    result.shrink_to_fit();
+    return result;
+}
+
+
std::vector<std::string> String::split(const std::string& s, char c) {
assert((c & 0x80) == 0);
std::vector<std::string> result;
@@ -119,7 +216,6 @@ int String::convertHexStringToInt(const std::string& s) {
return h;
}
-
#ifdef SWIFTEN_PLATFORM_WINDOWS
std::string convertWStringToString(const std::wstring& s) {
int utf8Size = WideCharToMultiByte(CP_UTF8, 0, s.c_str(), -1, NULL, 0, NULL, NULL);
diff --git a/Swiften/Base/String.h b/Swiften/Base/String.h
index f19dca5..3a7ca65 100644
--- a/Swiften/Base/String.h
+++ b/Swiften/Base/String.h
@@ -6,6 +6,7 @@
#pragma once
+#include <cstdint>
#include <sstream>
#include <string>
#include <vector>
@@ -21,6 +22,8 @@ namespace Swift {
SWIFTEN_API std::pair<std::string, std::string> getSplittedAtFirst(const std::string&, char c);
SWIFTEN_API std::vector<std::string> split(const std::string&, char c);
SWIFTEN_API void replaceAll(std::string&, char c, const std::string& s);
+ // Returns whether the given code point is accepted by the XMPP character
+ // filter. Tab, carriage return and line feed are accepted; all other code
+ // points below 0x20 are rejected.
+ SWIFTEN_API bool isValidXMPPCharacter(std::uint32_t codepoint);
+ // Returns a copy of input from which bytes that do not decode as UTF-8 and
+ // characters rejected by isValidXMPPCharacter() have been removed.
+ SWIFTEN_API std::string sanitizeXMPPString(const std::string& input);
inline bool beginsWith(const std::string& s, char c) {
return s.size() > 0 && s[0] == c;
diff --git a/Swiften/Base/UnitTest/StringTest.cpp b/Swiften/Base/UnitTest/StringTest.cpp
index e2e1665..889c9c7 100644
--- a/Swiften/Base/UnitTest/StringTest.cpp
+++ b/Swiften/Base/UnitTest/StringTest.cpp
@@ -12,9 +12,11 @@
#include <Swiften/Base/Platform.h>
#include <Swiften/Base/String.h>
+#include <boost/format.hpp>
+
using namespace Swift;
-class StringTest : public CppUnit::TestFixture {
+ class StringTest : public CppUnit::TestFixture {
CPPUNIT_TEST_SUITE(StringTest);
CPPUNIT_TEST(testGetUnicodeCodePoints);
CPPUNIT_TEST(testGetSplittedAtFirst);
@@ -25,6 +27,8 @@ class StringTest : public CppUnit::TestFixture {
CPPUNIT_TEST(testReplaceAll_LastChar);
CPPUNIT_TEST(testReplaceAll_ConsecutiveChars);
CPPUNIT_TEST(testReplaceAll_MatchingReplace);
+ CPPUNIT_TEST(testIsValidXMPPCharacter);
+ CPPUNIT_TEST(testSanitizeXMPPString);
CPPUNIT_TEST(testSplit);
#ifdef SWIFTEN_PLATFORM_WINDOWS
CPPUNIT_TEST(testConvertWStringToString);
@@ -107,6 +111,44 @@ class StringTest : public CppUnit::TestFixture {
CPPUNIT_ASSERT_EQUAL(std::string("abbbc"), testling);
}
+ // Checks String::isValidXMPPCharacter() against a table of code points:
+ // expectedValid[i] is the expected verdict for testCharacters[i].
+ void testIsValidXMPPCharacter() {
+     const std::uint32_t testCharacters[] = {
+         0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
+         0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
+         0x20, 0x7F, 0x80, 0x84, 0x85, 0xFF,
+         0x86, 0x9F, 0xA0, 0xD7FF, 0xD800, 0xDFFF, 0xE000, 0xFDD0, 0xFDEF,
+         0xFFFD, 0xFFFE, 0xFFFF, 0x10000, 0x10FFFF, 0x110000 };
+     const bool expectedValid[] = {
+         0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
+         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+         1, 0, 0, 0, 1, 1,
+         0, 0, 1, 1, 0, 0, 1, 0, 0,
+         1, 0, 0, 1, 0, 0 };
+     // Compare element counts, not byte sizes, so the check does not rely on
+     // sizeof(bool) being 1.
+     const auto testLength = sizeof(testCharacters) / sizeof(testCharacters[0]);
+     static_assert(testLength == sizeof(expectedValid) / sizeof(expectedValid[0]), "size of test data must match");
+
+     for (std::size_t i = 0; i != testLength; ++i) {
+         const auto c = testCharacters[i];
+         CPPUNIT_ASSERT_EQUAL_MESSAGE(boost::str(boost::format("While testing at idx=%d: 0x%02x") % i % c), expectedValid[i], String::isValidXMPPCharacter(c));
+     }
+ }
+
+
+ // Checks String::sanitizeXMPPString() against (input, expected output) pairs.
+ void testSanitizeXMPPString() {
+     const std::vector<std::pair<std::string, std::string>> testData = {
+         { "\0", "" }, // the literal ends at the NUL, so this input is the empty string
+         { std::string("\0\t", 3), "\t" }, // length 3 includes the literal's terminating NUL
+         { "", "" },
+         { std::string("\0", 1) , std::string() },
+         { std::string("\0blah\0", 6) , std::string("blah", 4) },
+         { "z\xC3\x9F\xE6\xB0\xB4\xF0\x9D\x84\x8B" , "z\xC3\x9F\xE6\xB0\xB4\xF0\x9D\x84\x8B" }, // or in u8 notation: u8"z\u00df\u6c34\U0001d10b"
+         { "\x7FT\t\x0c\xff\xfeT", "T\tT" },
+         { "\x01Q\x0BW\x81T", "QWT" },
+         { "\xF0\x9F\x98\x83" "ABC" "\xE2\xBE\xA6", "\xF0\x9F\x98\x83" "ABC" "\xE2\xBE\xA6" }
+     };
+
+     for (std::size_t i = 0; i != testData.size(); ++i) {
+         const auto& t = testData[i];
+         const std::string actual = String::sanitizeXMPPString(t.first);
+         // CppUnit expects (message, expected, actual); keeping that order
+         // makes failure reports label the two values correctly.
+         CPPUNIT_ASSERT_EQUAL_MESSAGE(boost::str(boost::format("While testing string idx=%d") % i), t.second, actual);
+     }
+ }
+
+
void testSplit() {
std::vector<std::string> result = String::split("abc def ghi", ' ');