summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
Diffstat (limited to 'SwifTools/SpellParser.cpp')
-rw-r--r--SwifTools/SpellParser.cpp69
1 files changed, 69 insertions, 0 deletions
diff --git a/SwifTools/SpellParser.cpp b/SwifTools/SpellParser.cpp
new file mode 100644
index 0000000..8f5120b
--- /dev/null
+++ b/SwifTools/SpellParser.cpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2011 Vlad Voicu
+ * Licensed under the Simplified BSD license.
+ * See Documentation/Licenses/BSD-simplified.txt for more information.
+ */
+
+#include <SwifTools/SpellParser.h>
+
+#include <boost/spirit/include/lex_lexertl.hpp>
+#include <boost/bind.hpp>
+#include <boost/ref.hpp>
+
+#include <string>
+
+namespace lex = boost::spirit::lex;
+
+namespace Swift {
+
+template <typename Lexer>
+struct word_count_tokens : lex::lexer<Lexer>
+{
+ word_count_tokens()
+ {
+ // define tokens (regular expresions) to match strings
+ // order is important
+ this->self.add
+ ("w{3}.[^ ]+", ID_WWW)
+ ("http:\\/\\/[^ ]+", ID_HTTP)
+ ("\\w{1,}['?|\\-?]?\\w{1,}", ID_WORD)
+ (".", ID_CHAR);
+ }
+};
+
+struct counter
+{
+ typedef bool result_type;
+ // the function operator gets called for each of the matched tokens
+ template <typename Token>
+ bool operator()(Token const& t, PositionPairVector& wordPositions, std::size_t& position) const
+ {
+ switch (t.id()) {
+ case ID_WWW:
+ position += t.value().size();
+ break;
+ case ID_HTTP:
+ position += t.value().size();
+ break;
+ case ID_WORD: // matched a word
+ wordPositions.push_back(boost::tuples::make_tuple(position, position + t.value().size()));
+ position += t.value().size();
+ break;
+ case ID_CHAR: // match a simple char
+ ++position;
+ break;
+ }
+ return true; // always continue to tokenize
+ }
+};
+
+void SpellParser::check(const std::string& fragment, PositionPairVector& wordPositions) {
+ std::size_t position = 0;
+ // create the token definition instance needed to invoke the lexical analyzer
+ word_count_tokens<lex::lexertl::lexer<> > word_count_functor;
+ char const* first = fragment.c_str();
+ char const* last = &first[fragment.size()];
+ lex::tokenize(first, last, word_count_functor, boost::bind(counter(), _1, boost::ref(wordPositions), boost::ref(position)));
+}
+
+}