summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorVlad Voicu <vladvoic@gmail.com>2011-11-28 16:37:32 (GMT)
committerVlad Voicu <vladvoic@gmail.com>2013-03-15 09:21:52 (GMT)
commit2061b06eccca67595c50edd81c44c5b961bf108b (patch)
tree7fdc9e4cc80a9d8ddbe5364a531ef3449f72ab2b /SwifTools
parenta069a0df0f51a948a86e34d99f952a33eecd97ba (diff)
downloadswift-2061b06eccca67595c50edd81c44c5b961bf108b.zip
swift-2061b06eccca67595c50edd81c44c5b961bf108b.tar.bz2
Spell checker implementation using Hunspell
Change-Id: Ia15b6532edf6eef7c45bdfb273e77f65ce998f13 License: This patch is BSD-licensed, see Documentation/Licenses/BSD-simplified.txt for details
Diffstat (limited to 'SwifTools')
-rw-r--r--SwifTools/HunspellChecker.cpp55
-rw-r--r--SwifTools/HunspellChecker.h27
-rw-r--r--SwifTools/SConscript12
-rw-r--r--SwifTools/SpellChecker.h30
-rw-r--r--SwifTools/SpellCheckerFactory.cpp34
-rw-r--r--SwifTools/SpellCheckerFactory.h20
-rw-r--r--SwifTools/SpellParser.cpp69
-rw-r--r--SwifTools/SpellParser.h31
-rw-r--r--SwifTools/UnitTest/SConscript5
-rw-r--r--SwifTools/UnitTest/SpellParserTest.cpp51
10 files changed, 334 insertions, 0 deletions
diff --git a/SwifTools/HunspellChecker.cpp b/SwifTools/HunspellChecker.cpp
new file mode 100644
index 0000000..ecd352e
--- /dev/null
+++ b/SwifTools/HunspellChecker.cpp
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2011 Vlad Voicu
+ * Licensed under the Simplified BSD license.
+ * See Documentation/Licenses/BSD-simplified.txt for more information.
+ */
+
+#include <SwifTools/HunspellChecker.h>
+
+#include <algorithm>
+#include <hunspell/hunspell.hxx>
+#include <boost/algorithm/string.hpp>
+
+
+namespace Swift {
+
+HunspellChecker::HunspellChecker(const char* affix_path, const char* dictionary_path) {
+ speller_ = new Hunspell(affix_path, dictionary_path);
+}
+
+HunspellChecker::~HunspellChecker() {
+ delete speller_;
+}
+
+bool HunspellChecker::isCorrect(const std::string& word) {
+ return speller_->spell(word.c_str());
+}
+
+void HunspellChecker::getSuggestions(const std::string& word, std::vector<std::string>& list) {
+ char **suggestList;
+ int words_returned;
+ if (!word.empty()) {
+ words_returned = speller_->suggest(&suggestList, word.c_str());
+ }
+ for (int i = 0; i < words_returned; ++i) {
+ list.push_back(suggestList[i]);
+ free(suggestList[i]);
+ }
+ free(suggestList);
+}
+
+void HunspellChecker::checkFragment(const std::string& fragment, PositionPairList& misspelledPositions) {
+ if (!fragment.empty()) {
+ parser_->check(fragment, misspelledPositions);
+ for (PositionPairList::iterator it = misspelledPositions.begin(); it != misspelledPositions.end();) {
+ if (isCorrect(fragment.substr(boost::get<0>(*it), boost::get<1>(*it) - boost::get<0>(*it)))) {
+ it = misspelledPositions.erase(it);
+ }
+ else {
+ ++it;
+ }
+ }
+ }
+}
+
+}
diff --git a/SwifTools/HunspellChecker.h b/SwifTools/HunspellChecker.h
new file mode 100644
index 0000000..12c0485
--- /dev/null
+++ b/SwifTools/HunspellChecker.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2011 Vlad Voicu
+ * Licensed under the Simplified BSD license.
+ * See Documentation/Licenses/BSD-simplified.txt for more information.
+ */
+
+#include <vector>
+#include <boost/algorithm/string.hpp>
+#include <boost/tuple/tuple.hpp>
+#include <SwifTools/SpellChecker.h>
+
+#pragma once
+
+class Hunspell;
+
+namespace Swift {
+ class HunspellChecker : public SpellChecker {
+ public:
+ HunspellChecker(const char* affix_path, const char* dict_path);
+ virtual ~HunspellChecker();
+ virtual bool isCorrect(const std::string& word);
+ virtual void getSuggestions(const std::string& word, std::vector<std::string>& list);
+ virtual void checkFragment(const std::string& fragment, PositionPairList& misspelledPositions);
+ private:
+ Hunspell* speller_;
+ };
+}
diff --git a/SwifTools/SConscript b/SwifTools/SConscript
index fa2686a..eaf5787 100644
--- a/SwifTools/SConscript
+++ b/SwifTools/SConscript
@@ -9,6 +9,8 @@ if env["SCONS_STAGE"] == "flags" :
"LIBPATH": [Dir(".")],
"LIBS": ["SwifTools"]
}
+ if env["HAVE_HUNSPELL"] :
+ env.MergeFlags(env["HUNSPELL_FLAGS"])
################################################################################
# Build
@@ -30,6 +32,16 @@ if env["SCONS_STAGE"] == "build" :
"TabComplete.cpp",
"LastLineTracker.cpp",
]
+
+ if swiftools_env["HAVE_HUNSPELL"] :
+ swiftools_env.MergeFlags(swiftools_env["HUNSPELL_FLAGS"])
+ swiftools_env.Append(CPPDEFINES = ["HAVE_HUNSPELL"])
+ sources += [
+ "SpellCheckerFactory.cpp",
+ "HunspellChecker.cpp",
+ "SpellParser.cpp",
+ ]
+
if swiftools_env.get("HAVE_SPARKLE", 0) :
swiftools_env.MergeFlags(swiftools_env["SPARKLE_FLAGS"])
diff --git a/SwifTools/SpellChecker.h b/SwifTools/SpellChecker.h
new file mode 100644
index 0000000..746fcaf
--- /dev/null
+++ b/SwifTools/SpellChecker.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2011 Vlad Voicu
+ * Licensed under the Simplified BSD license.
+ * See Documentation/Licenses/BSD-simplified.txt for more information.
+ */
+
+#include <SwifTools/SpellParser.h>
+
+#include <boost/algorithm/string.hpp>
+#include <boost/tuple/tuple.hpp>
+#include <vector>
+
+#pragma once
+
+namespace Swift {
+ class SpellChecker {
+ public:
+ SpellChecker() {
+ parser_ = new SpellParser();
+ }
+ virtual ~SpellChecker() {
+ delete parser_;
+ };
+ virtual bool isCorrect(const std::string& word) = 0;
+ virtual void getSuggestions(const std::string& word, std::vector<std::string>& list) = 0;
+ virtual void checkFragment(const std::string& fragment, PositionPairList& misspelledPositions) = 0;
+ protected:
+ SpellParser *parser_;
+ };
+}
diff --git a/SwifTools/SpellCheckerFactory.cpp b/SwifTools/SpellCheckerFactory.cpp
new file mode 100644
index 0000000..6061d78
--- /dev/null
+++ b/SwifTools/SpellCheckerFactory.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2011 Vlad Voicu
+ * Licensed under the Simplified BSD license.
+ * See Documentation/Licenses/BSD-simplified.txt for more information.
+ */
+
+#include <boost/filesystem/operations.hpp>
+
+#include <SwifTools/SpellChecker.h>
+#include <SwifTools/HunspellChecker.h>
+#include <SwifTools/SpellCheckerFactory.h>
+
+#ifdef HAVE_HUNSPELL
+#include <hunspell/hunspell.hxx>
+#endif
+
+namespace Swift {
+
+SpellCheckerFactory::SpellCheckerFactory() {
+}
+
+SpellChecker* SpellCheckerFactory::createSpellChecker(const std::string& dictFile) {
+#ifdef HAVE_HUNSPELL
+ std::string affixFile(dictFile);
+ boost::replace_all(affixFile, ".dic", ".aff");
+ if ((boost::filesystem::exists(dictFile)) && (boost::filesystem::exists(affixFile))) {
+ return new HunspellChecker(affixFile.c_str(), dictFile.c_str());
+ }
+ // If dictionaries don't exist disable the checker
+#endif
+ return NULL;
+}
+
+}
diff --git a/SwifTools/SpellCheckerFactory.h b/SwifTools/SpellCheckerFactory.h
new file mode 100644
index 0000000..086ea66
--- /dev/null
+++ b/SwifTools/SpellCheckerFactory.h
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) 2011 Vlad Voicu
+ * Licensed under the Simplified BSD license.
+ * See Documentation/Licenses/BSD-simplified.txt for more information.
+ */
+
+#pragma once
+
+#ifdef HAVE_HUNSPELL
+#define HAVE_SPELLCHECKER
+#endif
+
+namespace Swift {
+ class SpellChecker;
+ class SpellCheckerFactory {
+ public:
+ SpellCheckerFactory();
+ SpellChecker* createSpellChecker(const std::string& dictFile);
+ };
+}
diff --git a/SwifTools/SpellParser.cpp b/SwifTools/SpellParser.cpp
new file mode 100644
index 0000000..7208cdb
--- /dev/null
+++ b/SwifTools/SpellParser.cpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2011 Vlad Voicu
+ * Licensed under the Simplified BSD license.
+ * See Documentation/Licenses/BSD-simplified.txt for more information.
+ */
+
+#include <SwifTools/SpellParser.h>
+
+#include <boost/spirit/include/lex_lexertl.hpp>
+#include <boost/bind.hpp>
+#include <boost/ref.hpp>
+
+#include <string>
+
+namespace lex = boost::spirit::lex;
+
+namespace Swift {
+
+template <typename Lexer>
+struct word_count_tokens : lex::lexer<Lexer>
+{
+ word_count_tokens()
+ {
+ // define tokens (regular expresions) to match strings
+ // order is important
+ this->self.add
+ ("w{3}.[^ ]+", ID_WWW)
+ ("http:\\/\\/[^ ]+", ID_HTTP)
+ ("\\w{1,}['?|\\-?]?\\w{1,}", ID_WORD)
+ (".", ID_CHAR);
+ }
+};
+
+struct counter
+{
+ typedef bool result_type;
+ // the function operator gets called for each of the matched tokens
+ template <typename Token>
+ bool operator()(Token const& t, PositionPairList& wordPositions, std::size_t& position) const
+ {
+ switch (t.id()) {
+ case ID_WWW:
+ position += t.value().size();
+ break;
+ case ID_HTTP:
+ position += t.value().size();
+ break;
+ case ID_WORD: // matched a word
+ wordPositions.push_back(boost::tuples::make_tuple(position, position + t.value().size()));
+ position += t.value().size();
+ break;
+ case ID_CHAR: // match a simple char
+ ++position;
+ break;
+ }
+ return true; // always continue to tokenize
+ }
+};
+
+void SpellParser::check(const std::string& fragment, PositionPairList& wordPositions) {
+ std::size_t position = 0;
+ // create the token definition instance needed to invoke the lexical analyzer
+ word_count_tokens<lex::lexertl::lexer<> > word_count_functor;
+ char const* first = fragment.c_str();
+ char const* last = &first[fragment.size()];
+ lex::tokenize(first, last, word_count_functor, boost::bind(counter(), _1, boost::ref(wordPositions), boost::ref(position)));
+}
+
+}
diff --git a/SwifTools/SpellParser.h b/SwifTools/SpellParser.h
new file mode 100644
index 0000000..a6eafb5
--- /dev/null
+++ b/SwifTools/SpellParser.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2011 Vlad Voicu
+ * Licensed under the Simplified BSD license.
+ * See Documentation/Licenses/BSD-simplified.txt for more information.
+ */
+
+#pragma once
+
+#include <boost/algorithm/string.hpp>
+#include <boost/tuple/tuple.hpp>
+#include <boost/algorithm/string.hpp>
+
+#include <vector>
+
+namespace Swift {
+ enum token_ids
+ {
+ ID_WWW = 1,
+ ID_HTTP = 2,
+ ID_WORD = 3,
+ ID_CHAR = 4,
+ };
+
+ typedef boost::tuple<int, int> PositionPair;
+ typedef std::vector<PositionPair > PositionPairList;
+
+ class SpellParser{
+ public:
+ void check(const std::string& fragment, PositionPairList& wordPositions);
+ };
+}
diff --git a/SwifTools/UnitTest/SConscript b/SwifTools/UnitTest/SConscript
index e469deb..dbd1ce5 100644
--- a/SwifTools/UnitTest/SConscript
+++ b/SwifTools/UnitTest/SConscript
@@ -5,3 +5,8 @@ env.Append(UNITTEST_SOURCES = [
File("TabCompleteTest.cpp"),
File("LastLineTrackerTest.cpp"),
])
+
+if env["HAVE_HUNSPELL"] :
+ env.Append(UNITTEST_SOURCES = [
+ File("SpellParserTest.cpp"),
+ ])
diff --git a/SwifTools/UnitTest/SpellParserTest.cpp b/SwifTools/UnitTest/SpellParserTest.cpp
new file mode 100644
index 0000000..09e686c
--- /dev/null
+++ b/SwifTools/UnitTest/SpellParserTest.cpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2012 Vlad Voicu
+ * Licensed under the Simplified BSD license.
+ * See Documentation/Licenses/BSD-simplified.txt for more information.
+ */
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/extensions/TestFactoryRegistry.h>
+
+#include <boost/algorithm/string.hpp>
+
+#include <SwifTools/SpellParser.h>
+
+using namespace Swift;
+
+class SpellParserTest : public CppUnit::TestFixture {
+ CPPUNIT_TEST_SUITE(SpellParserTest);
+ CPPUNIT_TEST(testSimpleCheckFragment);
+ CPPUNIT_TEST(testWWWCheckFragment);
+ CPPUNIT_TEST_SUITE_END();
+ public:
+ SpellParserTest() {
+ parser_ = new SpellParser();
+ };
+ void tearDown() {
+ position_.clear();
+ }
+ void testSimpleCheckFragment() {
+ parser_->check("fragment test", position_);
+ int size = position_.size();
+ CPPUNIT_ASSERT_EQUAL(2, size);
+ CPPUNIT_ASSERT_EQUAL(0, boost::get<0>(position_.front()));
+ CPPUNIT_ASSERT_EQUAL(8, boost::get<1>(position_.front()));
+ CPPUNIT_ASSERT_EQUAL(9, boost::get<0>(position_.back()));
+ CPPUNIT_ASSERT_EQUAL(13, boost::get<1>(position_.back()));
+ }
+ void testWWWCheckFragment() {
+ parser_->check("www.link.com fragment test", position_);
+ int size = position_.size();
+ CPPUNIT_ASSERT_EQUAL(2, size);
+ CPPUNIT_ASSERT_EQUAL(13, boost::get<0>(position_.front()));
+ CPPUNIT_ASSERT_EQUAL(21, boost::get<1>(position_.front()));
+ CPPUNIT_ASSERT_EQUAL(22, boost::get<0>(position_.back()));
+ CPPUNIT_ASSERT_EQUAL(26, boost::get<1>(position_.back()));
+ }
+ private:
+ SpellParser *parser_;
+ PositionPairList position_;
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(SpellParserTest);