diff options
author | Vlad Voicu <vladvoic@gmail.com> | 2011-11-28 16:37:32 (GMT) |
---|---|---|
committer | Vlad Voicu <vladvoic@gmail.com> | 2013-03-15 09:21:52 (GMT) |
commit | 2061b06eccca67595c50edd81c44c5b961bf108b (patch) | |
tree | 7fdc9e4cc80a9d8ddbe5364a531ef3449f72ab2b /SwifTools | |
parent | a069a0df0f51a948a86e34d99f952a33eecd97ba (diff) | |
download | swift-2061b06eccca67595c50edd81c44c5b961bf108b.zip swift-2061b06eccca67595c50edd81c44c5b961bf108b.tar.bz2 |
Spell checker implementation using Hunspell
Change-Id: Ia15b6532edf6eef7c45bdfb273e77f65ce998f13
License: This patch is BSD-licensed, see Documentation/Licenses/BSD-simplified.txt for details
Diffstat (limited to 'SwifTools')
-rw-r--r-- | SwifTools/HunspellChecker.cpp | 55 | ||||
-rw-r--r-- | SwifTools/HunspellChecker.h | 27 | ||||
-rw-r--r-- | SwifTools/SConscript | 12 | ||||
-rw-r--r-- | SwifTools/SpellChecker.h | 30 | ||||
-rw-r--r-- | SwifTools/SpellCheckerFactory.cpp | 34 | ||||
-rw-r--r-- | SwifTools/SpellCheckerFactory.h | 20 | ||||
-rw-r--r-- | SwifTools/SpellParser.cpp | 69 | ||||
-rw-r--r-- | SwifTools/SpellParser.h | 31 | ||||
-rw-r--r-- | SwifTools/UnitTest/SConscript | 5 | ||||
-rw-r--r-- | SwifTools/UnitTest/SpellParserTest.cpp | 51 |
10 files changed, 334 insertions, 0 deletions
diff --git a/SwifTools/HunspellChecker.cpp b/SwifTools/HunspellChecker.cpp new file mode 100644 index 0000000..ecd352e --- /dev/null +++ b/SwifTools/HunspellChecker.cpp @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2011 Vlad Voicu + * Licensed under the Simplified BSD license. + * See Documentation/Licenses/BSD-simplified.txt for more information. + */ + +#include <SwifTools/HunspellChecker.h> + +#include <algorithm> +#include <cstdlib> +#include <hunspell/hunspell.hxx> +#include <boost/algorithm/string.hpp> + +namespace Swift { + +HunspellChecker::HunspellChecker(const char* affix_path, const char* dictionary_path) { /* builds the Hunspell engine from an affix file and a dictionary file */ + speller_ = new Hunspell(affix_path, dictionary_path); +} + +HunspellChecker::~HunspellChecker() { + delete speller_; +} + +bool HunspellChecker::isCorrect(const std::string& word) { /* true when Hunspell's spell() returns non-zero for word */ + return speller_->spell(word.c_str()); +} + +void HunspellChecker::getSuggestions(const std::string& word, std::vector<std::string>& list) { /* appends Hunspell's suggestions for word to list; an empty word appends nothing */ + char **suggestList = NULL; /* stays NULL for an empty word, which makes the final free() a no-op */ + int words_returned = 0; /* stays 0 for an empty word, so the copy loop is skipped */ + if (!word.empty()) { + words_returned = speller_->suggest(&suggestList, word.c_str()); + } + for (int i = 0; i < words_returned; ++i) { + list.push_back(suggestList[i]); + free(suggestList[i]); + } + free(suggestList); +} + +void HunspellChecker::checkFragment(const std::string& fragment, PositionPairList& misspelledPositions) { /* collects the parser's word ranges, then erases every range whose text Hunspell accepts */ + if (!fragment.empty()) { + parser_->check(fragment, misspelledPositions); + for (PositionPairList::iterator it = misspelledPositions.begin(); it != misspelledPositions.end();) { + if (isCorrect(fragment.substr(boost::get<0>(*it), boost::get<1>(*it) - boost::get<0>(*it)))) { + it = misspelledPositions.erase(it); + } + else { + ++it; + } + } + } +} + +} diff --git a/SwifTools/HunspellChecker.h b/SwifTools/HunspellChecker.h new file mode 100644 index 0000000..12c0485 --- /dev/null +++ b/SwifTools/HunspellChecker.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2011 Vlad Voicu + * Licensed under the Simplified BSD license. + * See Documentation/Licenses/BSD-simplified.txt for more information. 
+ */ + +#pragma once + +#include <vector> +#include <boost/algorithm/string.hpp> +#include <boost/tuple/tuple.hpp> +#include <SwifTools/SpellChecker.h> + +class Hunspell; + +namespace Swift { + class HunspellChecker : public SpellChecker { /* SpellChecker backed by a Hunspell instance; NOTE(review): owns speller_ through a raw pointer yet stays copyable, so a copy would delete it twice */ + public: + HunspellChecker(const char* affix_path, const char* dict_path); + virtual ~HunspellChecker(); + virtual bool isCorrect(const std::string& word); + virtual void getSuggestions(const std::string& word, std::vector<std::string>& list); + virtual void checkFragment(const std::string& fragment, PositionPairList& misspelledPositions); + private: + Hunspell* speller_; /* created in the constructor, deleted in the destructor */ + }; +} diff --git a/SwifTools/SConscript b/SwifTools/SConscript index fa2686a..eaf5787 100644 --- a/SwifTools/SConscript +++ b/SwifTools/SConscript @@ -9,6 +9,8 @@ if env["SCONS_STAGE"] == "flags" : "LIBPATH": [Dir(".")], "LIBS": ["SwifTools"] } + if env.get("HAVE_HUNSPELL", 0) : + env.MergeFlags(env["HUNSPELL_FLAGS"]) ################################################################################ # Build @@ -30,6 +32,16 @@ if env["SCONS_STAGE"] == "build" : "TabComplete.cpp", "LastLineTracker.cpp", ] + + if swiftools_env.get("HAVE_HUNSPELL", 0) : + swiftools_env.MergeFlags(swiftools_env["HUNSPELL_FLAGS"]) + swiftools_env.Append(CPPDEFINES = ["HAVE_HUNSPELL"]) + sources += [ + "SpellCheckerFactory.cpp", + "HunspellChecker.cpp", + "SpellParser.cpp", + ] + if swiftools_env.get("HAVE_SPARKLE", 0) : swiftools_env.MergeFlags(swiftools_env["SPARKLE_FLAGS"]) diff --git a/SwifTools/SpellChecker.h b/SwifTools/SpellChecker.h new file mode 100644 index 0000000..746fcaf --- /dev/null +++ b/SwifTools/SpellChecker.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2011 Vlad Voicu + * Licensed under the Simplified BSD license. + * See Documentation/Licenses/BSD-simplified.txt for more information. 
+ */ + +#pragma once + +#include <SwifTools/SpellParser.h> + +#include <boost/algorithm/string.hpp> +#include <boost/tuple/tuple.hpp> +#include <vector> + +namespace Swift { + class SpellChecker { /* abstract spell-checking interface; NOTE(review): owns parser_ through a raw pointer yet stays copyable, so a copy would delete it twice */ + public: + SpellChecker() { + parser_ = new SpellParser(); + } + virtual ~SpellChecker() { + delete parser_; + } + virtual bool isCorrect(const std::string& word) = 0; + virtual void getSuggestions(const std::string& word, std::vector<std::string>& list) = 0; + virtual void checkFragment(const std::string& fragment, PositionPairList& misspelledPositions) = 0; + protected: + SpellParser *parser_; /* allocated in the constructor, released in the destructor */ + }; +} diff --git a/SwifTools/SpellCheckerFactory.cpp b/SwifTools/SpellCheckerFactory.cpp new file mode 100644 index 0000000..6061d78 --- /dev/null +++ b/SwifTools/SpellCheckerFactory.cpp @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2011 Vlad Voicu + * Licensed under the Simplified BSD license. + * See Documentation/Licenses/BSD-simplified.txt for more information. + */ + +#include <boost/filesystem/operations.hpp> + +#include <SwifTools/SpellChecker.h> +#include <SwifTools/HunspellChecker.h> +#include <SwifTools/SpellCheckerFactory.h> + +#ifdef HAVE_HUNSPELL +#include <hunspell/hunspell.hxx> +#endif + +namespace Swift { + +SpellCheckerFactory::SpellCheckerFactory() { +} + +SpellChecker* SpellCheckerFactory::createSpellChecker(const std::string& dictFile) { /* returns a new HunspellChecker for dictFile and its sibling .aff file, or NULL when Hunspell support is compiled out or either file is unusable; NOTE(review): the result is a raw new'ed pointer, presumably deleted by the caller */ +#ifdef HAVE_HUNSPELL + std::string affixFile(dictFile); + boost::replace_last(affixFile, ".dic", ".aff"); /* only the final ".dic" is the extension; earlier matches belong to directory names */ + if ((affixFile != dictFile) && (boost::filesystem::exists(dictFile)) && (boost::filesystem::exists(affixFile))) { /* identical paths mean no ".dic" was found, so there is no affix file to load */ + return new HunspellChecker(affixFile.c_str(), dictFile.c_str()); + } + // If dictionaries don't exist disable the checker +#endif + return NULL; +} + +} diff --git a/SwifTools/SpellCheckerFactory.h b/SwifTools/SpellCheckerFactory.h new file mode 100644 index 0000000..086ea66 --- /dev/null +++ b/SwifTools/SpellCheckerFactory.h @@ -0,0 +1,20 @@ +/* + * Copyright (c) 2011 Vlad Voicu + * Licensed under the Simplified BSD license. 
+ * See Documentation/Licenses/BSD-simplified.txt for more information. + */ + +#pragma once +#include <string> + +#ifdef HAVE_HUNSPELL +#define HAVE_SPELLCHECKER +#endif + +namespace Swift { + class SpellChecker; + class SpellCheckerFactory { /* creates the SpellChecker implementation selected at build time */ + public: + SpellCheckerFactory(); + SpellChecker* createSpellChecker(const std::string& dictFile); + }; +} diff --git a/SwifTools/SpellParser.cpp b/SwifTools/SpellParser.cpp new file mode 100644 index 0000000..7208cdb --- /dev/null +++ b/SwifTools/SpellParser.cpp @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2011 Vlad Voicu + * Licensed under the Simplified BSD license. + * See Documentation/Licenses/BSD-simplified.txt for more information. + */ + +#include <SwifTools/SpellParser.h> + +#include <boost/spirit/include/lex_lexertl.hpp> +#include <boost/bind.hpp> +#include <boost/ref.hpp> + +#include <string> + +namespace lex = boost::spirit::lex; + +namespace Swift { + +template <typename Lexer> +struct word_count_tokens : lex::lexer<Lexer> +{ + word_count_tokens() + { + // define tokens (regular expressions) to match strings + // order is important + this->self.add + ("w{3}\\.[^ ]+", ID_WWW) /* the dot is escaped so ordinary words that merely start with "www" are still checked */ + ("https?:\\/\\/[^ ]+", ID_HTTP) /* covers both http:// and https:// links */ + ("\\w{1,}['\\-]?\\w{1,}", ID_WORD) /* at least two word characters, optionally joined by one apostrophe or hyphen */ + (".", ID_CHAR); + } +}; + +struct counter /* tokenize() callback: records the offsets of word tokens and advances the running offset past every token */ +{ + typedef bool result_type; + // the function operator gets called for each of the matched tokens + template <typename Token> + bool operator()(Token const& t, PositionPairList& wordPositions, std::size_t& position) const + { + switch (t.id()) { + case ID_WWW: + position += t.value().size(); + break; + case ID_HTTP: + position += t.value().size(); + break; + case ID_WORD: // matched a word + wordPositions.push_back(boost::tuples::make_tuple(position, position + t.value().size())); + position += t.value().size(); + break; + case ID_CHAR: // match a simple char + ++position; + break; + } + return true; // always continue to tokenize + } +}; + +void SpellParser::check(const std::string& fragment, PositionPairList& wordPositions) { /* appends one (start, end) offset pair to wordPositions for every word token found in fragment; links and single characters are skipped */ + std::size_t position 
= 0; + // create the token definition instance needed to invoke the lexical analyzer + word_count_tokens<lex::lexertl::lexer<> > word_count_functor; + char const* first = fragment.c_str(); + char const* last = &first[fragment.size()]; + lex::tokenize(first, last, word_count_functor, boost::bind(counter(), _1, boost::ref(wordPositions), boost::ref(position))); +} + +} diff --git a/SwifTools/SpellParser.h b/SwifTools/SpellParser.h new file mode 100644 index 0000000..a6eafb5 --- /dev/null +++ b/SwifTools/SpellParser.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2011 Vlad Voicu + * Licensed under the Simplified BSD license. + * See Documentation/Licenses/BSD-simplified.txt for more information. + */ + +#pragma once + +#include <boost/algorithm/string.hpp> +#include <boost/tuple/tuple.hpp> +#include <string> + +#include <vector> + +namespace Swift { + enum token_ids /* ids attached to the lexer token definitions in SpellParser.cpp */ + { + ID_WWW = 1, + ID_HTTP = 2, + ID_WORD = 3, + ID_CHAR = 4, + }; + + typedef boost::tuple<int, int> PositionPair; /* (start, end) offsets of a word in the checked fragment; end is one past the last character */ + typedef std::vector<PositionPair > PositionPairList; + + class SpellParser{ /* splits a text fragment into the word ranges that should be spell checked */ + public: + void check(const std::string& fragment, PositionPairList& wordPositions); + }; +} diff --git a/SwifTools/UnitTest/SConscript b/SwifTools/UnitTest/SConscript index e469deb..dbd1ce5 100644 --- a/SwifTools/UnitTest/SConscript +++ b/SwifTools/UnitTest/SConscript @@ -5,3 +5,8 @@ env.Append(UNITTEST_SOURCES = [ File("TabCompleteTest.cpp"), File("LastLineTrackerTest.cpp"), ]) + +if env.get("HAVE_HUNSPELL", 0) : + env.Append(UNITTEST_SOURCES = [ + File("SpellParserTest.cpp"), + ]) diff --git a/SwifTools/UnitTest/SpellParserTest.cpp b/SwifTools/UnitTest/SpellParserTest.cpp new file mode 100644 index 0000000..09e686c --- /dev/null +++ b/SwifTools/UnitTest/SpellParserTest.cpp @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2012 Vlad Voicu + * Licensed under the Simplified BSD license. + * See Documentation/Licenses/BSD-simplified.txt for more information. 
+ */ + +#include <cppunit/extensions/HelperMacros.h> +#include <cppunit/extensions/TestFactoryRegistry.h> + +#include <boost/algorithm/string.hpp> + +#include <SwifTools/SpellParser.h> + +using namespace Swift; + +class SpellParserTest : public CppUnit::TestFixture { /* exercises SpellParser::check and asserts the reported word offsets */ + CPPUNIT_TEST_SUITE(SpellParserTest); + CPPUNIT_TEST(testSimpleCheckFragment); + CPPUNIT_TEST(testWWWCheckFragment); + CPPUNIT_TEST_SUITE_END(); + public: + void setUp() { /* a fresh parser per test, released again in tearDown */ + parser_ = new SpellParser(); + } + void tearDown() { + delete parser_; position_.clear(); + } + void testSimpleCheckFragment() { /* two plain words: expects ranges [0, 8) and [9, 13) */ + parser_->check("fragment test", position_); + int size = position_.size(); + CPPUNIT_ASSERT_EQUAL(2, size); + CPPUNIT_ASSERT_EQUAL(0, boost::get<0>(position_.front())); + CPPUNIT_ASSERT_EQUAL(8, boost::get<1>(position_.front())); + CPPUNIT_ASSERT_EQUAL(9, boost::get<0>(position_.back())); + CPPUNIT_ASSERT_EQUAL(13, boost::get<1>(position_.back())); + } + void testWWWCheckFragment() { /* the leading www link is skipped, so the first reported word starts at offset 13 */ + parser_->check("www.link.com fragment test", position_); + int size = position_.size(); + CPPUNIT_ASSERT_EQUAL(2, size); + CPPUNIT_ASSERT_EQUAL(13, boost::get<0>(position_.front())); + CPPUNIT_ASSERT_EQUAL(21, boost::get<1>(position_.front())); + CPPUNIT_ASSERT_EQUAL(22, boost::get<0>(position_.back())); + CPPUNIT_ASSERT_EQUAL(26, boost::get<1>(position_.back())); + } + private: + SpellParser *parser_; + PositionPairList position_; +}; + +CPPUNIT_TEST_SUITE_REGISTRATION(SpellParserTest); |