diff options
Diffstat (limited to '3rdParty/Boost/src/boost/spirit/home/support/detail/lexer/parser/tokeniser/re_tokeniser.hpp')
-rw-r--r-- | 3rdParty/Boost/src/boost/spirit/home/support/detail/lexer/parser/tokeniser/re_tokeniser.hpp | 574 |
1 files changed, 574 insertions, 0 deletions
diff --git a/3rdParty/Boost/src/boost/spirit/home/support/detail/lexer/parser/tokeniser/re_tokeniser.hpp b/3rdParty/Boost/src/boost/spirit/home/support/detail/lexer/parser/tokeniser/re_tokeniser.hpp new file mode 100644 index 0000000..7bdeb80 --- /dev/null +++ b/3rdParty/Boost/src/boost/spirit/home/support/detail/lexer/parser/tokeniser/re_tokeniser.hpp @@ -0,0 +1,574 @@ +// tokeniser.hpp +// Copyright (c) 2007-2009 Ben Hanson (http://www.benhanson.net/) +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +#ifndef BOOST_LEXER_RE_TOKENISER_HPP +#define BOOST_LEXER_RE_TOKENISER_HPP + +// memcpy() +#include <cstring> +#include <map> +#include "num_token.hpp" +#include "../../runtime_error.hpp" +#include "../../size_t.hpp" +#include <sstream> +#include "../../string_token.hpp" +#include "re_tokeniser_helper.hpp" + +namespace boost +{ +namespace lexer +{ +namespace detail +{ +template<typename CharT> +class basic_re_tokeniser +{ +public: + typedef basic_num_token<CharT> num_token; + typedef basic_re_tokeniser_state<CharT> state; + typedef basic_string_token<CharT> string_token; + typedef typename string_token::string string; + typedef std::map<string_token, std::size_t> token_map; + typedef std::pair<string_token, std::size_t> token_pair; + + static void next (state &state_, token_map &map_, num_token &token_) + { + CharT ch_ = 0; + bool eos_ = state_.next (ch_); + + token_.min_max (0, false, 0); + + while (!eos_ && ch_ == '"') + { + state_._in_string ^= 1; + eos_ = state_.next (ch_); + } + + if (eos_) + { + if (state_._in_string) + { + throw runtime_error ("Unexpected end of regex " + "(missing '\"')."); + } + + if (state_._paren_count) + { + throw runtime_error ("Unexpected end of regex " + "(missing ')')."); + } + + token_.set (num_token::END, null_token); + } + else + { + if (ch_ == '\\') + { + // Even if we are in a string, respect escape sequences... + escape (state_, map_, token_); + } + else if (state_._in_string) + { + // All other meta characters lose their special meaning + // inside a string. + create_charset_token (string (1, ch_), false, map_, token_); + } + else + { + // Not an escape sequence and not inside a string, so + // check for meta characters. + switch (ch_) + { + case '(': + token_.set (num_token::OPENPAREN, null_token); + ++state_._paren_count; + read_options (state_); + break; + case ')': + --state_._paren_count; + + if (state_._paren_count < 0) + { + std::ostringstream ss_; + + ss_ << "Number of open parenthesis < 0 at index " << + state_.index () - 1 << '.'; + throw runtime_error (ss_.str ().c_str ()); + } + + token_.set (num_token::CLOSEPAREN, null_token); + + if (!state_._flags_stack.empty ()) + { + state_._flags = state_._flags_stack.top (); + state_._flags_stack.pop (); + } + break; + case '?': + if (!state_.eos () && *state_._curr == '?') + { + token_.set (num_token::AOPT, null_token); + state_.increment (); + } + else + { + token_.set (num_token::OPT, null_token); + } + + break; + case '*': + if (!state_.eos () && *state_._curr == '?') + { + token_.set (num_token::AZEROORMORE, null_token); + state_.increment (); + } + else + { + token_.set (num_token::ZEROORMORE, null_token); + } + + break; + case '+': + if (!state_.eos () && *state_._curr == '?') + { + token_.set (num_token::AONEORMORE, null_token); + state_.increment (); + } + else + { + token_.set (num_token::ONEORMORE, null_token); + } + + break; + case '{': + open_curly (state_, token_); + break; + case '|': + token_.set (num_token::OR, null_token); + break; + case '^': + if (state_._curr - 1 == state_._start) + { + token_.set (num_token::CHARSET, bol_token); + state_._seen_BOL_assertion = true; + } + else + { + create_charset_token (string (1, ch_), false, + map_, token_); + } + + break; + case '$': + if (state_._curr == state_._end) + { + token_.set (num_token::CHARSET, eol_token); + state_._seen_EOL_assertion = true; + } + else + { + create_charset_token (string (1, ch_), false, + map_, token_); + } + + break; + case '.': + { + string dot_; + + if (state_._flags & dot_not_newline) + { + dot_ = '\n'; + } + + create_charset_token (dot_, true, map_, token_); + break; + } + case '[': + { + charset (state_, map_, token_); + break; + } + case '/': + throw runtime_error("Lookahead ('/') is not supported yet."); + break; + default: + if ((state_._flags & icase) && + (std::isupper (ch_, state_._locale) || + std::islower (ch_, state_._locale))) + { + CharT upper_ = std::toupper (ch_, state_._locale); + CharT lower_ = std::tolower (ch_, state_._locale); + + string str_ (1, upper_); + + str_ += lower_; + create_charset_token (str_, false, map_, token_); + } + else + { + create_charset_token (string (1, ch_), false, + map_, token_); + } + + break; + } + } + } + } + +private: + typedef basic_re_tokeniser_helper<CharT> tokeniser_helper; + + static void read_options (state &state_) + { + if (!state_.eos () && *state_._curr == '?') + { + CharT ch_ = 0; + bool eos_ = false; + bool negate_ = false; + + state_.increment (); + eos_ = state_.next (ch_); + state_._flags_stack.push (state_._flags); + + while (!eos_ && ch_ != ':') + { + switch (ch_) + { + case '-': + negate_ ^= 1; + break; + case 'i': + if (negate_) + { + state_._flags = static_cast<regex_flags> + (state_._flags & ~icase); + } + else + { + state_._flags = static_cast<regex_flags> + (state_._flags | icase); + } + + negate_ = false; + break; + case 's': + if (negate_) + { + state_._flags = static_cast<regex_flags> + (state_._flags | dot_not_newline); + } + else + { + state_._flags = static_cast<regex_flags> + (state_._flags & ~dot_not_newline); + } + + negate_ = false; + break; + default: + { + std::ostringstream ss_; + + ss_ << "Unknown option at index " << + state_.index () - 1 << '.'; + throw runtime_error (ss_.str ().c_str ()); + } + } + + eos_ = state_.next (ch_); + } + + // End of string handler will handle early termination + } + else if (!state_._flags_stack.empty ()) + { + state_._flags_stack.push (state_._flags); + } + } + + static void escape (state &state_, token_map &map_, num_token &token_) + { + CharT ch_ = 0; + std::size_t str_len_ = 0; + const CharT *str_ = tokeniser_helper::escape_sequence (state_, + ch_, str_len_); + + if (str_) + { + state state2_ (str_ + 1, str_ + str_len_, state_._flags, + state_._locale); + + charset (state2_, map_, token_); + } + else + { + create_charset_token (string (1, ch_), false, map_, token_); + } + } + + static void charset (state &state_, token_map &map_, num_token &token_) + { + string chars_; + bool negated_ = false; + + tokeniser_helper::charset (state_, chars_, negated_); + create_charset_token (chars_, negated_, map_, token_); + } + + static void create_charset_token (const string &charset_, + const bool negated_, token_map &map_, num_token &token_) + { + std::size_t id_ = null_token; + string_token stok_ (negated_, charset_); + + stok_.remove_duplicates (); + stok_.normalise (); + + typename token_map::const_iterator iter_ = map_.find (stok_); + + if (iter_ == map_.end ()) + { + id_ = map_.size (); + map_.insert (token_pair (stok_, id_)); + } + else + { + id_ = iter_->second; + } + + token_.set (num_token::CHARSET, id_); + } + + static void open_curly (state &state_, num_token &token_) + { + if (state_.eos ()) + { + throw runtime_error ("Unexpected end of regex " + "(missing '}')."); + } + else if (*state_._curr >= '0' && *state_._curr <= '9') + { + repeat_n (state_, token_); + + if (!state_.eos () && *state_._curr == '?') + { + token_._type = num_token::AREPEATN; + state_.increment (); + } + } + else + { + macro (state_, token_); + } + } + + // SYNTAX: + // {n[,[n]]} + // SEMANTIC RULES: + // {0} - INVALID (throw exception) + // {0,} = * + // {0,0} - INVALID (throw exception) + // {0,1} = ? + // {1,} = + + // {min,max} where min == max - {min} + // {min,max} where max < min - INVALID (throw exception) + static void repeat_n (state &state_, num_token &token_) + { + CharT ch_ = 0; + bool eos_ = state_.next (ch_); + + while (!eos_ && ch_ >= '0' && ch_ <= '9') + { + token_._min *= 10; + token_._min += ch_ - '0'; + eos_ = state_.next (ch_); + } + + if (eos_) + { + throw runtime_error ("Unexpected end of regex " + "(missing '}')."); + } + + bool min_max_ = false; + bool repeatn_ = true; + + token_._comma = ch_ == ','; + + if (token_._comma) + { + eos_ = state_.next (ch_); + + if (eos_) + { + throw runtime_error ("Unexpected end of regex " + "(missing '}')."); + } + + if (ch_ == '}') + { + // Small optimisation: Check for '*' equivalency. + if (token_._min == 0) + { + token_.set (num_token::ZEROORMORE, null_token); + repeatn_ = false; + } + // Small optimisation: Check for '+' equivalency. + else if (token_._min == 1) + { + token_.set (num_token::ONEORMORE, null_token); + repeatn_ = false; + } + } + else + { + if (ch_ < '0' || ch_ > '9') + { + std::ostringstream ss_; + + ss_ << "Missing '}' at index " << + state_.index () - 1 << '.'; + throw runtime_error (ss_.str ().c_str ()); + } + + min_max_ = true; + + do + { + token_._max *= 10; + token_._max += ch_ - '0'; + eos_ = state_.next (ch_); + } while (!eos_ && ch_ >= '0' && ch_ <= '9'); + + if (eos_) + { + throw runtime_error ("Unexpected end of regex " + "(missing '}')."); + } + + // Small optimisation: Check for '?' equivalency. + if (token_._min == 0 && token_._max == 1) + { + token_.set (num_token::OPT, null_token); + repeatn_ = false; + } + // Small optimisation: if min == max, then min. + else if (token_._min == token_._max) + { + token_._comma = false; + min_max_ = false; + token_._max = 0; + } + } + } + + if (ch_ != '}') + { + std::ostringstream ss_; + + ss_ << "Missing '}' at index " << state_.index () - 1 << '.'; + throw runtime_error (ss_.str ().c_str ()); + } + + if (repeatn_) + { + // SEMANTIC VALIDATION follows: + // NOTE: {0,} has already become * + // therefore we don't check for a comma. + if (token_._min == 0 && token_._max == 0) + { + std::ostringstream ss_; + + ss_ << "Cannot have exactly zero repeats preceding index " << + state_.index () << '.'; + throw runtime_error (ss_.str ().c_str ()); + } + + if (min_max_ && token_._max < token_._min) + { + std::ostringstream ss_; + + ss_ << "Max less than min preceding index " << + state_.index () << '.'; + throw runtime_error (ss_.str ().c_str ()); + } + + token_.set (num_token::REPEATN, null_token); + } + } + + static void macro (state &state_, num_token &token_) + { + CharT ch_ = 0; + bool eos_ = false; + const CharT *start_ = state_._curr; + + state_.next (ch_); + + if (ch_ != '_' && !(ch_ >= 'A' && ch_ <= 'Z') && + !(ch_ >= 'a' && ch_ <= 'z')) + { + std::ostringstream ss_; + + ss_ << "Invalid MACRO name at index " << + state_.index () - 1 << '.'; + throw runtime_error (ss_.str ().c_str ()); + } + + do + { + eos_ = state_.next (ch_); + + if (eos_) + { + throw runtime_error ("Unexpected end of regex " + "(missing '}')."); + } + } while (ch_ == '_' || ch_ == '-' || (ch_ >= 'A' && ch_ <= 'Z') || + (ch_ >= 'a' && ch_ <= 'z') || (ch_ >= '0' && ch_ <= '9')); + + if (ch_ != '}') + { + std::ostringstream ss_; + + ss_ << "Missing '}' at index " << state_.index () - 1 << '.'; + throw runtime_error (ss_.str ().c_str ()); + } + + std::size_t len_ = state_._curr - 1 - start_; + + if (len_ > max_macro_len) + { + std::basic_stringstream<CharT> ss_; + std::ostringstream os_; + + os_ << "MACRO name '"; + + while (len_) + { + os_ << ss_.narrow (*start_++, ' '); + --len_; + } + + os_ << "' too long."; + throw runtime_error (os_.str ()); + } + + token_.set (num_token::MACRO, null_token); + + // Some systems have memcpy in namespace std. + using namespace std; + + memcpy (token_._macro, start_, len_ * sizeof (CharT)); + token_._macro[len_] = 0; + } +}; +} +} +} + +#endif |