summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
Diffstat (limited to '3rdParty/Boost/src/boost/spirit/home/support/detail/lexer/parser/tokeniser/re_tokeniser.hpp')
-rw-r--r--3rdParty/Boost/src/boost/spirit/home/support/detail/lexer/parser/tokeniser/re_tokeniser.hpp574
1 files changed, 574 insertions, 0 deletions
diff --git a/3rdParty/Boost/src/boost/spirit/home/support/detail/lexer/parser/tokeniser/re_tokeniser.hpp b/3rdParty/Boost/src/boost/spirit/home/support/detail/lexer/parser/tokeniser/re_tokeniser.hpp
new file mode 100644
index 0000000..7bdeb80
--- /dev/null
+++ b/3rdParty/Boost/src/boost/spirit/home/support/detail/lexer/parser/tokeniser/re_tokeniser.hpp
@@ -0,0 +1,574 @@
+// tokeniser.hpp
+// Copyright (c) 2007-2009 Ben Hanson (http://www.benhanson.net/)
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+#ifndef BOOST_LEXER_RE_TOKENISER_HPP
+#define BOOST_LEXER_RE_TOKENISER_HPP
+
+// memcpy()
+#include <cstring>
+#include <map>
+#include "num_token.hpp"
+#include "../../runtime_error.hpp"
+#include "../../size_t.hpp"
+#include <sstream>
+#include "../../string_token.hpp"
+#include "re_tokeniser_helper.hpp"
+
+namespace boost
+{
+namespace lexer
+{
+namespace detail
+{
+template<typename CharT>
+class basic_re_tokeniser
+{
+public:
+ typedef basic_num_token<CharT> num_token;
+ typedef basic_re_tokeniser_state<CharT> state;
+ typedef basic_string_token<CharT> string_token;
+ typedef typename string_token::string string;
+ typedef std::map<string_token, std::size_t> token_map;
+ typedef std::pair<string_token, std::size_t> token_pair;
+
+ static void next (state &state_, token_map &map_, num_token &token_)
+ {
+ CharT ch_ = 0;
+ bool eos_ = state_.next (ch_);
+
+ token_.min_max (0, false, 0);
+
+ while (!eos_ && ch_ == '"')
+ {
+ state_._in_string ^= 1;
+ eos_ = state_.next (ch_);
+ }
+
+ if (eos_)
+ {
+ if (state_._in_string)
+ {
+ throw runtime_error ("Unexpected end of regex "
+ "(missing '\"').");
+ }
+
+ if (state_._paren_count)
+ {
+ throw runtime_error ("Unexpected end of regex "
+ "(missing ')').");
+ }
+
+ token_.set (num_token::END, null_token);
+ }
+ else
+ {
+ if (ch_ == '\\')
+ {
+ // Even if we are in a string, respect escape sequences...
+ escape (state_, map_, token_);
+ }
+ else if (state_._in_string)
+ {
+ // All other meta characters lose their special meaning
+ // inside a string.
+ create_charset_token (string (1, ch_), false, map_, token_);
+ }
+ else
+ {
+ // Not an escape sequence and not inside a string, so
+ // check for meta characters.
+ switch (ch_)
+ {
+ case '(':
+ token_.set (num_token::OPENPAREN, null_token);
+ ++state_._paren_count;
+ read_options (state_);
+ break;
+ case ')':
+ --state_._paren_count;
+
+ if (state_._paren_count < 0)
+ {
+ std::ostringstream ss_;
+
+ ss_ << "Number of open parenthesis < 0 at index " <<
+ state_.index () - 1 << '.';
+ throw runtime_error (ss_.str ().c_str ());
+ }
+
+ token_.set (num_token::CLOSEPAREN, null_token);
+
+ if (!state_._flags_stack.empty ())
+ {
+ state_._flags = state_._flags_stack.top ();
+ state_._flags_stack.pop ();
+ }
+ break;
+ case '?':
+ if (!state_.eos () && *state_._curr == '?')
+ {
+ token_.set (num_token::AOPT, null_token);
+ state_.increment ();
+ }
+ else
+ {
+ token_.set (num_token::OPT, null_token);
+ }
+
+ break;
+ case '*':
+ if (!state_.eos () && *state_._curr == '?')
+ {
+ token_.set (num_token::AZEROORMORE, null_token);
+ state_.increment ();
+ }
+ else
+ {
+ token_.set (num_token::ZEROORMORE, null_token);
+ }
+
+ break;
+ case '+':
+ if (!state_.eos () && *state_._curr == '?')
+ {
+ token_.set (num_token::AONEORMORE, null_token);
+ state_.increment ();
+ }
+ else
+ {
+ token_.set (num_token::ONEORMORE, null_token);
+ }
+
+ break;
+ case '{':
+ open_curly (state_, token_);
+ break;
+ case '|':
+ token_.set (num_token::OR, null_token);
+ break;
+ case '^':
+ if (state_._curr - 1 == state_._start)
+ {
+ token_.set (num_token::CHARSET, bol_token);
+ state_._seen_BOL_assertion = true;
+ }
+ else
+ {
+ create_charset_token (string (1, ch_), false,
+ map_, token_);
+ }
+
+ break;
+ case '$':
+ if (state_._curr == state_._end)
+ {
+ token_.set (num_token::CHARSET, eol_token);
+ state_._seen_EOL_assertion = true;
+ }
+ else
+ {
+ create_charset_token (string (1, ch_), false,
+ map_, token_);
+ }
+
+ break;
+ case '.':
+ {
+ string dot_;
+
+ if (state_._flags & dot_not_newline)
+ {
+ dot_ = '\n';
+ }
+
+ create_charset_token (dot_, true, map_, token_);
+ break;
+ }
+ case '[':
+ {
+ charset (state_, map_, token_);
+ break;
+ }
+ case '/':
+ throw runtime_error("Lookahead ('/') is not supported yet.");
+ break;
+ default:
+ if ((state_._flags & icase) &&
+ (std::isupper (ch_, state_._locale) ||
+ std::islower (ch_, state_._locale)))
+ {
+ CharT upper_ = std::toupper (ch_, state_._locale);
+ CharT lower_ = std::tolower (ch_, state_._locale);
+
+ string str_ (1, upper_);
+
+ str_ += lower_;
+ create_charset_token (str_, false, map_, token_);
+ }
+ else
+ {
+ create_charset_token (string (1, ch_), false,
+ map_, token_);
+ }
+
+ break;
+ }
+ }
+ }
+ }
+
+private:
+ typedef basic_re_tokeniser_helper<CharT> tokeniser_helper;
+
+ static void read_options (state &state_)
+ {
+ if (!state_.eos () && *state_._curr == '?')
+ {
+ CharT ch_ = 0;
+ bool eos_ = false;
+ bool negate_ = false;
+
+ state_.increment ();
+ eos_ = state_.next (ch_);
+ state_._flags_stack.push (state_._flags);
+
+ while (!eos_ && ch_ != ':')
+ {
+ switch (ch_)
+ {
+ case '-':
+ negate_ ^= 1;
+ break;
+ case 'i':
+ if (negate_)
+ {
+ state_._flags = static_cast<regex_flags>
+ (state_._flags & ~icase);
+ }
+ else
+ {
+ state_._flags = static_cast<regex_flags>
+ (state_._flags | icase);
+ }
+
+ negate_ = false;
+ break;
+ case 's':
+ if (negate_)
+ {
+ state_._flags = static_cast<regex_flags>
+ (state_._flags | dot_not_newline);
+ }
+ else
+ {
+ state_._flags = static_cast<regex_flags>
+ (state_._flags & ~dot_not_newline);
+ }
+
+ negate_ = false;
+ break;
+ default:
+ {
+ std::ostringstream ss_;
+
+ ss_ << "Unknown option at index " <<
+ state_.index () - 1 << '.';
+ throw runtime_error (ss_.str ().c_str ());
+ }
+ }
+
+ eos_ = state_.next (ch_);
+ }
+
+ // End of string handler will handle early termination
+ }
+ else if (!state_._flags_stack.empty ())
+ {
+ state_._flags_stack.push (state_._flags);
+ }
+ }
+
+ static void escape (state &state_, token_map &map_, num_token &token_)
+ {
+ CharT ch_ = 0;
+ std::size_t str_len_ = 0;
+ const CharT *str_ = tokeniser_helper::escape_sequence (state_,
+ ch_, str_len_);
+
+ if (str_)
+ {
+ state state2_ (str_ + 1, str_ + str_len_, state_._flags,
+ state_._locale);
+
+ charset (state2_, map_, token_);
+ }
+ else
+ {
+ create_charset_token (string (1, ch_), false, map_, token_);
+ }
+ }
+
+ static void charset (state &state_, token_map &map_, num_token &token_)
+ {
+ string chars_;
+ bool negated_ = false;
+
+ tokeniser_helper::charset (state_, chars_, negated_);
+ create_charset_token (chars_, negated_, map_, token_);
+ }
+
+ static void create_charset_token (const string &charset_,
+ const bool negated_, token_map &map_, num_token &token_)
+ {
+ std::size_t id_ = null_token;
+ string_token stok_ (negated_, charset_);
+
+ stok_.remove_duplicates ();
+ stok_.normalise ();
+
+ typename token_map::const_iterator iter_ = map_.find (stok_);
+
+ if (iter_ == map_.end ())
+ {
+ id_ = map_.size ();
+ map_.insert (token_pair (stok_, id_));
+ }
+ else
+ {
+ id_ = iter_->second;
+ }
+
+ token_.set (num_token::CHARSET, id_);
+ }
+
+ static void open_curly (state &state_, num_token &token_)
+ {
+ if (state_.eos ())
+ {
+ throw runtime_error ("Unexpected end of regex "
+ "(missing '}').");
+ }
+ else if (*state_._curr >= '0' && *state_._curr <= '9')
+ {
+ repeat_n (state_, token_);
+
+ if (!state_.eos () && *state_._curr == '?')
+ {
+ token_._type = num_token::AREPEATN;
+ state_.increment ();
+ }
+ }
+ else
+ {
+ macro (state_, token_);
+ }
+ }
+
+ // SYNTAX:
+ // {n[,[n]]}
+ // SEMANTIC RULES:
+ // {0} - INVALID (throw exception)
+ // {0,} = *
+ // {0,0} - INVALID (throw exception)
+ // {0,1} = ?
+ // {1,} = +
+ // {min,max} where min == max - {min}
+ // {min,max} where max < min - INVALID (throw exception)
+ static void repeat_n (state &state_, num_token &token_)
+ {
+ CharT ch_ = 0;
+ bool eos_ = state_.next (ch_);
+
+ while (!eos_ && ch_ >= '0' && ch_ <= '9')
+ {
+ token_._min *= 10;
+ token_._min += ch_ - '0';
+ eos_ = state_.next (ch_);
+ }
+
+ if (eos_)
+ {
+ throw runtime_error ("Unexpected end of regex "
+ "(missing '}').");
+ }
+
+ bool min_max_ = false;
+ bool repeatn_ = true;
+
+ token_._comma = ch_ == ',';
+
+ if (token_._comma)
+ {
+ eos_ = state_.next (ch_);
+
+ if (eos_)
+ {
+ throw runtime_error ("Unexpected end of regex "
+ "(missing '}').");
+ }
+
+ if (ch_ == '}')
+ {
+ // Small optimisation: Check for '*' equivalency.
+ if (token_._min == 0)
+ {
+ token_.set (num_token::ZEROORMORE, null_token);
+ repeatn_ = false;
+ }
+ // Small optimisation: Check for '+' equivalency.
+ else if (token_._min == 1)
+ {
+ token_.set (num_token::ONEORMORE, null_token);
+ repeatn_ = false;
+ }
+ }
+ else
+ {
+ if (ch_ < '0' || ch_ > '9')
+ {
+ std::ostringstream ss_;
+
+ ss_ << "Missing '}' at index " <<
+ state_.index () - 1 << '.';
+ throw runtime_error (ss_.str ().c_str ());
+ }
+
+ min_max_ = true;
+
+ do
+ {
+ token_._max *= 10;
+ token_._max += ch_ - '0';
+ eos_ = state_.next (ch_);
+ } while (!eos_ && ch_ >= '0' && ch_ <= '9');
+
+ if (eos_)
+ {
+ throw runtime_error ("Unexpected end of regex "
+ "(missing '}').");
+ }
+
+ // Small optimisation: Check for '?' equivalency.
+ if (token_._min == 0 && token_._max == 1)
+ {
+ token_.set (num_token::OPT, null_token);
+ repeatn_ = false;
+ }
+ // Small optimisation: if min == max, then min.
+ else if (token_._min == token_._max)
+ {
+ token_._comma = false;
+ min_max_ = false;
+ token_._max = 0;
+ }
+ }
+ }
+
+ if (ch_ != '}')
+ {
+ std::ostringstream ss_;
+
+ ss_ << "Missing '}' at index " << state_.index () - 1 << '.';
+ throw runtime_error (ss_.str ().c_str ());
+ }
+
+ if (repeatn_)
+ {
+ // SEMANTIC VALIDATION follows:
+ // NOTE: {0,} has already become *
+ // therefore we don't check for a comma.
+ if (token_._min == 0 && token_._max == 0)
+ {
+ std::ostringstream ss_;
+
+ ss_ << "Cannot have exactly zero repeats preceding index " <<
+ state_.index () << '.';
+ throw runtime_error (ss_.str ().c_str ());
+ }
+
+ if (min_max_ && token_._max < token_._min)
+ {
+ std::ostringstream ss_;
+
+ ss_ << "Max less than min preceding index " <<
+ state_.index () << '.';
+ throw runtime_error (ss_.str ().c_str ());
+ }
+
+ token_.set (num_token::REPEATN, null_token);
+ }
+ }
+
+ static void macro (state &state_, num_token &token_)
+ {
+ CharT ch_ = 0;
+ bool eos_ = false;
+ const CharT *start_ = state_._curr;
+
+ state_.next (ch_);
+
+ if (ch_ != '_' && !(ch_ >= 'A' && ch_ <= 'Z') &&
+ !(ch_ >= 'a' && ch_ <= 'z'))
+ {
+ std::ostringstream ss_;
+
+ ss_ << "Invalid MACRO name at index " <<
+ state_.index () - 1 << '.';
+ throw runtime_error (ss_.str ().c_str ());
+ }
+
+ do
+ {
+ eos_ = state_.next (ch_);
+
+ if (eos_)
+ {
+ throw runtime_error ("Unexpected end of regex "
+ "(missing '}').");
+ }
+ } while (ch_ == '_' || ch_ == '-' || (ch_ >= 'A' && ch_ <= 'Z') ||
+ (ch_ >= 'a' && ch_ <= 'z') || (ch_ >= '0' && ch_ <= '9'));
+
+ if (ch_ != '}')
+ {
+ std::ostringstream ss_;
+
+ ss_ << "Missing '}' at index " << state_.index () - 1 << '.';
+ throw runtime_error (ss_.str ().c_str ());
+ }
+
+ std::size_t len_ = state_._curr - 1 - start_;
+
+ if (len_ > max_macro_len)
+ {
+ std::basic_stringstream<CharT> ss_;
+ std::ostringstream os_;
+
+ os_ << "MACRO name '";
+
+ while (len_)
+ {
+ os_ << ss_.narrow (*start_++, ' ');
+ --len_;
+ }
+
+ os_ << "' too long.";
+ throw runtime_error (os_.str ());
+ }
+
+ token_.set (num_token::MACRO, null_token);
+
+ // Some systems have memcpy in namespace std.
+ using namespace std;
+
+ memcpy (token_._macro, start_, len_ * sizeof (CharT));
+ token_._macro[len_] = 0;
+ }
+};
+}
+}
+}
+
+#endif