// tokeniser.hpp // Copyright (c) 2007-2009 Ben Hanson (http://www.benhanson.net/) // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) #ifndef BOOST_LEXER_RE_TOKENISER_HPP #define BOOST_LEXER_RE_TOKENISER_HPP // memcpy() #include #include #include "num_token.hpp" #include "../../runtime_error.hpp" #include "../../size_t.hpp" #include #include "../../string_token.hpp" #include "re_tokeniser_helper.hpp" namespace boost { namespace lexer { namespace detail { template class basic_re_tokeniser { public: typedef basic_num_token num_token; typedef basic_re_tokeniser_state state; typedef basic_string_token string_token; typedef typename string_token::string string; typedef std::map token_map; typedef std::pair token_pair; static void next (state &state_, token_map &map_, num_token &token_) { CharT ch_ = 0; bool eos_ = state_.next (ch_); token_.min_max (0, false, 0); while (!eos_ && ch_ == '"') { state_._in_string ^= 1; eos_ = state_.next (ch_); } if (eos_) { if (state_._in_string) { throw runtime_error ("Unexpected end of regex " "(missing '\"')."); } if (state_._paren_count) { throw runtime_error ("Unexpected end of regex " "(missing ')')."); } token_.set (num_token::END, null_token); } else { if (ch_ == '\\') { // Even if we are in a string, respect escape sequences... escape (state_, map_, token_); } else if (state_._in_string) { // All other meta characters lose their special meaning // inside a string. create_charset_token (string (1, ch_), false, map_, token_); } else { // Not an escape sequence and not inside a string, so // check for meta characters. switch (ch_) { case '(': token_.set (num_token::OPENPAREN, null_token); ++state_._paren_count; read_options (state_); break; case ')': --state_._paren_count; if (state_._paren_count < 0) { std::ostringstream ss_; ss_ << "Number of open parenthesis < 0 at index " << state_.index () - 1 << '.'; throw runtime_error (ss_.str ().c_str ()); } token_.set (num_token::CLOSEPAREN, null_token); if (!state_._flags_stack.empty ()) { state_._flags = state_._flags_stack.top (); state_._flags_stack.pop (); } break; case '?': if (!state_.eos () && *state_._curr == '?') { token_.set (num_token::AOPT, null_token); state_.increment (); } else { token_.set (num_token::OPT, null_token); } break; case '*': if (!state_.eos () && *state_._curr == '?') { token_.set (num_token::AZEROORMORE, null_token); state_.increment (); } else { token_.set (num_token::ZEROORMORE, null_token); } break; case '+': if (!state_.eos () && *state_._curr == '?') { token_.set (num_token::AONEORMORE, null_token); state_.increment (); } else { token_.set (num_token::ONEORMORE, null_token); } break; case '{': open_curly (state_, token_); break; case '|': token_.set (num_token::OR, null_token); break; case '^': if (state_._curr - 1 == state_._start) { token_.set (num_token::CHARSET, bol_token); state_._seen_BOL_assertion = true; } else { create_charset_token (string (1, ch_), false, map_, token_); } break; case '$': if (state_._curr == state_._end) { token_.set (num_token::CHARSET, eol_token); state_._seen_EOL_assertion = true; } else { create_charset_token (string (1, ch_), false, map_, token_); } break; case '.': { string dot_; if (state_._flags & dot_not_newline) { dot_ = '\n'; } create_charset_token (dot_, true, map_, token_); break; } case '[': { charset (state_, map_, token_); break; } case '/': throw runtime_error("Lookahead ('/') is not supported yet."); break; default: if ((state_._flags & icase) && (std::isupper (ch_, state_._locale) || std::islower (ch_, state_._locale))) { CharT upper_ = std::toupper (ch_, state_._locale); CharT lower_ = std::tolower (ch_, state_._locale); string str_ (1, upper_); str_ += lower_; create_charset_token (str_, false, map_, token_); } else { create_charset_token (string (1, ch_), false, map_, token_); } break; } } } } private: typedef basic_re_tokeniser_helper tokeniser_helper; static void read_options (state &state_) { if (!state_.eos () && *state_._curr == '?') { CharT ch_ = 0; bool eos_ = false; bool negate_ = false; state_.increment (); eos_ = state_.next (ch_); state_._flags_stack.push (state_._flags); while (!eos_ && ch_ != ':') { switch (ch_) { case '-': negate_ ^= 1; break; case 'i': if (negate_) { state_._flags = static_cast (state_._flags & ~icase); } else { state_._flags = static_cast (state_._flags | icase); } negate_ = false; break; case 's': if (negate_) { state_._flags = static_cast (state_._flags | dot_not_newline); } else { state_._flags = static_cast (state_._flags & ~dot_not_newline); } negate_ = false; break; default: { std::ostringstream ss_; ss_ << "Unknown option at index " << state_.index () - 1 << '.'; throw runtime_error (ss_.str ().c_str ()); } } eos_ = state_.next (ch_); } // End of string handler will handle early termination } else if (!state_._flags_stack.empty ()) { state_._flags_stack.push (state_._flags); } } static void escape (state &state_, token_map &map_, num_token &token_) { CharT ch_ = 0; std::size_t str_len_ = 0; const CharT *str_ = tokeniser_helper::escape_sequence (state_, ch_, str_len_); if (str_) { state state2_ (str_ + 1, str_ + str_len_, state_._flags, state_._locale); charset (state2_, map_, token_); } else { create_charset_token (string (1, ch_), false, map_, token_); } } static void charset (state &state_, token_map &map_, num_token &token_) { string chars_; bool negated_ = false; tokeniser_helper::charset (state_, chars_, negated_); create_charset_token (chars_, negated_, map_, token_); } static void create_charset_token (const string &charset_, const bool negated_, token_map &map_, num_token &token_) { std::size_t id_ = null_token; string_token stok_ (negated_, charset_); stok_.remove_duplicates (); stok_.normalise (); typename token_map::const_iterator iter_ = map_.find (stok_); if (iter_ == map_.end ()) { id_ = map_.size (); map_.insert (token_pair (stok_, id_)); } else { id_ = iter_->second; } token_.set (num_token::CHARSET, id_); } static void open_curly (state &state_, num_token &token_) { if (state_.eos ()) { throw runtime_error ("Unexpected end of regex " "(missing '}')."); } else if (*state_._curr >= '0' && *state_._curr <= '9') { repeat_n (state_, token_); if (!state_.eos () && *state_._curr == '?') { token_._type = num_token::AREPEATN; state_.increment (); } } else { macro (state_, token_); } } // SYNTAX: // {n[,[n]]} // SEMANTIC RULES: // {0} - INVALID (throw exception) // {0,} = * // {0,0} - INVALID (throw exception) // {0,1} = ? // {1,} = + // {min,max} where min == max - {min} // {min,max} where max < min - INVALID (throw exception) static void repeat_n (state &state_, num_token &token_) { CharT ch_ = 0; bool eos_ = state_.next (ch_); while (!eos_ && ch_ >= '0' && ch_ <= '9') { token_._min *= 10; token_._min += ch_ - '0'; eos_ = state_.next (ch_); } if (eos_) { throw runtime_error ("Unexpected end of regex " "(missing '}')."); } bool min_max_ = false; bool repeatn_ = true; token_._comma = ch_ == ','; if (token_._comma) { eos_ = state_.next (ch_); if (eos_) { throw runtime_error ("Unexpected end of regex " "(missing '}')."); } if (ch_ == '}') { // Small optimisation: Check for '*' equivalency. if (token_._min == 0) { token_.set (num_token::ZEROORMORE, null_token); repeatn_ = false; } // Small optimisation: Check for '+' equivalency. else if (token_._min == 1) { token_.set (num_token::ONEORMORE, null_token); repeatn_ = false; } } else { if (ch_ < '0' || ch_ > '9') { std::ostringstream ss_; ss_ << "Missing '}' at index " << state_.index () - 1 << '.'; throw runtime_error (ss_.str ().c_str ()); } min_max_ = true; do { token_._max *= 10; token_._max += ch_ - '0'; eos_ = state_.next (ch_); } while (!eos_ && ch_ >= '0' && ch_ <= '9'); if (eos_) { throw runtime_error ("Unexpected end of regex " "(missing '}')."); } // Small optimisation: Check for '?' equivalency. if (token_._min == 0 && token_._max == 1) { token_.set (num_token::OPT, null_token); repeatn_ = false; } // Small optimisation: if min == max, then min. else if (token_._min == token_._max) { token_._comma = false; min_max_ = false; token_._max = 0; } } } if (ch_ != '}') { std::ostringstream ss_; ss_ << "Missing '}' at index " << state_.index () - 1 << '.'; throw runtime_error (ss_.str ().c_str ()); } if (repeatn_) { // SEMANTIC VALIDATION follows: // NOTE: {0,} has already become * // therefore we don't check for a comma. if (token_._min == 0 && token_._max == 0) { std::ostringstream ss_; ss_ << "Cannot have exactly zero repeats preceding index " << state_.index () << '.'; throw runtime_error (ss_.str ().c_str ()); } if (min_max_ && token_._max < token_._min) { std::ostringstream ss_; ss_ << "Max less than min preceding index " << state_.index () << '.'; throw runtime_error (ss_.str ().c_str ()); } token_.set (num_token::REPEATN, null_token); } } static void macro (state &state_, num_token &token_) { CharT ch_ = 0; bool eos_ = false; const CharT *start_ = state_._curr; state_.next (ch_); if (ch_ != '_' && !(ch_ >= 'A' && ch_ <= 'Z') && !(ch_ >= 'a' && ch_ <= 'z')) { std::ostringstream ss_; ss_ << "Invalid MACRO name at index " << state_.index () - 1 << '.'; throw runtime_error (ss_.str ().c_str ()); } do { eos_ = state_.next (ch_); if (eos_) { throw runtime_error ("Unexpected end of regex " "(missing '}')."); } } while (ch_ == '_' || ch_ == '-' || (ch_ >= 'A' && ch_ <= 'Z') || (ch_ >= 'a' && ch_ <= 'z') || (ch_ >= '0' && ch_ <= '9')); if (ch_ != '}') { std::ostringstream ss_; ss_ << "Missing '}' at index " << state_.index () - 1 << '.'; throw runtime_error (ss_.str ().c_str ()); } std::size_t len_ = state_._curr - 1 - start_; if (len_ > max_macro_len) { std::basic_stringstream ss_; std::ostringstream os_; os_ << "MACRO name '"; while (len_) { os_ << ss_.narrow (*start_++, ' '); --len_; } os_ << "' too long."; throw runtime_error (os_.str ()); } token_.set (num_token::MACRO, null_token); // Some systems have memcpy in namespace std. using namespace std; memcpy (token_._macro, start_, len_ * sizeof (CharT)); token_._macro[len_] = 0; } }; } } } #endif