// tokeniser_helper.hpp // Copyright (c) 2007-2009 Ben Hanson (http://www.benhanson.net/) // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) #ifndef BOOST_LEXER_RE_TOKENISER_HELPER_H #define BOOST_LEXER_RE_TOKENISER_HELPER_H #include "../../char_traits.hpp" // strlen() #include #include "../../size_t.hpp" #include "re_tokeniser_state.hpp" namespace boost { namespace lexer { namespace detail { template > class basic_re_tokeniser_helper { public: typedef basic_re_tokeniser_state state; typedef std::basic_string string; static const CharT *escape_sequence (state &state_, CharT &ch_, std::size_t &str_len_) { bool eos_ = state_.eos (); if (eos_) { throw runtime_error ("Unexpected end of regex " "following '\\'."); } const CharT *str_ = charset_shortcut (*state_._curr, str_len_); if (str_) { state_.increment (); } else { ch_ = chr (state_); } return str_; } // This function can call itself. static void charset (state &state_, string &chars_, bool &negated_) { CharT ch_ = 0; bool eos_ = state_.next (ch_); if (eos_) { // Pointless returning index if at end of string throw runtime_error ("Unexpected end of regex " "following '['."); } negated_ = ch_ == '^'; if (negated_) { eos_ = state_.next (ch_); if (eos_) { // Pointless returning index if at end of string throw runtime_error ("Unexpected end of regex " "following '^'."); } } bool chset_ = false; CharT prev_ = 0; while (ch_ != ']') { if (ch_ == '\\') { std::size_t str_len_ = 0; const CharT *str_ = escape_sequence (state_, prev_, str_len_); chset_ = str_ != 0; if (chset_) { state temp_state_ (str_ + 1, str_ + str_len_, state_._flags, state_._locale); string temp_chars_; bool temp_negated_ = false; charset (temp_state_, temp_chars_, temp_negated_); if (negated_ != temp_negated_) { std::ostringstream ss_; ss_ << "Mismatch in charset negation preceding " "index " << state_.index () << '.'; throw runtime_error (ss_.str ().c_str ()); } chars_ += temp_chars_; } } /* else if (ch_ == '[' && !state_.eos () && *state_._curr == ':') { // TODO: POSIX charsets } */ else { chset_ = false; prev_ = ch_; } eos_ = state_.next (ch_); // Covers preceding if, else if and else if (eos_) { // Pointless returning index if at end of string throw runtime_error ("Unexpected end of regex " "(missing ']')."); } if (ch_ == '-') { charset_range (chset_, state_, eos_, ch_, prev_, chars_); } else if (!chset_) { if ((state_._flags & icase) && (std::isupper (prev_, state_._locale) || std::islower (prev_, state_._locale))) { CharT upper_ = std::toupper (prev_, state_._locale); CharT lower_ = std::tolower (prev_, state_._locale); chars_ += upper_; chars_ += lower_; } else { chars_ += prev_; } } } if (!negated_ && chars_.empty ()) { throw runtime_error ("Empty charsets not allowed."); } } static CharT chr (state &state_) { CharT ch_ = 0; // eos_ has already been checked for. switch (*state_._curr) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': ch_ = decode_octal (state_); break; case 'a': ch_ = '\a'; state_.increment (); break; case 'b': ch_ = '\b'; state_.increment (); break; case 'c': ch_ = decode_control_char (state_); break; case 'e': ch_ = 27; // '\e' not recognised by compiler state_.increment (); break; case 'f': ch_ = '\f'; state_.increment (); break; case 'n': ch_ = '\n'; state_.increment (); break; case 'r': ch_ = '\r'; state_.increment (); break; case 't': ch_ = '\t'; state_.increment (); break; case 'v': ch_ = '\v'; state_.increment (); break; case 'x': ch_ = decode_hex (state_); break; default: ch_ = *state_._curr; state_.increment (); break; } return ch_; } private: static const char *charset_shortcut (const char ch_, std::size_t &str_len_) { const char *str_ = 0; switch (ch_) { case 'd': str_ = "[0-9]"; break; case 'D': str_ = "[^0-9]"; break; case 's': str_ = "[ \t\n\r\f\v]"; break; case 'S': str_ = "[^ \t\n\r\f\v]"; break; case 'w': str_ = "[_0-9A-Za-z]"; break; case 'W': str_ = "[^_0-9A-Za-z]"; break; } if (str_) { // Some systems have strlen in namespace std. using namespace std; str_len_ = strlen (str_); } else { str_len_ = 0; } return str_; } static const wchar_t *charset_shortcut (const wchar_t ch_, std::size_t &str_len_) { const wchar_t *str_ = 0; switch (ch_) { case 'd': str_ = L"[0-9]"; break; case 'D': str_ = L"[^0-9]"; break; case 's': str_ = L"[ \t\n\r\f\v]"; break; case 'S': str_ = L"[^ \t\n\r\f\v]"; break; case 'w': str_ = L"[_0-9A-Za-z]"; break; case 'W': str_ = L"[^_0-9A-Za-z]"; break; } if (str_) { // Some systems have wcslen in namespace std. using namespace std; str_len_ = wcslen (str_); } else { str_len_ = 0; } return str_; } static CharT decode_octal (state &state_) { std::size_t accumulator_ = 0; CharT ch_ = *state_._curr; unsigned short count_ = 3; bool eos_ = false; for (;;) { accumulator_ *= 8; accumulator_ += ch_ - '0'; --count_; state_.increment (); eos_ = state_.eos (); if (!count_ || eos_) break; ch_ = *state_._curr; // Don't consume invalid chars! if (ch_ < '0' || ch_ > '7') { break; } } return static_cast (accumulator_); } static CharT decode_control_char (state &state_) { // Skip over 'c' state_.increment (); CharT ch_ = 0; bool eos_ = state_.next (ch_); if (eos_) { // Pointless returning index if at end of string throw runtime_error ("Unexpected end of regex following \\c."); } else { if (ch_ >= 'a' && ch_ <= 'z') { ch_ -= 'a' - 1; } else if (ch_ >= 'A' && ch_ <= 'Z') { ch_ -= 'A' - 1; } else if (ch_ == '@') { // Apparently... ch_ = 0; } else { std::ostringstream ss_; ss_ << "Invalid control char at index " << state_.index () - 1 << '.'; throw runtime_error (ss_.str ().c_str ()); } } return ch_; } static CharT decode_hex (state &state_) { // Skip over 'x' state_.increment (); CharT ch_ = 0; bool eos_ = state_.next (ch_); if (eos_) { // Pointless returning index if at end of string throw runtime_error ("Unexpected end of regex following \\x."); } if (!((ch_ >= '0' && ch_ <= '9') || (ch_ >= 'a' && ch_ <= 'f') || (ch_ >= 'A' && ch_ <= 'F'))) { std::ostringstream ss_; ss_ << "Illegal char following \\x at index " << state_.index () - 1 << '.'; throw runtime_error (ss_.str ().c_str ()); } std::size_t hex_ = 0; do { hex_ *= 16; if (ch_ >= '0' && ch_ <= '9') { hex_ += ch_ - '0'; } else if (ch_ >= 'a' && ch_ <= 'f') { hex_ += 10 + (ch_ - 'a'); } else { hex_ += 10 + (ch_ - 'A'); } eos_ = state_.eos (); if (!eos_) { ch_ = *state_._curr; // Don't consume invalid chars! if (((ch_ >= '0' && ch_ <= '9') || (ch_ >= 'a' && ch_ <= 'f') || (ch_ >= 'A' && ch_ <= 'F'))) { state_.increment (); } else { eos_ = true; } } } while (!eos_); return static_cast (hex_); } static void charset_range (const bool chset_, state &state_, bool &eos_, CharT &ch_, const CharT prev_, string &chars_) { if (chset_) { std::ostringstream ss_; ss_ << "Charset cannot form start of range preceding " "index " << state_.index () - 1 << '.'; throw runtime_error (ss_.str ().c_str ()); } eos_ = state_.next (ch_); if (eos_) { // Pointless returning index if at end of string throw runtime_error ("Unexpected end of regex " "following '-'."); } CharT curr_ = 0; if (ch_ == '\\') { std::size_t str_len_ = 0; if (escape_sequence (state_, curr_, str_len_)) { std::ostringstream ss_; ss_ << "Charset cannot form end of range preceding index " << state_.index () << '.'; throw runtime_error (ss_.str ().c_str ()); } } /* else if (ch_ == '[' && !state_.eos () && *state_._curr == ':') { std::ostringstream ss_; ss_ << "POSIX char class cannot form end of range at " "index " << state_.index () - 1 << '.'; throw runtime_error (ss_.str ().c_str ()); } */ else { curr_ = ch_; } eos_ = state_.next (ch_); // Covers preceding if and else if (eos_) { // Pointless returning index if at end of string throw runtime_error ("Unexpected end of regex " "(missing ']')."); } std::size_t start_ = static_cast (prev_); std::size_t end_ = static_cast (curr_); // Semanic check if (end_ < start_) { std::ostringstream ss_; ss_ << "Invalid range in charset preceding index " << state_.index () - 1 << '.'; throw runtime_error (ss_.str ().c_str ()); } chars_.reserve (chars_.size () + (end_ + 1 - start_)); for (; start_ <= end_; ++start_) { CharT ch_ = static_cast (start_); if ((state_._flags & icase) && (std::isupper (ch_, state_._locale) || std::islower (ch_, state_._locale))) { CharT upper_ = std::toupper (ch_, state_._locale); CharT lower_ = std::tolower (ch_, state_._locale); chars_ += (upper_); chars_ += (lower_); } else { chars_ += (ch_); } } } }; } } } #endif