// parser.hpp // Copyright (c) 2007-2009 Ben Hanson (http://www.benhanson.net/) // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) #ifndef BOOST_LEXER_PARSER_HPP #define BOOST_LEXER_PARSER_HPP #include #include "tree/end_node.hpp" #include "tree/iteration_node.hpp" #include "tree/leaf_node.hpp" #include "../runtime_error.hpp" #include "tree/selection_node.hpp" #include "tree/sequence_node.hpp" #include "../size_t.hpp" #include "tokeniser/re_tokeniser.hpp" namespace boost { namespace lexer { namespace detail { template class basic_parser { public: typedef basic_re_tokeniser tokeniser; typedef typename tokeniser::string string; typedef std::map macro_map; typedef node::node_ptr_vector node_ptr_vector; typedef typename tokeniser::num_token token; /* General principles of regex parsing: - Every regex is a sequence of sub-regexes. - Regexes consist of operands and operators - All operators decompose to sequence, selection ('|') and iteration ('*') - Regex tokens are stored on the stack. - When a complete sequence of regex tokens is on the stack it is processed. Grammar: -> -> | '|' -> -> | -> -> charset | macro | '('')' | -> '?' | '*' | '+' | '{n[,[m]]}' */ static node *parse (const CharT *start_, const CharT * const end_, const std::size_t id_, const std::size_t unique_id_, const std::size_t dfa_state_, const regex_flags flags_, const std::locale &locale_, node_ptr_vector &node_ptr_vector_, const macro_map ¯omap_, typename tokeniser::token_map &map_, bool &seen_BOL_assertion_, bool &seen_EOL_assertion_) { node *root_ = 0; state state_ (start_, end_, flags_, locale_); token lhs_token_; token rhs_token_; token_stack token_stack_; tree_node_stack tree_node_stack_; char action_ = 0; token_stack_.push (rhs_token_); tokeniser::next (state_, map_, rhs_token_); do { lhs_token_ = token_stack_.top (); action_ = lhs_token_.precedence (rhs_token_._type); switch (action_) { case '<': case '=': token_stack_.push (rhs_token_); tokeniser::next (state_, map_, rhs_token_); break; case '>': reduce (token_stack_, macromap_, node_ptr_vector_, tree_node_stack_); break; default: std::ostringstream ss_; ss_ << "A syntax error occured: '" << lhs_token_.precedence_string () << "' against '" << rhs_token_.precedence_string () << "' at index " << state_.index () << "."; throw runtime_error (ss_.str ().c_str ()); break; } } while (!token_stack_.empty ()); if (tree_node_stack_.empty ()) { throw runtime_error ("Empty rules are not allowed."); } BOOST_ASSERT(tree_node_stack_.size () == 1); node *lhs_node_ = tree_node_stack_.top (); tree_node_stack_.pop (); if (id_ == 0) { // Macros have no end state... root_ = lhs_node_; } else { node_ptr_vector_->push_back (static_cast(0)); node *rhs_node_ = new end_node (id_, unique_id_, dfa_state_); node_ptr_vector_->back () = rhs_node_; node_ptr_vector_->push_back (static_cast(0)); node_ptr_vector_->back () = new sequence_node (lhs_node_, rhs_node_); root_ = node_ptr_vector_->back (); } // Done this way as bug in VC++ 6 prevents |= operator working // properly! if (state_._seen_BOL_assertion) seen_BOL_assertion_ = true; if (state_._seen_EOL_assertion) seen_EOL_assertion_ = true; return root_; } private: typedef typename tokeniser::state state; typedef std::stack token_stack; typedef node::node_stack tree_node_stack; static void reduce (token_stack &token_stack_, const macro_map ¯omap_, node_ptr_vector &node_vector_ptr_, tree_node_stack &tree_node_stack_) { typename tokeniser::num_token lhs_; typename tokeniser::num_token rhs_; token_stack handle_; char action_ = 0; do { rhs_ = token_stack_.top (); token_stack_.pop (); handle_.push (rhs_); if (!token_stack_.empty ()) { lhs_ = token_stack_.top (); action_ = lhs_.precedence (rhs_._type); } } while (!token_stack_.empty () && action_ == '='); BOOST_ASSERT(token_stack_.empty () || action_ == '<'); switch (rhs_._type) { case token::BEGIN: // finished processing so exit break; case token::REGEX: // finished parsing, nothing to do break; case token::OREXP: orexp (handle_, token_stack_, node_vector_ptr_, tree_node_stack_); break; case token::SEQUENCE: token_stack_.push (token::OREXP); break; case token::SUB: sub (handle_, token_stack_, node_vector_ptr_, tree_node_stack_); break; case token::EXPRESSION: token_stack_.push (token::SUB); break; case token::REPEAT: repeat (handle_, token_stack_); break; case token::CHARSET: charset (handle_, token_stack_, node_vector_ptr_, tree_node_stack_); break; case token::MACRO: macro (handle_, token_stack_, macromap_, node_vector_ptr_, tree_node_stack_); break; case token::OPENPAREN: openparen (handle_, token_stack_); break; case token::OPT: case token::AOPT: optional (rhs_._type == token::OPT, node_vector_ptr_, tree_node_stack_); token_stack_.push (token::DUP); break; case token::ZEROORMORE: case token::AZEROORMORE: zero_or_more (rhs_._type == token::ZEROORMORE, node_vector_ptr_, tree_node_stack_); token_stack_.push (token::DUP); break; case token::ONEORMORE: case token::AONEORMORE: one_or_more (rhs_._type == token::ONEORMORE, node_vector_ptr_, tree_node_stack_); token_stack_.push (token::DUP); break; case token::REPEATN: case token::AREPEATN: repeatn (rhs_._type == token::REPEATN, handle_.top (), node_vector_ptr_, tree_node_stack_); token_stack_.push (token::DUP); break; default: throw runtime_error ("Internal error regex_parser::reduce"); break; } } static void orexp (token_stack &handle_, token_stack &token_stack_, node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_) { BOOST_ASSERT(handle_.top ()._type == token::OREXP && (handle_.size () == 1 || handle_.size () == 3)); if (handle_.size () == 1) { token_stack_.push (token::REGEX); } else { handle_.pop (); BOOST_ASSERT(handle_.top ()._type == token::OR); handle_.pop (); BOOST_ASSERT(handle_.top ()._type == token::SEQUENCE); perform_or (node_ptr_vector_, tree_node_stack_); token_stack_.push (token::OREXP); } } static void sub (token_stack &handle_, token_stack &token_stack_, node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_) { BOOST_ASSERT(handle_.top ()._type == token::SUB && (handle_.size () == 1 || handle_.size () == 2)); if (handle_.size () == 1) { token_stack_.push (token::SEQUENCE); } else { handle_.pop (); BOOST_ASSERT(handle_.top ()._type == token::EXPRESSION); // perform join sequence (node_ptr_vector_, tree_node_stack_); token_stack_.push (token::SUB); } } static void repeat (token_stack &handle_, token_stack &token_stack_) { BOOST_ASSERT(handle_.top ()._type == token::REPEAT && handle_.size () >= 1 && handle_.size () <= 3); if (handle_.size () == 1) { token_stack_.push (token::EXPRESSION); } else { handle_.pop (); BOOST_ASSERT(handle_.top ()._type == token::DUP); token_stack_.push (token::REPEAT); } } static void charset (token_stack &handle_, token_stack &token_stack_, node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_) { BOOST_ASSERT(handle_.top ()._type == token::CHARSET && handle_.size () == 1); // store charset node_ptr_vector_->push_back (static_cast(0)); const size_t id_ = handle_.top ()._id; node_ptr_vector_->back () = new leaf_node (id_, true); tree_node_stack_.push (node_ptr_vector_->back ()); token_stack_.push (token::REPEAT); } static void macro (token_stack &handle_, token_stack &token_stack_, const macro_map ¯omap_, node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_) { token &top_ = handle_.top (); BOOST_ASSERT(top_._type == token::MACRO && handle_.size () == 1); typename macro_map::const_iterator iter_ = macromap_.find (top_._macro); if (iter_ == macromap_.end ()) { const CharT *name_ = top_._macro; std::basic_stringstream ss_; std::ostringstream os_; os_ << "Unknown MACRO name '"; while (*name_) { os_ << ss_.narrow (*name_++, ' '); } os_ << "'."; throw runtime_error (os_.str ()); } tree_node_stack_.push (iter_->second->copy (node_ptr_vector_)); token_stack_.push (token::REPEAT); } static void openparen (token_stack &handle_, token_stack &token_stack_) { BOOST_ASSERT(handle_.top ()._type == token::OPENPAREN && handle_.size () == 3); handle_.pop (); BOOST_ASSERT(handle_.top ()._type == token::REGEX); handle_.pop (); BOOST_ASSERT(handle_.top ()._type == token::CLOSEPAREN); token_stack_.push (token::REPEAT); } static void perform_or (node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_) { // perform or node *rhs_ = tree_node_stack_.top (); tree_node_stack_.pop (); node *lhs_ = tree_node_stack_.top (); node_ptr_vector_->push_back (static_cast(0)); node_ptr_vector_->back () = new selection_node (lhs_, rhs_); tree_node_stack_.top () = node_ptr_vector_->back (); } static void sequence (node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_) { node *rhs_ = tree_node_stack_.top (); tree_node_stack_.pop (); node *lhs_ = tree_node_stack_.top (); node_ptr_vector_->push_back (static_cast(0)); node_ptr_vector_->back () = new sequence_node (lhs_, rhs_); tree_node_stack_.top () = node_ptr_vector_->back (); } static void optional (const bool greedy_, node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_) { // perform ? node *lhs_ = tree_node_stack_.top (); // You don't know if lhs_ is a leaf_node, so get firstpos. node::node_vector &firstpos_ = lhs_->firstpos (); for (node::node_vector::iterator iter_ = firstpos_.begin (), end_ = firstpos_.end (); iter_ != end_; ++iter_) { // These are leaf_nodes! (*iter_)->greedy (greedy_); } node_ptr_vector_->push_back (static_cast(0)); node *rhs_ = new leaf_node (null_token, greedy_); node_ptr_vector_->back () = rhs_; node_ptr_vector_->push_back (static_cast(0)); node_ptr_vector_->back () = new selection_node (lhs_, rhs_); tree_node_stack_.top () = node_ptr_vector_->back (); } static void zero_or_more (const bool greedy_, node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_) { // perform * node *ptr_ = tree_node_stack_.top (); node_ptr_vector_->push_back (static_cast(0)); node_ptr_vector_->back () = new iteration_node (ptr_, greedy_); tree_node_stack_.top () = node_ptr_vector_->back (); } static void one_or_more (const bool greedy_, node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_) { // perform + node *lhs_ = tree_node_stack_.top (); node *copy_ = lhs_->copy (node_ptr_vector_); node_ptr_vector_->push_back (static_cast(0)); node *rhs_ = new iteration_node (copy_, greedy_); node_ptr_vector_->back () = rhs_; node_ptr_vector_->push_back (static_cast(0)); node_ptr_vector_->back () = new sequence_node (lhs_, rhs_); tree_node_stack_.top () = node_ptr_vector_->back (); } // This is one of the most mind bending routines in this code... static void repeatn (const bool greedy_, const token &token_, node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_) { // perform {n[,[m]]} // Semantic checks have already been performed. // {0,} = * // {0,1} = ? // {1,} = + // therefore we do not check for these cases. if (!(token_._min == 1 && !token_._comma)) { const std::size_t top_ = token_._min > 0 ? token_._min : token_._max; if (token_._min == 0) { optional (greedy_, node_ptr_vector_, tree_node_stack_); } node *prev_ = tree_node_stack_.top ()->copy (node_ptr_vector_); node *curr_ = 0; for (std::size_t i_ = 2; i_ < top_; ++i_) { curr_ = prev_->copy (node_ptr_vector_); tree_node_stack_.push (static_cast(0)); tree_node_stack_.top () = prev_; sequence (node_ptr_vector_, tree_node_stack_); prev_ = curr_; } if (token_._comma && token_._min > 0) { if (token_._min > 1) { curr_ = prev_->copy (node_ptr_vector_); tree_node_stack_.push (static_cast(0)); tree_node_stack_.top () = prev_; sequence (node_ptr_vector_, tree_node_stack_); prev_ = curr_; } if (token_._comma && token_._max) { tree_node_stack_.push (static_cast(0)); tree_node_stack_.top () = prev_; optional (greedy_, node_ptr_vector_, tree_node_stack_); prev_ = tree_node_stack_.top (); tree_node_stack_.pop (); const std::size_t count_ = token_._max - token_._min; for (std::size_t i_ = 1; i_ < count_; ++i_) { curr_ = prev_->copy (node_ptr_vector_); tree_node_stack_.push (static_cast(0)); tree_node_stack_.top () = prev_; sequence (node_ptr_vector_, tree_node_stack_); prev_ = curr_; } } else { tree_node_stack_.push (static_cast(0)); tree_node_stack_.top () = prev_; zero_or_more (greedy_, node_ptr_vector_, tree_node_stack_); prev_ = tree_node_stack_.top (); tree_node_stack_.pop (); } } tree_node_stack_.push (static_cast(0)); tree_node_stack_.top () = prev_; sequence (node_ptr_vector_, tree_node_stack_); } } }; } } } #endif