// generator.hpp
// Copyright (c) 2007-2009 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef BOOST_LEXER_GENERATOR_HPP
#define BOOST_LEXER_GENERATOR_HPP

#include "char_traits.hpp"
// memcmp()
#include <cstring>
#include "partition/charset.hpp"
#include "partition/equivset.hpp"
#include <memory>
#include "parser/tree/node.hpp"
#include "parser/parser.hpp"
#include "containers/ptr_list.hpp"
#include "rules.hpp"
#include "state_machine.hpp"

namespace boost
{
namespace lexer
{
template<typename CharT, typename Traits = char_traits<CharT> >
class basic_generator
{
public:
    typedef typename detail::internals::size_t_vector size_t_vector;
    typedef basic_rules<CharT> rules;

    static void build (const rules &rules_,
        basic_state_machine<CharT> &state_machine_)
    {
        std::size_t index_ = 0;
        std::size_t size_ = rules_.statemap ().size ();
        node_ptr_vector node_ptr_vector_;
        detail::internals &internals_ = const_cast<detail::internals &>
            (state_machine_.data ());
        bool seen_BOL_assertion_ = false;
        bool seen_EOL_assertion_ = false;

        state_machine_.clear ();

        for (; index_ < size_; ++index_)
        {
            internals_._lookup->push_back (static_cast<size_t_vector *>(0));
            internals_._lookup->back () = new size_t_vector;
            internals_._dfa_alphabet.push_back (0);
            internals_._dfa->push_back (static_cast<size_t_vector *>(0));
            internals_._dfa->back () = new size_t_vector;
        }

        for (index_ = 0, size_ = internals_._lookup->size ();
            index_ < size_; ++index_)
        {
            internals_._lookup[index_]->resize (sizeof (CharT) == 1 ?
                num_chars : num_wchar_ts, dead_state_index);

            if (!rules_.regexes ()[index_].empty ())
            {
                // vector mapping token indexes to partitioned token index sets
                index_set_vector set_mapping_;
                // syntax tree
                detail::node *root_ = build_tree (rules_, index_,
                    node_ptr_vector_, internals_, set_mapping_);

                build_dfa (root_, set_mapping_,
                    internals_._dfa_alphabet[index_],
                    *internals_._dfa[index_]);

                if (internals_._seen_BOL_assertion)
                {
                    seen_BOL_assertion_ = true;
                }

                if (internals_._seen_EOL_assertion)
                {
                    seen_EOL_assertion_ = true;
                }

                internals_._seen_BOL_assertion = false;
                internals_._seen_EOL_assertion = false;
            }
        }

        internals_._seen_BOL_assertion = seen_BOL_assertion_;
        internals_._seen_EOL_assertion = seen_EOL_assertion_;
    }

    static void minimise (basic_state_machine<CharT> &state_machine_)
    {
        detail::internals &internals_ = const_cast<detail::internals &>
            (state_machine_.data ());
        const std::size_t machines_ = internals_._dfa->size ();

        for (std::size_t i_ = 0; i_ < machines_; ++i_)
        {
            const std::size_t dfa_alphabet_ = internals_._dfa_alphabet[i_];
            size_t_vector *dfa_ = internals_._dfa[i_];

            if (dfa_alphabet_ != 0)
            {
                std::size_t size_ = 0;

                do
                {
                    size_ = dfa_->size ();
                    minimise_dfa (dfa_alphabet_, *dfa_, size_);
                } while (dfa_->size () != size_);
            }
        }
    }
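
    // Typical driver (illustrative sketch only, not part of this header;
    // the exact rules interface, e.g. add (), is declared in rules.hpp):
    //
    //     boost::lexer::rules rules_;
    //     boost::lexer::state_machine sm_;
    //
    //     rules_.add ("[0-9]+", 1);    // token id 1
    //     rules_.add ("[ \t\n]+", 2);  // token id 2
    //     boost::lexer::generator::build (rules_, sm_);
    //     boost::lexer::generator::minimise (sm_);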

protected:
    typedef detail::basic_charset<CharT> charset;
    typedef detail::ptr_list<charset> charset_list;
    typedef std::auto_ptr<charset> charset_ptr;
    typedef detail::equivset equivset;
    typedef detail::ptr_list<equivset> equivset_list;
    typedef std::auto_ptr<equivset> equivset_ptr;
    typedef typename charset::index_set index_set;
    typedef std::vector<index_set> index_set_vector;
    typedef detail::basic_parser<CharT> parser;
    typedef typename parser::node_ptr_vector node_ptr_vector;
    typedef std::set<const detail::node *> node_set;
    typedef detail::ptr_vector<node_set> node_set_vector;
    typedef std::vector<const detail::node *> node_vector;
    typedef detail::ptr_vector<node_vector> node_vector_vector;
    typedef typename parser::string string;
    typedef std::pair<string, string> string_pair;
    typedef typename parser::tokeniser::string_token string_token;
    typedef std::deque<string_pair> macro_deque;
    typedef std::pair<string, const detail::node *> macro_pair;
    typedef typename parser::macro_map::iterator macro_iter;
    typedef std::pair<macro_iter, bool> macro_iter_pair;
    typedef typename parser::tokeniser::token_map token_map;

    static detail::node *build_tree (const rules &rules_,
        const std::size_t state_, node_ptr_vector &node_ptr_vector_,
        detail::internals &internals_, index_set_vector &set_mapping_)
    {
        size_t_vector *lookup_ = internals_._lookup[state_];
        const typename rules::string_deque_deque &regexes_ =
            rules_.regexes ();
        const typename rules::id_vector_deque &ids_ = rules_.ids ();
        const typename rules::id_vector_deque &unique_ids_ =
            rules_.unique_ids ();
        const typename rules::id_vector_deque &states_ = rules_.states ();
        typename rules::string_deque::const_iterator regex_iter_ =
            regexes_[state_].begin ();
        typename rules::string_deque::const_iterator regex_iter_end_ =
            regexes_[state_].end ();
        typename rules::id_vector::const_iterator ids_iter_ =
            ids_[state_].begin ();
        typename rules::id_vector::const_iterator unique_ids_iter_ =
            unique_ids_[state_].begin ();
        typename rules::id_vector::const_iterator states_iter_ =
            states_[state_].begin ();
        const typename rules::string &regex_ = *regex_iter_;
        // map of regex charset tokens (strings) to index
        token_map token_map_;
        const typename rules::string_pair_deque &macrodeque_ =
            rules_.macrodeque ();
        typename parser::macro_map macromap_;
        typename detail::node::node_vector tree_vector_;

        build_macros (token_map_, macrodeque_, macromap_,
            rules_.flags (), rules_.locale (), node_ptr_vector_,
            internals_._seen_BOL_assertion,
            internals_._seen_EOL_assertion);

        detail::node *root_ = parser::parse (regex_.c_str (),
            regex_.c_str () + regex_.size (), *ids_iter_, *unique_ids_iter_,
            *states_iter_, rules_.flags (), rules_.locale (),
            node_ptr_vector_, macromap_, token_map_,
            internals_._seen_BOL_assertion,
            internals_._seen_EOL_assertion);

        ++regex_iter_;
        ++ids_iter_;
        ++unique_ids_iter_;
        ++states_iter_;
        tree_vector_.push_back (root_);

        // build syntax trees
        while (regex_iter_ != regex_iter_end_)
        {
            // re-declare var, otherwise we perform an assignment..!
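            // (A C++ reference cannot be rebound after initialisation,
            // hence the fresh declaration inside the loop scope.)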
            const typename rules::string &regex_ = *regex_iter_;

            root_ = parser::parse (regex_.c_str (),
                regex_.c_str () + regex_.size (), *ids_iter_,
                *unique_ids_iter_, *states_iter_, rules_.flags (),
                rules_.locale (), node_ptr_vector_, macromap_, token_map_,
                internals_._seen_BOL_assertion,
                internals_._seen_EOL_assertion);
            tree_vector_.push_back (root_);
            ++regex_iter_;
            ++ids_iter_;
            ++unique_ids_iter_;
            ++states_iter_;
        }

        if (internals_._seen_BOL_assertion)
        {
            // Fixup BOLs
            typename detail::node::node_vector::iterator iter_ =
                tree_vector_.begin ();
            typename detail::node::node_vector::iterator end_ =
                tree_vector_.end ();

            for (; iter_ != end_; ++iter_)
            {
                fixup_bol (*iter_, node_ptr_vector_);
            }
        }

        // join trees
        {
            typename detail::node::node_vector::iterator iter_ =
                tree_vector_.begin ();
            typename detail::node::node_vector::iterator end_ =
                tree_vector_.end ();

            if (iter_ != end_)
            {
                root_ = *iter_;
                ++iter_;
            }

            for (; iter_ != end_; ++iter_)
            {
                node_ptr_vector_->push_back
                    (static_cast<detail::selection_node *>(0));
                node_ptr_vector_->back () = new detail::selection_node
                    (root_, *iter_);
                root_ = node_ptr_vector_->back ();
            }
        }

        // partitioned token list
        charset_list token_list_;

        set_mapping_.resize (token_map_.size ());
        partition_tokens (token_map_, token_list_);

        typename charset_list::list::const_iterator iter_ =
            token_list_->begin ();
        typename charset_list::list::const_iterator end_ =
            token_list_->end ();
        std::size_t index_ = 0;

        for (; iter_ != end_; ++iter_, ++index_)
        {
            const charset *cs_ = *iter_;
            typename charset::index_set::const_iterator set_iter_ =
                cs_->_index_set.begin ();
            typename charset::index_set::const_iterator set_end_ =
                cs_->_index_set.end ();

            fill_lookup (cs_->_token, lookup_, index_);

            for (; set_iter_ != set_end_; ++set_iter_)
            {
                set_mapping_[*set_iter_].insert (index_);
            }
        }

        internals_._dfa_alphabet[state_] = token_list_->size () + dfa_offset;
        return root_;
    }

    static void build_macros (token_map &token_map_,
        const macro_deque &macrodeque_,
        typename parser::macro_map &macromap_, const regex_flags flags_,
        const std::locale &locale_, node_ptr_vector &node_ptr_vector_,
        bool &seen_BOL_assertion_, bool &seen_EOL_assertion_)
    {
        for (typename macro_deque::const_iterator iter_ =
            macrodeque_.begin (), end_ = macrodeque_.end ();
            iter_ != end_; ++iter_)
        {
            const typename rules::string &name_ = iter_->first;
            const typename rules::string &regex_ = iter_->second;
            detail::node *node_ = parser::parse (regex_.c_str (),
                regex_.c_str () + regex_.size (), 0, 0, 0, flags_, locale_,
                node_ptr_vector_, macromap_, token_map_,
                seen_BOL_assertion_, seen_EOL_assertion_);
            macro_iter_pair map_iter_ = macromap_.
                insert (macro_pair (name_,
                    static_cast<const detail::node *>(0)));

            map_iter_.first->second = node_;
        }
    }

    static void build_dfa (detail::node *root_,
        const index_set_vector &set_mapping_,
        const std::size_t dfa_alphabet_, size_t_vector &dfa_)
    {
        typename detail::node::node_vector *followpos_ =
            &root_->firstpos ();
        node_set_vector seen_sets_;
        node_vector_vector seen_vectors_;
        size_t_vector hash_vector_;

        // 'jam' state
        dfa_.resize (dfa_alphabet_, 0);
        closure (followpos_, seen_sets_, seen_vectors_,
            hash_vector_, dfa_alphabet_, dfa_);

        std::size_t *ptr_ = 0;

        for (std::size_t index_ = 0; index_ < seen_vectors_->size ();
            ++index_)
        {
            equivset_list equiv_list_;

            build_equiv_list (seen_vectors_[index_], set_mapping_,
                equiv_list_);

            for (typename equivset_list::list::const_iterator iter_ =
                equiv_list_->begin (), end_ = equiv_list_->end ();
                iter_ != end_; ++iter_)
            {
                equivset *equivset_ = *iter_;
                const std::size_t transition_ = closure
                    (&equivset_->_followpos, seen_sets_, seen_vectors_,
                    hash_vector_, dfa_alphabet_, dfa_);

                if (transition_ != npos)
                {
                    ptr_ = &dfa_.front () + ((index_ + 1) * dfa_alphabet_);

                    // Prune abstemious transitions from end states.
                    if (*ptr_ && !equivset_->_greedy) continue;

                    for (typename detail::equivset::index_vector::
                        const_iterator equiv_iter_ =
                        equivset_->_index_vector.begin (),
                        equiv_end_ = equivset_->_index_vector.end ();
                        equiv_iter_ != equiv_end_; ++equiv_iter_)
                    {
                        const std::size_t index_ = *equiv_iter_;

                        if (index_ == bol_token)
                        {
                            if (ptr_[eol_index] == 0)
                            {
                                ptr_[bol_index] = transition_;
                            }
                        }
                        else if (index_ == eol_token)
                        {
                            if (ptr_[bol_index] == 0)
                            {
                                ptr_[eol_index] = transition_;
                            }
                        }
                        else
                        {
                            ptr_[index_ + dfa_offset] = transition_;
                        }
                    }
                }
            }
        }
    }

    static std::size_t closure (typename detail::node::node_vector
        *followpos_, node_set_vector &seen_sets_,
        node_vector_vector &seen_vectors_, size_t_vector &hash_vector_,
        const std::size_t size_, size_t_vector &dfa_)
    {
        bool end_state_ = false;
        std::size_t id_ = 0;
        std::size_t unique_id_ = npos;
        std::size_t state_ = 0;
        std::size_t hash_ = 0;

        if (followpos_->empty ()) return npos;

        std::size_t index_ = 0;
        std::auto_ptr<node_set> set_ptr_ (new node_set);
        std::auto_ptr<node_vector> vector_ptr_ (new node_vector);

        for (typename detail::node::node_vector::const_iterator iter_ =
            followpos_->begin (), end_ = followpos_->end ();
            iter_ != end_; ++iter_)
        {
            closure_ex (*iter_, end_state_, id_, unique_id_, state_,
                set_ptr_.get (), vector_ptr_.get (), hash_);
        }

        bool found_ = false;
        typename size_t_vector::const_iterator hash_iter_ =
            hash_vector_.begin ();
        typename size_t_vector::const_iterator hash_end_ =
            hash_vector_.end ();
        typename node_set_vector::vector::const_iterator set_iter_ =
            seen_sets_->begin ();

        for (; hash_iter_ != hash_end_; ++hash_iter_, ++set_iter_)
        {
            found_ = *hash_iter_ == hash_ && *(*set_iter_) == *set_ptr_;
            ++index_;

            if (found_) break;
        }

        if (!found_)
        {
            seen_sets_->push_back (static_cast<node_set *>(0));
            seen_sets_->back () = set_ptr_.release ();
            seen_vectors_->push_back (static_cast<node_vector *>(0));
            seen_vectors_->back () = vector_ptr_.release ();
            hash_vector_.push_back (hash_);
            // State 0 is the jam state...
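            // seen_sets_ holds one entry per real DFA state (the jam state
            // is not stored), so its size is the index of the state just
            // added.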
            index_ = seen_sets_->size ();

            const std::size_t old_size_ = dfa_.size ();

            dfa_.resize (old_size_ + size_, 0);

            if (end_state_)
            {
                dfa_[old_size_] |= end_state;
                dfa_[old_size_ + id_index] = id_;
                dfa_[old_size_ + unique_id_index] = unique_id_;
                dfa_[old_size_ + state_index] = state_;
            }
        }

        return index_;
    }

    static void closure_ex (detail::node *node_, bool &end_state_,
        std::size_t &id_, std::size_t &unique_id_, std::size_t &state_,
        node_set *set_ptr_, node_vector *vector_ptr_, std::size_t &hash_)
    {
        const bool temp_end_state_ = node_->end_state ();

        if (temp_end_state_)
        {
            if (!end_state_)
            {
                end_state_ = true;
                id_ = node_->id ();
                unique_id_ = node_->unique_id ();
                state_ = node_->lexer_state ();
            }
        }

        if (set_ptr_->insert (node_).second)
        {
            vector_ptr_->push_back (node_);
            hash_ += reinterpret_cast<std::size_t> (node_);
        }
    }

    static void partition_tokens (const token_map &map_, charset_list &lhs_)
    {
        charset_list rhs_;

        fill_rhs_list (map_, rhs_);

        if (!rhs_->empty ())
        {
            typename charset_list::list::iterator iter_;
            typename charset_list::list::iterator end_;
            charset_ptr overlap_ (new charset);

            lhs_->push_back (static_cast<charset *>(0));
            lhs_->back () = rhs_->front ();
            rhs_->pop_front ();

            while (!rhs_->empty ())
            {
                charset_ptr r_ (rhs_->front ());

                rhs_->pop_front ();
                iter_ = lhs_->begin ();
                end_ = lhs_->end ();

                while (!r_->empty () && iter_ != end_)
                {
                    typename charset_list::list::iterator l_iter_ = iter_;

                    (*l_iter_)->intersect (*r_.get (), *overlap_.get ());

                    if (overlap_->empty ())
                    {
                        ++iter_;
                    }
                    else if ((*l_iter_)->empty ())
                    {
                        delete *l_iter_;
                        *l_iter_ = overlap_.release ();

                        // VC++ 6 Hack:
                        charset_ptr temp_overlap_ (new charset);

                        overlap_ = temp_overlap_;
                        ++iter_;
                    }
                    else if (r_->empty ())
                    {
                        delete r_.release ();
                        r_ = overlap_;

                        // VC++ 6 Hack:
                        charset_ptr temp_overlap_ (new charset);

                        overlap_ = temp_overlap_;
                        break;
                    }
                    else
                    {
                        iter_ = lhs_->insert (++iter_,
                            static_cast<charset *>(0));
                        *iter_ = overlap_.release ();

                        // VC++ 6 Hack:
                        charset_ptr temp_overlap_ (new charset);

                        overlap_ = temp_overlap_;
                        ++iter_;
                        end_ = lhs_->end ();
                    }
                }

                if (!r_->empty ())
                {
                    lhs_->push_back (static_cast<charset *>(0));
                    lhs_->back () = r_.release ();
                }
            }
        }
    }

    static void fill_rhs_list (const token_map &map_, charset_list &list_)
    {
        typename parser::tokeniser::token_map::const_iterator iter_ =
            map_.begin ();
        typename parser::tokeniser::token_map::const_iterator end_ =
            map_.end ();

        for (; iter_ != end_; ++iter_)
        {
            list_->push_back (static_cast<charset *>(0));
            list_->back () = new charset (iter_->first, iter_->second);
        }
    }

    static void fill_lookup (const string_token &token_,
        size_t_vector *lookup_, const std::size_t index_)
    {
        const CharT *curr_ = token_._charset.c_str ();
        const CharT *chars_end_ = curr_ + token_._charset.size ();
        std::size_t *ptr_ = &lookup_->front ();
        const std::size_t max_ = sizeof (CharT) == 1 ?
            num_chars : num_wchar_ts;

        if (token_._negated)
        {
            CharT curr_char_ = sizeof (CharT) == 1 ?
                -128 : 0;
            std::size_t i_ = 0;

            while (curr_ < chars_end_)
            {
                while (*curr_ > curr_char_)
                {
                    ptr_[static_cast<typename Traits::index_type>
                        (curr_char_)] = index_ + dfa_offset;
                    ++curr_char_;
                    ++i_;
                }

                ++curr_char_;
                ++curr_;
                ++i_;
            }

            for (; i_ < max_; ++i_)
            {
                ptr_[static_cast<typename Traits::index_type>(curr_char_)] =
                    index_ + dfa_offset;
                ++curr_char_;
            }
        }
        else
        {
            while (curr_ < chars_end_)
            {
                ptr_[static_cast<typename Traits::index_type>(*curr_)] =
                    index_ + dfa_offset;
                ++curr_;
            }
        }
    }

    static void build_equiv_list (const node_vector *vector_,
        const index_set_vector &set_mapping_, equivset_list &lhs_)
    {
        equivset_list rhs_;

        fill_rhs_list (vector_, set_mapping_, rhs_);

        if (!rhs_->empty ())
        {
            typename equivset_list::list::iterator iter_;
            typename equivset_list::list::iterator end_;
            equivset_ptr overlap_ (new equivset);

            lhs_->push_back (static_cast<equivset *>(0));
            lhs_->back () = rhs_->front ();
            rhs_->pop_front ();

            while (!rhs_->empty ())
            {
                equivset_ptr r_ (rhs_->front ());

                rhs_->pop_front ();
                iter_ = lhs_->begin ();
                end_ = lhs_->end ();

                while (!r_->empty () && iter_ != end_)
                {
                    typename equivset_list::list::iterator l_iter_ = iter_;

                    (*l_iter_)->intersect (*r_.get (), *overlap_.get ());

                    if (overlap_->empty ())
                    {
                        ++iter_;
                    }
                    else if ((*l_iter_)->empty ())
                    {
                        delete *l_iter_;
                        *l_iter_ = overlap_.release ();

                        // VC++ 6 Hack:
                        equivset_ptr temp_overlap_ (new equivset);

                        overlap_ = temp_overlap_;
                        ++iter_;
                    }
                    else if (r_->empty ())
                    {
                        delete r_.release ();
                        r_ = overlap_;

                        // VC++ 6 Hack:
                        equivset_ptr temp_overlap_ (new equivset);

                        overlap_ = temp_overlap_;
                        break;
                    }
                    else
                    {
                        iter_ = lhs_->insert (++iter_,
                            static_cast<equivset *>(0));
                        *iter_ = overlap_.release ();

                        // VC++ 6 Hack:
                        equivset_ptr temp_overlap_ (new equivset);

                        overlap_ = temp_overlap_;
                        ++iter_;
                        end_ = lhs_->end ();
                    }
                }

                if (!r_->empty ())
                {
                    lhs_->push_back (static_cast<equivset *>(0));
                    lhs_->back () = r_.release ();
                }
            }
        }
    }

    static void fill_rhs_list (const node_vector *vector_,
        const index_set_vector &set_mapping_, equivset_list &list_)
    {
        typename node_vector::const_iterator iter_ = vector_->begin ();
        typename node_vector::const_iterator end_ = vector_->end ();

        for (; iter_ != end_; ++iter_)
        {
            const detail::node *node_ = *iter_;

            if (!node_->end_state ())
            {
                const std::size_t token_ = node_->token ();

                if (token_ != null_token)
                {
                    list_->push_back (static_cast<equivset *>(0));

                    if (token_ == bol_token || token_ == eol_token)
                    {
                        std::set<std::size_t> index_set_;

                        index_set_.insert (token_);
                        list_->back () = new equivset (index_set_,
                            node_->greedy (), token_, node_->followpos ());
                    }
                    else
                    {
                        list_->back () = new equivset (set_mapping_[token_],
                            node_->greedy (), token_, node_->followpos ());
                    }
                }
            }
        }
    }

    static void fixup_bol (detail::node * &root_,
        node_ptr_vector &node_ptr_vector_)
    {
        typename detail::node::node_vector *first_ = &root_->firstpos ();
        bool found_ = false;
        typename detail::node::node_vector::const_iterator iter_ =
            first_->begin ();
        typename detail::node::node_vector::const_iterator end_ =
            first_->end ();

        for (; iter_ != end_; ++iter_)
        {
            const detail::node *node_ = *iter_;

            found_ = !node_->end_state () && node_->token () == bol_token;

            if (found_) break;
        }

        if (!found_)
        {
            node_ptr_vector_->push_back
                (static_cast<detail::leaf_node *>(0));
            node_ptr_vector_->back () = new detail::leaf_node
                (bol_token, true);

            detail::node *lhs_ = node_ptr_vector_->back ();

            node_ptr_vector_->push_back
                (static_cast<detail::leaf_node *>(0));
            node_ptr_vector_->back () = new detail::leaf_node
                (null_token, true);

            detail::node *rhs_ = node_ptr_vector_->back ();

            node_ptr_vector_->push_back
                (static_cast<detail::selection_node *>(0));
            node_ptr_vector_->back () = new detail::selection_node
                (lhs_, rhs_);
            lhs_ = node_ptr_vector_->back ();

            node_ptr_vector_->push_back
                (static_cast<detail::sequence_node *>(0));
            node_ptr_vector_->back () =
                new detail::sequence_node (lhs_, root_);
            root_ = node_ptr_vector_->back ();
        }
    }

    static void minimise_dfa (const std::size_t dfa_alphabet_,
        size_t_vector &dfa_, std::size_t size_)
    {
        const std::size_t *first_ = &dfa_.front ();
        const std::size_t *second_ = 0;
        const std::size_t *end_ = first_ + size_;
        std::size_t index_ = 1;
        std::size_t new_index_ = 1;
        std::size_t curr_index_ = 0;
        index_set index_set_;
        size_t_vector lookup_;
        std::size_t *lookup_ptr_ = 0;

        lookup_.resize (size_ / dfa_alphabet_, null_token);
        lookup_ptr_ = &lookup_.front ();
        *lookup_ptr_ = 0;
        // Only one 'jam' state, so skip it.
        first_ += dfa_alphabet_;

        for (; first_ < end_; first_ += dfa_alphabet_, ++index_)
        {
            for (second_ = first_ + dfa_alphabet_, curr_index_ = index_ + 1;
                second_ < end_; second_ += dfa_alphabet_, ++curr_index_)
            {
                if (index_set_.find (curr_index_) != index_set_.end ())
                {
                    continue;
                }

                // Some systems have memcmp in namespace std.
                using namespace std;

                if (memcmp (first_, second_, sizeof (std::size_t) *
                    dfa_alphabet_) == 0)
                {
                    index_set_.insert (curr_index_);
                    lookup_ptr_[curr_index_] = new_index_;
                }
            }

            if (lookup_ptr_[index_] == null_token)
            {
                lookup_ptr_[index_] = new_index_;
                ++new_index_;
            }
        }

        if (!index_set_.empty ())
        {
            const std::size_t *front_ = &dfa_.front ();
            size_t_vector new_dfa_ (front_, front_ + dfa_alphabet_);
            typename index_set::iterator set_end_ = index_set_.end ();
            const std::size_t *ptr_ = front_ + dfa_alphabet_;
            std::size_t *new_ptr_ = 0;

            new_dfa_.resize (size_ - index_set_.size () * dfa_alphabet_, 0);
            new_ptr_ = &new_dfa_.front () + dfa_alphabet_;
            size_ /= dfa_alphabet_;

            for (index_ = 1; index_ < size_; ++index_)
            {
                if (index_set_.find (index_) != set_end_)
                {
                    ptr_ += dfa_alphabet_;
                    continue;
                }

                new_ptr_[end_state_index] = ptr_[end_state_index];
                new_ptr_[id_index] = ptr_[id_index];
                new_ptr_[unique_id_index] = ptr_[unique_id_index];
                new_ptr_[state_index] = ptr_[state_index];
                new_ptr_[bol_index] = lookup_ptr_[ptr_[bol_index]];
                new_ptr_[eol_index] = lookup_ptr_[ptr_[eol_index]];
                new_ptr_ += dfa_offset;
                ptr_ += dfa_offset;

                for (std::size_t i_ = dfa_offset; i_ < dfa_alphabet_; ++i_)
                {
                    *new_ptr_++ = lookup_ptr_[*ptr_++];
                }
            }

            dfa_.swap (new_dfa_);
        }
    }
};

typedef basic_generator<char> generator;
typedef basic_generator<wchar_t> wgenerator;
}
}

#endif