summaryrefslogtreecommitdiffstats
blob: 0f8af55d0dacd302843340a308a69e2f4c45650c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
//  Copyright (c) 2001-2011 Hartmut Kaiser
//
//  Distributed under the Boost Software License, Version 1.0. (See accompanying
//  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

#if !defined(BOOST_SPIRIT_LEX_LEXER_MAR_17_2007_0139PM)
#define BOOST_SPIRIT_LEX_LEXER_MAR_17_2007_0139PM

#if defined(_MSC_VER)
#pragma once
#endif

#include <iosfwd>

#include <boost/spirit/home/support/detail/lexer/generator.hpp>
#include <boost/spirit/home/support/detail/lexer/rules.hpp>
#include <boost/spirit/home/support/detail/lexer/consts.hpp>
#include <boost/spirit/home/support/unused.hpp>

#include <boost/spirit/home/lex/lexer/lexertl/token.hpp>
#include <boost/spirit/home/lex/lexer/lexertl/functor.hpp>
#include <boost/spirit/home/lex/lexer/lexertl/functor_data.hpp>
#include <boost/spirit/home/lex/lexer/lexertl/iterator.hpp>
#if defined(BOOST_SPIRIT_LEXERTL_DEBUG)
#include <boost/spirit/home/support/detail/lexer/debug.hpp>
#endif

#include <boost/foreach.hpp>

namespace boost { namespace spirit { namespace lex { namespace lexertl
{
    ///////////////////////////////////////////////////////////////////////////
    namespace detail
    {
        ///////////////////////////////////////////////////////////////////////
        //  must_escape tells whether a character has a special meaning inside
        //  a regular expression, i.e. whether it has to be prefixed with a
        //  backslash before it can be used to match itself literally.
        //
        //  Returns true for any of:  + / * ? | ( ) [ ] { } . ^ $ \ "
        //  and false for every other character value.
        ///////////////////////////////////////////////////////////////////////
        template <typename Char>
        inline bool must_escape(Char c)
        {
            // FIXME: more needed?
            return c == '+' || c == '/' || c == '*' || c == '?'
                || c == '|'
                || c == '(' || c == ')'
                || c == '[' || c == ']'
                || c == '{' || c == '}'
                || c == '.'
                || c == '^' || c == '$'
                || c == '\\'
                || c == '"';
        }

        ///////////////////////////////////////////////////////////////////////
        //  escape turns a single character into a string that can be placed
        //  into a regular expression definition: the result is the character
        //  itself, preceded by a backslash whenever detail::must_escape()
        //  reports that the character needs one.
        ///////////////////////////////////////////////////////////////////////
        template <typename Char>
        inline std::basic_string<Char> escape(Char ch)
        {
            std::basic_string<Char> text;

            // emit the backslash first, then the character itself
            if (detail::must_escape(ch))
                text.push_back('\\');
            text.push_back(ch);

            return text;
        }

        ///////////////////////////////////////////////////////////////////////
        //  map_flags translates a bit mask made of match_flags values into
        //  the equivalent boost::lexer::regex_flags bit mask. Only
        //  match_not_dot_newline and match_icase are looked at; all other
        //  bits of the argument are ignored.
        ///////////////////////////////////////////////////////////////////////
        inline boost::lexer::regex_flags map_flags(unsigned int flags)
        {
            // contribution of each recognized input bit (zero if not set)
            unsigned int const newline_bits =
                (flags & match_flags::match_not_dot_newline)
              ? static_cast<unsigned int>(boost::lexer::dot_not_newline)
              : 0u;
            unsigned int const icase_bits =
                (flags & match_flags::match_icase)
              ? static_cast<unsigned int>(boost::lexer::icase)
              : 0u;

            return boost::lexer::regex_flags(
                boost::lexer::none | newline_bits | icase_bits);
        }
    }

    ///////////////////////////////////////////////////////////////////////////
    //  Forward declaration of generate_static. It is only declared here, not
    //  defined; the lexer class below names this function template as a
    //  friend so that it can access the lexer's private members directly.
    ///////////////////////////////////////////////////////////////////////////
    template <typename Lexer, typename F>
    bool generate_static(Lexer const&
      , std::basic_ostream<typename Lexer::char_type>&
      , typename Lexer::char_type const*, F);

    ///////////////////////////////////////////////////////////////////////////
    //
    //  Every lexer type to be used as a lexer for Spirit has to conform to
    //  the following public interface:
    //
    //    typedefs:
    //        iterator_type   The type of the iterator exposed by this lexer.
    //        token_type      The type of the tokens returned from the exposed
    //                        iterators.
    //
    //    functions:
    //        default constructor
    //                        Since lexers are instantiated as base classes
    //                        only it might be a good idea to make this
    //                        constructor protected.
    //        begin, end      Return a pair of iterators, when dereferenced
    //                        returning the sequence of tokens recognized in
    //                        the input stream given as the parameters to the
    //                        begin() function.
    //        add_token       Should add the definition of a token to be
    //                        recognized by this lexer.
    //        clear           Should delete all current token definitions
    //                        associated with the given state of this lexer
    //                        object.
    //
    //    template parameters:
    //        Iterator        The type of the iterator used to access the
    //                        underlying character stream.
    //        Token           The type of the tokens to be returned from the
    //                        exposed token iterator.
    //        Functor         The type of the InputPolicy to use to instantiate
    //                        the multi_pass iterator type to be used as the
    //                        token iterator (returned from begin()/end()).
    //
    ///////////////////////////////////////////////////////////////////////////

    ///////////////////////////////////////////////////////////////////////////
    //
    //  The lexer class is an implementation of a Spirit.Lex lexer on
    //  top of Ben Hanson's lexertl library as outlined above (For more
    //  information about lexertl go here: http://www.benhanson.net/lexertl.html).
    //
    //  This class is supposed to be used as the first and only template
    //  parameter while instantiating instances of a lex::lexer class.
    //
    ///////////////////////////////////////////////////////////////////////////
    template <typename Token = token<>
      , typename Iterator = typename Token::iterator_type
      , typename Functor = functor<Token, lexertl::detail::data, Iterator> >
    class lexer
    {
    private:
        struct dummy { void true_() {} };
        typedef void (dummy::*safe_bool)();

        static std::size_t const all_states_id = static_cast<std::size_t>(-2);

    public:
        operator safe_bool() const
            { return initialized_dfa_ ? &dummy::true_ : 0; }

        typedef typename boost::detail::iterator_traits<Iterator>::value_type
            char_type;
        typedef std::basic_string<char_type> string_type;

        typedef boost::lexer::basic_rules<char_type> basic_rules_type;

        //  Every lexer type to be used as a lexer for Spirit has to conform to
        //  a public interface .
        typedef Token token_type;
        typedef typename Token::id_type id_type;
        typedef iterator<Functor> iterator_type;

    private:
        // this type is purely used for the iterator_type construction below
        struct iterator_data_type
        {
            typedef typename Functor::semantic_actions_type semantic_actions_type;

            iterator_data_type(
                    boost::lexer::basic_state_machine<char_type> const& sm
                  , boost::lexer::basic_rules<char_type> const& rules
                  , semantic_actions_type const& actions)
              : state_machine_(sm), rules_(rules), actions_(actions)
            {}

            boost::lexer::basic_state_machine<char_type> const& state_machine_;
            boost::lexer::basic_rules<char_type> const& rules_;
            semantic_actions_type const& actions_;

        private:
            // silence MSVC warning C4512: assignment operator could not be generated
            iterator_data_type& operator= (iterator_data_type const&);
        };

    public:
        //  Return the start iterator usable for iterating over the generated
        //  tokens.
        iterator_type begin(Iterator& first, Iterator const& last
          , char_type const* initial_state = 0) const
        {
            if (!init_dfa())    // never minimize DFA for dynamic lexers
                return iterator_type();

            iterator_data_type iterator_data(state_machine_, rules_, actions_);
            return iterator_type(iterator_data, first, last, initial_state);
        }

        //  Return the end iterator usable to stop iterating over the generated
        //  tokens.
        iterator_type end() const
        {
            return iterator_type();
        }

    protected:
        //  Lexer instances can be created by means of a derived class only.
        lexer(unsigned int flags)
          : flags_(detail::map_flags(flags))
          , rules_(flags_)
          , initialized_dfa_(false)
        {}

    public:
        // interface for token definition management
        std::size_t add_token(char_type const* state, char_type tokendef,
            std::size_t token_id, char_type const* targetstate)
        {
            add_state(state);
            initialized_dfa_ = false;
            if (state == all_states())
                return rules_.add(state, detail::escape(tokendef), token_id, rules_.dot());

            if (0 == targetstate)
                targetstate = state;
            else
                add_state(targetstate);
            return rules_.add(state, detail::escape(tokendef), token_id, targetstate);
        }
        std::size_t add_token(char_type const* state, string_type const& tokendef,
            std::size_t token_id, char_type const* targetstate)
        {
            add_state(state);
            initialized_dfa_ = false;
            if (state == all_states())
                return rules_.add(state, tokendef, token_id, rules_.dot());

            if (0 == targetstate)
                targetstate = state;
            else
                add_state(targetstate);
            return rules_.add(state, tokendef, token_id, targetstate);
        }

        // interface for pattern definition management
        void add_pattern (char_type const* state, string_type const& name,
            string_type const& patterndef)
        {
            add_state(state);
            rules_.add_macro(name.c_str(), patterndef);
            initialized_dfa_ = false;
        }

        boost::lexer::rules const& get_rules() const { return rules_; }

        void clear(char_type const* state)
        {
            std::size_t s = rules_.state(state);
            if (boost::lexer::npos != s)
                rules_.clear(state);
            initialized_dfa_ = false;
        }
        std::size_t add_state(char_type const* state)
        {
            if (state == all_states())
                return all_states_id;

            std::size_t stateid = rules_.state(state);
            if (boost::lexer::npos == stateid) {
                stateid = rules_.add_state(state);
                initialized_dfa_ = false;
            }
            return stateid;
        }
        string_type initial_state() const
        {
            return string_type(rules_.initial());
        }
        string_type all_states() const
        {
            return string_type(rules_.all_states());
        }

        //  Register a semantic action with the given id
        template <typename F>
        void add_action(std::size_t unique_id, std::size_t state, F act)
        {
            // If you see an error here stating add_action is not a member of
            // fusion::unused_type then you are probably having semantic actions
            // attached to at least one token in the lexer definition without
            // using the lex::lexertl::actor_lexer<> as its base class.
            typedef typename Functor::wrap_action_type wrapper_type;
            if (state == all_states_id) {
                // add the action to all known states
                typedef typename
                    basic_rules_type::string_size_t_map::value_type
                state_type;

                std::size_t states = rules_.statemap().size();
                BOOST_FOREACH(state_type const& s, rules_.statemap()) {
                    for (std::size_t j = 0; j < states; ++j)
                        actions_.add_action(unique_id + j, s.second, wrapper_type::call(act));
                }
            }
            else {
                actions_.add_action(unique_id, state, wrapper_type::call(act));
            }
        }
//         template <typename F>
//         void add_action(std::size_t unique_id, char_type const* state, F act)
//         {
//             typedef typename Functor::wrap_action_type wrapper_type;
//             actions_.add_action(unique_id, add_state(state), wrapper_type::call(act));
//         }

        // We do not minimize the state machine by default anymore because
        // Ben said: "If you can afford to generate a lexer at runtime, there
        //            is little point in calling minimise."
        // Go figure.
        bool init_dfa(bool minimize = false) const
        {
            if (!initialized_dfa_) {
                state_machine_.clear();
                typedef boost::lexer::basic_generator<char_type> generator;
                generator::build (rules_, state_machine_);
                if (minimize)
                    generator::minimise (state_machine_);

#if defined(BOOST_SPIRIT_LEXERTL_DEBUG)
                boost::lexer::debug::dump(state_machine_, std::cerr);
#endif
                initialized_dfa_ = true;

//                 // release memory held by rules description
//                 basic_rules_type rules;
//                 rules.init_state_info(rules_);        // preserve states
//                 std::swap(rules, rules_);
            }
            return true;
        }

    private:
        // lexertl specific data
        mutable boost::lexer::basic_state_machine<char_type> state_machine_;
        boost::lexer::regex_flags flags_;
        /*mutable*/ basic_rules_type rules_;

        typename Functor::semantic_actions_type actions_;
        mutable bool initialized_dfa_;

        // generator functions must be able to access members directly
        template <typename Lexer, typename F>
        friend bool generate_static(Lexer const&
          , std::basic_ostream<typename Lexer::char_type>&
          , typename Lexer::char_type const*, F);
    };

    ///////////////////////////////////////////////////////////////////////////
    //
    //  The actor_lexer class is another implementation of a Spirit.Lex
    //  lexer on top of Ben Hanson's lexertl library as outlined above (For
    //  more information about lexertl go here:
    //  http://www.benhanson.net/lexertl.html).
    //
    //  The only difference to the lexer class above is that token_def
    //  definitions may have semantic (lexer) actions attached while being
    //  defined:
    //
    //      int w;
    //      token_def word = "[^ \t\n]+";
    //      self = word[++ref(w)];        // see example: word_count_lexer
    //
    //  This class is supposed to be used as the first and only template
    //  parameter while instantiating instances of a lex::lexer class.
    //
    ///////////////////////////////////////////////////////////////////////////
    // NOTE(review): compared to the defaults of the lexer class template, the
    // default Functor here passes an additional mpl::true_ argument;
    // presumably this is what enables support for semantic actions - confirm
    // against the functor class template.
    template <typename Token = token<>
      , typename Iterator = typename Token::iterator_type
      , typename Functor = functor<Token, lexertl::detail::data, Iterator, mpl::true_> >
    class actor_lexer : public lexer<Token, Iterator, Functor>
    {
    protected:
        //  Lexer instances can be created by means of a derived class only.
        //  The flags argument is forwarded unchanged to the lexer base class.
        actor_lexer(unsigned int flags)
          : lexer<Token, Iterator, Functor>(flags) {}
    };

}}}}

#endif