1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
|
/*
*
* Copyright (c) 1998-2002
* John Maddock
*
* Use, modification and distribution are subject to the
* Boost Software License, Version 1.0. (See accompanying file
* LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
*
*/
/*
* LOCATION: see http://www.boost.org for most recent version.
* FILE states.cpp
* VERSION see <boost/version.hpp>
* DESCRIPTION: Declares internal state machine structures.
*/
#ifndef BOOST_REGEX_V4_STATES_HPP
#define BOOST_REGEX_V4_STATES_HPP
#ifdef BOOST_MSVC
#pragma warning(push)
#pragma warning(disable: 4103)
#endif
#ifdef BOOST_HAS_ABI_HEADERS
# include BOOST_ABI_PREFIX
#endif
#ifdef BOOST_MSVC
#pragma warning(pop)
#endif
namespace boost{
namespace re_detail{
/*** mask_type *******************************************************
Whenever we have a choice of two alternatives, we use an array of bytes
to indicate which of the two alternatives it is possible to take for any
given input character. If mask_take is set, then we can take the next
state, and if mask_skip is set then we can take the alternative.
***********************************************************************/
enum mask_type
{
mask_take = 1,
mask_skip = 2,
mask_init = 4,
mask_any = mask_skip | mask_take,
mask_all = mask_any
};
/*** helpers **********************************************************
These helpers let us use function overload resolution to detect whether
we have narrow or wide character strings:
***********************************************************************/
struct _narrow_type{};
struct _wide_type{};
template <class charT> struct is_byte;
template<> struct is_byte<char> { typedef _narrow_type width_type; };
template<> struct is_byte<unsigned char>{ typedef _narrow_type width_type; };
template<> struct is_byte<signed char> { typedef _narrow_type width_type; };
template <class charT> struct is_byte { typedef _wide_type width_type; };
/*** enum syntax_element_type ******************************************
Every record in the state machine falls into one of the following types:
***********************************************************************/
enum syntax_element_type
{
// start of a marked sub-expression, or perl-style (?...) extension
syntax_element_startmark = 0,
// end of a marked sub-expression, or perl-style (?...) extension
syntax_element_endmark = syntax_element_startmark + 1,
// any sequence of literal characters
syntax_element_literal = syntax_element_endmark + 1,
// start of line assertion: ^
syntax_element_start_line = syntax_element_literal + 1,
// end of line assertion $
syntax_element_end_line = syntax_element_start_line + 1,
// match any character: .
syntax_element_wild = syntax_element_end_line + 1,
// end of expression: we have a match when we get here
syntax_element_match = syntax_element_wild + 1,
// perl style word boundary: \b
syntax_element_word_boundary = syntax_element_match + 1,
// perl style within word boundary: \B
syntax_element_within_word = syntax_element_word_boundary + 1,
// start of word assertion: \<
syntax_element_word_start = syntax_element_within_word + 1,
// end of word assertion: \>
syntax_element_word_end = syntax_element_word_start + 1,
// start of buffer assertion: \`
syntax_element_buffer_start = syntax_element_word_end + 1,
// end of buffer assertion: \'
syntax_element_buffer_end = syntax_element_buffer_start + 1,
// backreference to previously matched sub-expression
syntax_element_backref = syntax_element_buffer_end + 1,
// either a wide character set [..] or one with multicharacter collating elements:
syntax_element_long_set = syntax_element_backref + 1,
// narrow character set: [...]
syntax_element_set = syntax_element_long_set + 1,
// jump to a new state in the machine:
syntax_element_jump = syntax_element_set + 1,
// choose between two production states:
syntax_element_alt = syntax_element_jump + 1,
// a repeat
syntax_element_rep = syntax_element_alt + 1,
// match a combining character sequence
syntax_element_combining = syntax_element_rep + 1,
// perl style soft buffer end: \z
syntax_element_soft_buffer_end = syntax_element_combining + 1,
// perl style continuation: \G
syntax_element_restart_continue = syntax_element_soft_buffer_end + 1,
// single character repeats:
syntax_element_dot_rep = syntax_element_restart_continue + 1,
syntax_element_char_rep = syntax_element_dot_rep + 1,
syntax_element_short_set_rep = syntax_element_char_rep + 1,
syntax_element_long_set_rep = syntax_element_short_set_rep + 1,
// a backstep for lookbehind repeats:
syntax_element_backstep = syntax_element_long_set_rep + 1,
// an assertion that a mark was matched:
syntax_element_assert_backref = syntax_element_backstep + 1,
syntax_element_toggle_case = syntax_element_assert_backref + 1
};
#ifdef BOOST_REGEX_DEBUG
// dwa 09/26/00 - This is needed to suppress warnings about an ambiguous conversion
std::ostream& operator<<(std::ostream&, syntax_element_type);
#endif
struct re_syntax_base;
/*** union offset_type ************************************************
Points to another state in the machine. During machine construction
we use integral offsets, but these are converted to pointers before
execution of the machine.
***********************************************************************/
union offset_type
{
re_syntax_base* p;
std::ptrdiff_t i;
};
/*** struct re_syntax_base ********************************************
Base class for all states in the machine.
***********************************************************************/
struct re_syntax_base
{
syntax_element_type type; // what kind of state this is
offset_type next; // next state in the machine
};
/*** struct re_brace **************************************************
A marked parenthesis.
***********************************************************************/
struct re_brace : public re_syntax_base
{
// The index to match, can be zero (don't mark the sub-expression)
// or negative (for perl style (?...) extentions):
int index;
};
/*** struct re_dot **************************************************
Match anything.
***********************************************************************/
enum
{
dont_care = 1,
force_not_newline = 0,
force_newline = 2,
test_not_newline = 2,
test_newline = 3
};
struct re_dot : public re_syntax_base
{
unsigned char mask;
};
/*** struct re_literal ************************************************
A string of literals, following this structure will be an
array of characters: charT[length]
***********************************************************************/
struct re_literal : public re_syntax_base
{
unsigned int length;
};
/*** struct re_case ************************************************
Indicates whether we are moving to a case insensive block or not
***********************************************************************/
struct re_case : public re_syntax_base
{
bool icase;
};
/*** struct re_set_long ***********************************************
A wide character set of characters, following this structure will be
an array of type charT:
First csingles null-terminated strings
Then 2 * cranges NULL terminated strings
Then cequivalents NULL terminated strings
***********************************************************************/
template <class mask_type>
struct re_set_long : public re_syntax_base
{
unsigned int csingles, cranges, cequivalents;
mask_type cclasses;
mask_type cnclasses;
bool isnot;
bool singleton;
};
/*** struct re_set ****************************************************
A set of narrow-characters, matches any of _map which is none-zero
***********************************************************************/
struct re_set : public re_syntax_base
{
unsigned char _map[1 << CHAR_BIT];
};
/*** struct re_jump ***************************************************
Jump to a new location in the machine (not next).
***********************************************************************/
struct re_jump : public re_syntax_base
{
offset_type alt; // location to jump to
};
/*** struct re_alt ***************************************************
Jump to a new location in the machine (possibly next).
***********************************************************************/
struct re_alt : public re_jump
{
unsigned char _map[1 << CHAR_BIT]; // which characters can take the jump
unsigned int can_be_null; // true if we match a NULL string
};
/*** struct re_repeat *************************************************
Repeat a section of the machine
***********************************************************************/
struct re_repeat : public re_alt
{
std::size_t min, max; // min and max allowable repeats
int state_id; // Unique identifier for this repeat
bool leading; // True if this repeat is at the start of the machine (lets us optimize some searches)
bool greedy; // True if this is a greedy repeat
};
/*** enum re_jump_size_type *******************************************
Provides compiled size of re_jump structure (allowing for trailing alignment).
We provide this so we know how manybytes to insert when constructing the machine
(The value of padding_mask is defined in regex_raw_buffer.hpp).
***********************************************************************/
enum re_jump_size_type
{
re_jump_size = (sizeof(re_jump) + padding_mask) & ~(padding_mask),
re_repeater_size = (sizeof(re_repeat) + padding_mask) & ~(padding_mask),
re_alt_size = (sizeof(re_alt) + padding_mask) & ~(padding_mask)
};
/*** proc re_is_set_member *********************************************
Forward declaration: we'll need this one later...
***********************************************************************/
template<class charT, class traits>
struct regex_data;
template <class iterator, class charT, class traits_type, class char_classT>
iterator BOOST_REGEX_CALL re_is_set_member(iterator next,
iterator last,
const re_set_long<char_classT>* set_,
const regex_data<charT, traits_type>& e, bool icase);
} // namespace re_detail
} // namespace boost
#ifdef BOOST_MSVC
#pragma warning(push)
#pragma warning(disable: 4103)
#endif
#ifdef BOOST_HAS_ABI_HEADERS
# include BOOST_ABI_SUFFIX
#endif
#ifdef BOOST_MSVC
#pragma warning(pop)
#endif
#endif
|