summaryrefslogtreecommitdiffstats
blob: 370ab67d2eae1004edf2c84da639c727f2c0b458 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
/*=============================================================================
    Copyright (c) 2001-2011 Joel de Guzman

    Distributed under the Boost Software License, Version 1.0. (See accompanying
    file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

    Autogenerated by MultiStageTable.py (Unicode multi-stage
    table builder) (c) Peter Kankowski, 2008
==============================================================================*/
#if !defined(BOOST_SPIRIT_UNICODE_QUERY_FEBRUARY_2_2010)
#define BOOST_SPIRIT_UNICODE_QUERY_FEBRUARY_2_2010

#include <boost/cstdint.hpp>

# include "category_table.hpp"
# include "script_table.hpp"
# include "lowercase_table.hpp"
# include "uppercase_table.hpp"

namespace boost { namespace spirit { namespace ucd
{
    // This header provides Basic (Level 1) Unicode Support
    // See http://unicode.org/reports/tr18/ for details

    // Holder for the enumerations used by the Unicode query functions
    // in this header.
    struct properties
    {
        // A general-category value occupies six bits, laid out as MMMCCC:
        //   MMM (bits 3-5): the major_category
        //   CCC (bits 0-2): index of the category inside that major category
        // Shifting a category right by three bits therefore yields its
        // major_category.

        enum major_category
        {
            letter      = 0,
            mark        = 1,
            number      = 2,
            separator   = 3,
            other       = 4,
            punctuation = 5,
            symbol      = 6
        };

        enum category
        {
            // letter (major category 0)
            uppercase_letter      = 0,  // [Lu] an uppercase letter
            lowercase_letter      = 1,  // [Ll] a lowercase letter
            titlecase_letter      = 2,  // [Lt] a digraphic character, with first part uppercase
            modifier_letter       = 3,  // [Lm] a modifier letter
            other_letter          = 4,  // [Lo] other letters, including syllables and ideographs

            // mark (major category 1)
            nonspacing_mark       = 8,  // [Mn] a nonspacing combining mark (zero advance width)
            enclosing_mark        = 9,  // [Me] an enclosing combining mark
            spacing_mark          = 10, // [Mc] a spacing combining mark (positive advance width)

            // number (major category 2)
            decimal_number        = 16, // [Nd] a decimal digit
            letter_number         = 17, // [Nl] a letterlike numeric character
            other_number          = 18, // [No] a numeric character of other type

            // separator (major category 3)
            space_separator       = 24, // [Zs] a space character (of various non-zero widths)
            line_separator        = 25, // [Zl] U+2028 LINE SEPARATOR only
            paragraph_separator   = 26, // [Zp] U+2029 PARAGRAPH SEPARATOR only

            // other (major category 4)
            control               = 32, // [Cc] a C0 or C1 control code
            format                = 33, // [Cf] a format control character
            private_use           = 34, // [Co] a private-use character
            surrogate             = 35, // [Cs] a surrogate code point
            unassigned            = 36, // [Cn] a reserved unassigned code point or a noncharacter

            // punctuation (major category 5)
            dash_punctuation      = 40, // [Pd] a dash or hyphen punctuation mark
            open_punctuation      = 41, // [Ps] an opening punctuation mark (of a pair)
            close_punctuation     = 42, // [Pe] a closing punctuation mark (of a pair)
            connector_punctuation = 43, // [Pc] a connecting punctuation mark, like a tie
            other_punctuation     = 44, // [Po] a punctuation mark of other type
            initial_punctuation   = 45, // [Pi] an initial quotation mark
            final_punctuation     = 46, // [Pf] a final quotation mark

            // symbol (major category 6)
            math_symbol           = 48, // [Sm] a symbol of primarily mathematical use
            currency_symbol       = 49, // [Sc] a currency sign
            modifier_symbol       = 50, // [Sk] a non-letterlike modifier symbol
            other_symbol          = 51  // [So] a symbol of other type
        };

        // Single-bit flags located above the six category bits.
        enum derived_properties
        {
            alphabetic                   = 1 << 6,
            uppercase                    = 1 << 7,
            lowercase                    = 1 << 8,
            white_space                  = 1 << 9,
            hex_digit                    = 1 << 10,
            noncharacter_code_point      = 1 << 11,
            default_ignorable_code_point = 1 << 12
        };

        // Script identifiers. Every value fits in seven bits.
        enum script
        {
            arabic = 0,
            imperial_aramaic = 1,
            armenian = 2,
            avestan = 3,
            balinese = 4,
            bamum = 5,
            bengali = 6,
            bopomofo = 7,
            braille = 8,
            buginese = 9,
            buhid = 10,
            canadian_aboriginal = 11,
            carian = 12,
            cham = 13,
            cherokee = 14,
            coptic = 15,
            cypriot = 16,
            cyrillic = 17,
            devanagari = 18,
            deseret = 19,
            egyptian_hieroglyphs = 20,
            ethiopic = 21,
            georgian = 22,
            glagolitic = 23,
            gothic = 24,
            greek = 25,
            gujarati = 26,
            gurmukhi = 27,
            hangul = 28,
            han = 29,
            hanunoo = 30,
            hebrew = 31,
            hiragana = 32,
            katakana_or_hiragana = 33,
            old_italic = 34,
            javanese = 35,
            kayah_li = 36,
            katakana = 37,
            kharoshthi = 38,
            khmer = 39,
            kannada = 40,
            kaithi = 41,
            tai_tham = 42,
            lao = 43,
            latin = 44,
            lepcha = 45,
            limbu = 46,
            linear_b = 47,
            lisu = 48,
            lycian = 49,
            lydian = 50,
            malayalam = 51,
            mongolian = 52,
            meetei_mayek = 53,
            myanmar = 54,
            nko = 55,
            ogham = 56,
            ol_chiki = 57,
            old_turkic = 58,
            oriya = 59,
            osmanya = 60,
            phags_pa = 61,
            inscriptional_pahlavi = 62,
            phoenician = 63,
            inscriptional_parthian = 64,
            rejang = 65,
            runic = 66,
            samaritan = 67,
            old_south_arabian = 68,
            saurashtra = 69,
            shavian = 70,
            sinhala = 71,
            sundanese = 72,
            syloti_nagri = 73,
            syriac = 74,
            tagbanwa = 75,
            tai_le = 76,
            new_tai_lue = 77,
            tamil = 78,
            tai_viet = 79,
            telugu = 80,
            tifinagh = 81,
            tagalog = 82,
            thaana = 83,
            thai = 84,
            tibetan = 85,
            ugaritic = 86,
            vai = 87,
            old_persian = 88,
            cuneiform = 89,
            yi = 90,
            inherited = 91,
            common = 92,
            unknown = 93
        };
    };

    // Returns the general category of ch, taken from the low six bits of
    // the category table entry.
    // NOTE(review): ch is forwarded unchecked; whether
    // detail::category_lookup accepts values above U+10FFFF is not
    // visible from this header -- confirm.
    inline properties::category get_category(::boost::uint32_t ch)
    {
        int const category_mask = 0x3F;
        return static_cast<properties::category>(
            detail::category_lookup(ch) & category_mask);
    }

    // Returns the major category (letter, mark, number, ...) of ch by
    // discarding the three low bits of its general category value.
    inline properties::major_category get_major_category(::boost::uint32_t ch)
    {
        int const major_shift = 3;
        return static_cast<properties::major_category>(
            get_category(ch) >> major_shift);
    }

    // True when ch belongs to the punctuation major category.
    inline bool is_punctuation(::boost::uint32_t ch)
    {
        properties::major_category const major = get_major_category(ch);
        return major == properties::punctuation;
    }

    // True when the general category of ch is decimal_number ([Nd]).
    inline bool is_decimal_number(::boost::uint32_t ch)
    {
        properties::category const cat = get_category(ch);
        return cat == properties::decimal_number;
    }

    // True when the hex_digit flag is set in the category table entry
    // for ch.
    inline bool is_hex_digit(::boost::uint32_t ch)
    {
        int const flag = properties::hex_digit;
        return (detail::category_lookup(ch) & flag) != 0;
    }

    // True when the general category of ch is control ([Cc]).
    inline bool is_control(::boost::uint32_t ch)
    {
        properties::category const cat = get_category(ch);
        return cat == properties::control;
    }

    // True when the alphabetic flag is set in the category table entry
    // for ch.
    inline bool is_alphabetic(::boost::uint32_t ch)
    {
        int const flag = properties::alphabetic;
        return (detail::category_lookup(ch) & flag) != 0;
    }

    // True when ch is either a decimal number or alphabetic.
    inline bool is_alphanumeric(::boost::uint32_t ch)
    {
        if (is_decimal_number(ch))
            return true;
        return is_alphabetic(ch);
    }

    // True when the uppercase flag is set in the category table entry
    // for ch.
    inline bool is_uppercase(::boost::uint32_t ch)
    {
        int const flag = properties::uppercase;
        return (detail::category_lookup(ch) & flag) != 0;
    }

    // True when the lowercase flag is set in the category table entry
    // for ch.
    inline bool is_lowercase(::boost::uint32_t ch)
    {
        int const flag = properties::lowercase;
        return (detail::category_lookup(ch) & flag) != 0;
    }

    // True when the white_space flag is set in the category table entry
    // for ch.
    inline bool is_white_space(::boost::uint32_t ch)
    {
        int const flag = properties::white_space;
        return (detail::category_lookup(ch) & flag) != 0;
    }

    // Tests whether ch is "blank": white space that does not terminate a
    // line. This follows the POSIX-compatible definition of \p{blank}
    // in UTS #18 (Annex C):
    //
    //   \p{Whitespace} -- [LF VT FF CR NEL \p{Zl} \p{Zp}]
    //
    // Returns true for blank code points, false for everything else.
    inline bool is_blank(::boost::uint32_t ch)
    {
        switch (ch)
        {
            // Line-terminating controls. They are written as code points
            // rather than character literals so the comparison does not
            // depend on the execution character set.
            case 0x000A:    // LINE FEED
            case 0x000B:    // LINE TABULATION
            case 0x000C:    // FORM FEED
            case 0x000D:    // CARRIAGE RETURN
            case 0x0085:    // NEXT LINE (NEL): White_Space, but ends a line
                return false;
            default:
                break;
        }

        if (!is_white_space(ch))
            return false;

        // U+2028 and U+2029 are white space but are line/paragraph
        // terminators, hence not blank.
        properties::category const cat = get_category(ch);
        return cat != properties::line_separator
            && cat != properties::paragraph_separator;
    }

    // True when ch is not white space and its general category is none
    // of control, surrogate or unassigned.
    inline bool is_graph(::boost::uint32_t ch)
    {
        if (is_white_space(ch))
            return false;

        switch (get_category(ch))
        {
            case properties::control:
            case properties::surrogate:
            case properties::unassigned:
                return false;
            default:
                return true;
        }
    }

    // True when ch is graphic or blank, and is not a control code.
    inline bool is_print(::boost::uint32_t ch)
    {
        // Control codes are never printable, even the blank ones.
        if (is_control(ch))
            return false;
        return is_graph(ch) || is_blank(ch);
    }

    // True when the noncharacter_code_point flag is set in the category
    // table entry for ch.
    inline bool is_noncharacter_code_point(::boost::uint32_t ch)
    {
        int const flag = properties::noncharacter_code_point;
        return (detail::category_lookup(ch) & flag) != 0;
    }

    // True when the default_ignorable_code_point flag is set in the
    // category table entry for ch.
    inline bool is_default_ignorable_code_point(::boost::uint32_t ch)
    {
        int const flag = properties::default_ignorable_code_point;
        return (detail::category_lookup(ch) & flag) != 0;
    }

    // Returns the script of ch, taken from the low seven bits of the
    // script table entry.
    inline properties::script get_script(::boost::uint32_t ch)
    {
        int const script_mask = 0x7F;
        return static_cast<properties::script>(
            detail::script_lookup(ch) & script_mask);
    }

    // Returns the lowercase mapping of ch, or ch itself when the table
    // has no mapping for it.
    inline ::boost::uint32_t to_lowercase(::boost::uint32_t ch)
    {
        ::boost::uint32_t const mapped = detail::lowercase_lookup(ch);

        // A table entry of 0 means "this code maps to itself".
        if (mapped != 0)
            return mapped;
        return ch;
    }

    // Returns the uppercase mapping of ch, or ch itself when the table
    // has no mapping for it.
    inline ::boost::uint32_t to_uppercase(::boost::uint32_t ch)
    {
        ::boost::uint32_t const mapped = detail::uppercase_lookup(ch);

        // A table entry of 0 means "this code maps to itself".
        if (mapped != 0)
            return mapped;
        return ch;
    }
}}}

#endif