summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
Diffstat (limited to '3rdParty/LibIDN/src/nfkc.c')
-rw-r--r--3rdParty/LibIDN/src/nfkc.c432
1 files changed, 244 insertions, 188 deletions
diff --git a/3rdParty/LibIDN/src/nfkc.c b/3rdParty/LibIDN/src/nfkc.c
index 621f749..4992074 100644
--- a/3rdParty/LibIDN/src/nfkc.c
+++ b/3rdParty/LibIDN/src/nfkc.c
@@ -1,42 +1,43 @@
1/* nfkc.c --- Unicode normalization utilities. 1/* nfkc.c --- Unicode normalization utilities.
2 * Copyright (C) 2002, 2003, 2004, 2006, 2007 Simon Josefsson 2 Copyright (C) 2002-2015 Simon Josefsson
3 * 3
4 * This file is part of GNU Libidn. 4 This file is part of GNU Libidn.
5 * 5
6 * GNU Libidn is free software; you can redistribute it and/or 6 GNU Libidn is free software: you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public 7 modify it under the terms of either:
8 * License as published by the Free Software Foundation; either 8
9 * version 2.1 of the License, or (at your option) any later version. 9 * the GNU Lesser General Public License as published by the Free
10 * 10 Software Foundation; either version 3 of the License, or (at
11 * GNU Libidn is distributed in the hope that it will be useful, 11 your option) any later version.
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 or
14 * Lesser General Public License for more details. 14
15 * 15 * the GNU General Public License as published by the Free
16 * You should have received a copy of the GNU Lesser General Public 16 Software Foundation; either version 2 of the License, or (at
17 * License along with GNU Libidn; if not, write to the Free Software 17 your option) any later version.
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA 18
19 * 19 or both in parallel, as here.
20 */ 20
21 GNU Libidn is distributed in the hope that it will be useful,
22 but WITHOUT ANY WARRANTY; without even the implied warranty of
23 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
24 General Public License for more details.
25
26 You should have received copies of the GNU General Public License and
27 the GNU Lesser General Public License along with this program. If
28 not, see <http://www.gnu.org/licenses/>. */
21 29
22#ifdef HAVE_CONFIG_H 30#ifdef HAVE_CONFIG_H
23# include "config.h" 31#include "config.h"
24#endif 32#endif
25 33
26#include <stdlib.h> 34#include <stdlib.h>
27#include <string.h> 35#include <string.h>
28 36
29#include "stringprep.h" 37#include "stringprep.h"
30 38
31/* This file contains functions from GLIB, including gutf8.c and
32 * gunidecomp.c, all licensed under LGPL and copyright hold by:
33 *
34 * Copyright (C) 1999, 2000 Tom Tromey
35 * Copyright 2000 Red Hat, Inc.
36 */
37
38/* Hacks to make syncing with GLIB code easier. */ 39/* Hacks to make syncing with GLIB code easier. */
39#define gboolean int 40#define gboolean int
40#define gchar char 41#define gchar char
41#define guchar unsigned char 42#define guchar unsigned char
42#define glong long 43#define glong long
@@ -48,33 +49,71 @@
48#define gunichar uint32_t 49#define gunichar uint32_t
49#define gsize size_t 50#define gsize size_t
50#define gssize ssize_t 51#define gssize ssize_t
51#define g_malloc malloc 52#define g_malloc malloc
52#define g_free free 53#define g_free free
53#define GError void 54#define g_return_val_if_fail(expr,val) { \
54#define g_set_error(a,b,c,d) ((void) 0) 55 if (!(expr)) \
55#define g_new(struct_type, n_structs) \ 56 return (val); \
56 ((struct_type *) g_malloc (((gsize) sizeof (struct_type)) * ((gsize) (n_structs)))) 57 }
57# if defined (__GNUC__) && !defined (__STRICT_ANSI__) && !defined (__cplusplus) 58
58# define G_STMT_START (void)( 59/* Code from GLIB gmacros.h starts here. */
59# define G_STMT_END ) 60
60# else 61/* GLIB - Library of useful routines for C programming
61# if (defined (sun) || defined (__sun__)) 62 * Copyright (C) 1995-1997 Peter Mattis, Spencer Kimball and Josh MacDonald
62# define G_STMT_START if (1) 63 *
63# define G_STMT_END else (void)0 64 * This library is free software; you can redistribute it and/or
64# else 65 * modify it under the terms of the GNU Lesser General Public
65# define G_STMT_START do 66 * License as published by the Free Software Foundation; either
66# define G_STMT_END while (0) 67 * version 2 of the License, or (at your option) any later version.
67# endif 68 *
68# endif 69 * This library is distributed in the hope that it will be useful,
69#define g_return_val_if_fail(expr,val) G_STMT_START{ (void)0; }G_STMT_END 70 * but WITHOUT ANY WARRANTY; without even the implied warranty of
71 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
72 * Lesser General Public License for more details.
73 *
74 * You should have received a copy of the GNU Lesser General Public
75 * License along with this library; if not, write to the
76 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
77 * Boston, MA 02111-1307, USA.
78 */
79
80#ifndef FALSE
81#define FALSE (0)
82#endif
83
84#ifndef TRUE
85#define TRUE (!FALSE)
86#endif
87
70#define G_N_ELEMENTS(arr) (sizeof (arr) / sizeof ((arr)[0])) 88#define G_N_ELEMENTS(arr) (sizeof (arr) / sizeof ((arr)[0]))
71#define TRUE 1 89
72#define FALSE 0 90#define G_UNLIKELY(expr) (expr)
73 91
74/* Code from GLIB gunicode.h starts here. */ 92/* Code from GLIB gunicode.h starts here. */
75 93
94/* gunicode.h - Unicode manipulation functions
95 *
96 * Copyright (C) 1999, 2000 Tom Tromey
97 * Copyright 2000, 2005 Red Hat, Inc.
98 *
99 * The Gnome Library is free software; you can redistribute it and/or
100 * modify it under the terms of the GNU Lesser General Public License as
101 * published by the Free Software Foundation; either version 2 of the
102 * License, or (at your option) any later version.
103 *
104 * The Gnome Library is distributed in the hope that it will be useful,
105 * but WITHOUT ANY WARRANTY; without even the implied warranty of
106 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
107 * Lesser General Public License for more details.
108 *
109 * You should have received a copy of the GNU Lesser General Public
110 * License along with the Gnome Library; see the file COPYING.LIB. If not,
111 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
112 * Boston, MA 02111-1307, USA.
113 */
114
76typedef enum 115typedef enum
77{ 116{
78 G_NORMALIZE_DEFAULT, 117 G_NORMALIZE_DEFAULT,
79 G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT, 118 G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT,
80 G_NORMALIZE_DEFAULT_COMPOSE, 119 G_NORMALIZE_DEFAULT_COMPOSE,
@@ -84,12 +123,35 @@ typedef enum
84 G_NORMALIZE_ALL_COMPOSE, 123 G_NORMALIZE_ALL_COMPOSE,
85 G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE 124 G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE
86} 125}
87GNormalizeMode; 126GNormalizeMode;
88 127
128#define g_utf8_next_char(p) ((p) + g_utf8_skip[*(const guchar *)(p)])
129
89/* Code from GLIB gutf8.c starts here. */ 130/* Code from GLIB gutf8.c starts here. */
90 131
132/* gutf8.c - Operations on UTF-8 strings.
133 *
134 * Copyright (C) 1999 Tom Tromey
135 * Copyright (C) 2000 Red Hat, Inc.
136 *
137 * This library is free software; you can redistribute it and/or
138 * modify it under the terms of the GNU Lesser General Public
139 * License as published by the Free Software Foundation; either
140 * version 2 of the License, or (at your option) any later version.
141 *
142 * This library is distributed in the hope that it will be useful,
143 * but WITHOUT ANY WARRANTY; without even the implied warranty of
144 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
145 * Lesser General Public License for more details.
146 *
147 * You should have received a copy of the GNU Lesser General Public
148 * License along with this library; if not, write to the
149 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
150 * Boston, MA 02111-1307, USA.
151 */
152
91#define UTF8_COMPUTE(Char, Mask, Len) \ 153#define UTF8_COMPUTE(Char, Mask, Len) \
92 if (Char < 128) \ 154 if (Char < 128) \
93 { \ 155 { \
94 Len = 1; \ 156 Len = 1; \
95 Mask = 0x7f; \ 157 Mask = 0x7f; \
@@ -127,31 +189,23 @@ GNormalizeMode;
127 ((Char) < 0x800 ? 2 : \ 189 ((Char) < 0x800 ? 2 : \
128 ((Char) < 0x10000 ? 3 : \ 190 ((Char) < 0x10000 ? 3 : \
129 ((Char) < 0x200000 ? 4 : \ 191 ((Char) < 0x200000 ? 4 : \
130 ((Char) < 0x4000000 ? 5 : 6))))) 192 ((Char) < 0x4000000 ? 5 : 6)))))
131 193
132 194#define UTF8_GET(Result, Chars, Count, Mask, Len) \
133#define UTF8_GET(Result, Chars, Count, Mask, Len) \ 195 (Result) = (Chars)[0] & (Mask); \
134 (Result) = (Chars)[0] & (Mask); \ 196 for ((Count) = 1; (Count) < (Len); ++(Count)) \
135 for ((Count) = 1; (Count) < (Len); ++(Count)) \ 197 { \
136 { \ 198 if (((Chars)[(Count)] & 0xc0) != 0x80) \
137 if (((Chars)[(Count)] & 0xc0) != 0x80) \ 199 { \
138 { \ 200 (Result) = -1; \
139 (Result) = -1; \ 201 break; \
140 break; \ 202 } \
141 } \ 203 (Result) <<= 6; \
142 (Result) <<= 6; \ 204 (Result) |= ((Chars)[(Count)] & 0x3f); \
143 (Result) |= ((Chars)[(Count)] & 0x3f); \
144 } 205 }
145 206
146#define UNICODE_VALID(Char) \
147 ((Char) < 0x110000 && \
148 (((Char) & 0xFFFFF800) != 0xD800) && \
149 ((Char) < 0xFDD0 || (Char) > 0xFDEF) && \
150 ((Char) & 0xFFFE) != 0xFFFE)
151
152
153static const gchar utf8_skip_data[256] = { 207static const gchar utf8_skip_data[256] = {
154 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 208 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
155 1, 1, 1, 1, 1, 1, 1, 209 1, 1, 1, 1, 1, 1, 1,
156 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 210 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
157 1, 1, 1, 1, 1, 1, 1, 211 1, 1, 1, 1, 1, 1, 1,
@@ -169,21 +223,20 @@ static const gchar utf8_skip_data[256] = {
169 5, 5, 5, 6, 6, 1, 1 223 5, 5, 5, 6, 6, 1, 1
170}; 224};
171 225
172static const gchar *const g_utf8_skip = utf8_skip_data; 226static const gchar *const g_utf8_skip = utf8_skip_data;
173 227
174#define g_utf8_next_char(p) (char *)((p) + g_utf8_skip[*(guchar *)(p)])
175
176/* 228/*
177 * g_utf8_strlen: 229 * g_utf8_strlen:
178 * @p: pointer to the start of a UTF-8 encoded string. 230 * @p: pointer to the start of a UTF-8 encoded string
179 * @max: the maximum number of bytes to examine. If @max 231 * @max: the maximum number of bytes to examine. If @max
180 * is less than 0, then the string is assumed to be 232 * is less than 0, then the string is assumed to be
181 * nul-terminated. If @max is 0, @p will not be examined and 233 * nul-terminated. If @max is 0, @p will not be examined and
182 * may be %NULL. 234 * may be %NULL.
183 * 235 *
184 * Returns the length of the string in characters. 236 * Computes the length of the string in characters, not including
237 * the terminating nul character.
185 * 238 *
186 * Return value: the length of the string in characters 239 * Return value: the length of the string in characters
187 **/ 240 **/
188static glong 241static glong
189g_utf8_strlen (const gchar * p, gssize max) 242g_utf8_strlen (const gchar * p, gssize max)
@@ -214,11 +267,11 @@ g_utf8_strlen (const gchar * p, gssize max)
214 } 267 }
215 268
216 /* only do the last len increment if we got a complete 269 /* only do the last len increment if we got a complete
217 * char (don't count partial chars) 270 * char (don't count partial chars)
218 */ 271 */
219 if (p - start == max) 272 if (p - start <= max)
220 ++len; 273 ++len;
221 } 274 }
222 275
223 return len; 276 return len;
224} 277}
@@ -250,11 +303,11 @@ g_utf8_get_char (const gchar * p)
250 return result; 303 return result;
251} 304}
252 305
253/* 306/*
254 * g_unichar_to_utf8: 307 * g_unichar_to_utf8:
255 * @c: a ISO10646 character code 308 * @c: a Unicode character code
256 * @outbuf: output buffer, must have at least 6 bytes of space. 309 * @outbuf: output buffer, must have at least 6 bytes of space.
257 * If %NULL, the length will be computed and returned 310 * If %NULL, the length will be computed and returned
258 * and nothing will be written to @outbuf. 311 * and nothing will be written to @outbuf.
259 * 312 *
260 * Converts a single character to UTF-8. 313 * Converts a single character to UTF-8.
@@ -262,10 +315,11 @@ g_utf8_get_char (const gchar * p)
262 * Return value: number of bytes written 315 * Return value: number of bytes written
263 **/ 316 **/
264static int 317static int
265g_unichar_to_utf8 (gunichar c, gchar * outbuf) 318g_unichar_to_utf8 (gunichar c, gchar * outbuf)
266{ 319{
320 /* If this gets modified, also update the copy in g_string_insert_unichar() */
267 guint len = 0; 321 guint len = 0;
268 int first; 322 int first;
269 int i; 323 int i;
270 324
271 if (c < 0x80) 325 if (c < 0x80)
@@ -313,29 +367,29 @@ g_unichar_to_utf8 (gunichar c, gchar * outbuf)
313} 367}
314 368
315/* 369/*
316 * g_utf8_to_ucs4_fast: 370 * g_utf8_to_ucs4_fast:
317 * @str: a UTF-8 encoded string 371 * @str: a UTF-8 encoded string
318 * @len: the maximum length of @str to use. If @len < 0, then 372 * @len: the maximum length of @str to use, in bytes. If @len < 0,
319 * the string is nul-terminated. 373 * then the string is nul-terminated.
320 * @items_written: location to store the number of characters in the 374 * @items_written: location to store the number of characters in the
321 * result, or %NULL. 375 * result, or %NULL.
322 * 376 *
323 * Convert a string from UTF-8 to a 32-bit fixed width 377 * Convert a string from UTF-8 to a 32-bit fixed width
324 * representation as UCS-4, assuming valid UTF-8 input. 378 * representation as UCS-4, assuming valid UTF-8 input.
325 * This function is roughly twice as fast as g_utf8_to_ucs4() 379 * This function is roughly twice as fast as g_utf8_to_ucs4()
326 * but does no error checking on the input. 380 * but does no error checking on the input. A trailing 0 character
381 * will be added to the string after the converted text.
327 * 382 *
328 * Return value: a pointer to a newly allocated UCS-4 string. 383 * Return value: a pointer to a newly allocated UCS-4 string.
329 * This value must be freed with g_free(). 384 * This value must be freed with g_free().
330 **/ 385 **/
331static gunichar * 386static gunichar *
332g_utf8_to_ucs4_fast (const gchar * str, glong len, glong * items_written) 387g_utf8_to_ucs4_fast (const gchar * str, glong len, glong * items_written)
333{ 388{
334 gint j, charlen;
335 gunichar *result; 389 gunichar *result;
336 gint n_chars, i; 390 gsize n_chars, i;
337 const gchar *p; 391 const gchar *p;
338 392
339 g_return_val_if_fail (str != NULL, NULL); 393 g_return_val_if_fail (str != NULL, NULL);
340 394
341 p = str; 395 p = str;
@@ -355,60 +409,48 @@ g_utf8_to_ucs4_fast (const gchar * str, glong len, glong * items_written)
355 p = g_utf8_next_char (p); 409 p = g_utf8_next_char (p);
356 ++n_chars; 410 ++n_chars;
357 } 411 }
358 } 412 }
359 413
360 result = g_new (gunichar, n_chars + 1); 414 result = g_malloc (sizeof (gunichar) * (n_chars + 1));
361 if (!result) 415 if (!result)
362 return NULL; 416 return NULL;
363 417
364 p = str; 418 p = str;
365 for (i = 0; i < n_chars; i++) 419 for (i = 0; i < n_chars; i++)
366 { 420 {
367 gunichar wc = ((unsigned char *) p)[0]; 421 gunichar wc = (guchar) * p++;
368 422
369 if (wc < 0x80) 423 if (wc < 0x80)
370 { 424 {
371 result[i] = wc; 425 result[i] = wc;
372 p++;
373 } 426 }
374 else 427 else
375 { 428 {
376 if (wc < 0xe0) 429 gunichar mask = 0x40;
377 { 430
378 charlen = 2; 431 if (G_UNLIKELY ((wc & mask) == 0))
379 wc &= 0x1f;
380 }
381 else if (wc < 0xf0)
382 {
383 charlen = 3;
384 wc &= 0x0f;
385 }
386 else if (wc < 0xf8)
387 {
388 charlen = 4;
389 wc &= 0x07;
390 }
391 else if (wc < 0xfc)
392 {
393 charlen = 5;
394 wc &= 0x03;
395 }
396 else
397 { 432 {
398 charlen = 6; 433 /* It's an out-of-sequence 10xxxxxxx byte.
399 wc &= 0x01; 434 * Rather than making an ugly hash of this and the next byte
435 * and overrunning the buffer, it's more useful to treat it
436 * with a replacement character */
437 result[i] = 0xfffd;
438 continue;
400 } 439 }
401 440
402 for (j = 1; j < charlen; j++) 441 do
403 { 442 {
404 wc <<= 6; 443 wc <<= 6;
405 wc |= ((unsigned char *) p)[j] & 0x3f; 444 wc |= (guchar) (*p++) & 0x3f;
445 mask <<= 5;
406 } 446 }
447 while ((wc & mask) != 0);
448
449 wc &= mask - 1;
407 450
408 result[i] = wc; 451 result[i] = wc;
409 p += charlen;
410 } 452 }
411 } 453 }
412 result[i] = 0; 454 result[i] = 0;
413 455
414 if (items_written) 456 if (items_written)
@@ -418,32 +460,34 @@ g_utf8_to_ucs4_fast (const gchar * str, glong len, glong * items_written)
418} 460}
419 461
420/* 462/*
421 * g_ucs4_to_utf8: 463 * g_ucs4_to_utf8:
422 * @str: a UCS-4 encoded string 464 * @str: a UCS-4 encoded string
423 * @len: the maximum length of @str to use. If @len < 0, then 465 * @len: the maximum length (number of characters) of @str to use.
424 * the string is terminated with a 0 character. 466 * If @len < 0, then the string is nul-terminated.
425 * @items_read: location to store number of characters read read, or %NULL. 467 * @items_read: location to store number of characters read, or %NULL.
426 * @items_written: location to store number of bytes written or %NULL. 468 * @items_written: location to store number of bytes written or %NULL.
427 * The value here stored does not include the trailing 0 469 * The value here stored does not include the trailing 0
428 * byte. 470 * byte.
429 * @error: location to store the error occuring, or %NULL to ignore 471 * @error: location to store the error occurring, or %NULL to ignore
430 * errors. Any of the errors in #GConvertError other than 472 * errors. Any of the errors in #GConvertError other than
431 * %G_CONVERT_ERROR_NO_CONVERSION may occur. 473 * %G_CONVERT_ERROR_NO_CONVERSION may occur.
432 * 474 *
433 * Convert a string from a 32-bit fixed width representation as UCS-4. 475 * Convert a string from a 32-bit fixed width representation as UCS-4.
434 * to UTF-8. The result will be terminated with a 0 byte. 476 * to UTF-8. The result will be terminated with a 0 byte.
435 * 477 *
436 * Return value: a pointer to a newly allocated UTF-8 string. 478 * Return value: a pointer to a newly allocated UTF-8 string.
437 * This value must be freed with g_free(). If an 479 * This value must be freed with g_free(). If an
438 * error occurs, %NULL will be returned and 480 * error occurs, %NULL will be returned and
439 * @error set. 481 * @error set. In that case, @items_read will be
482 * set to the position of the first invalid input
483 * character.
440 **/ 484 **/
441static gchar * 485static gchar *
442g_ucs4_to_utf8 (const gunichar * str, 486g_ucs4_to_utf8 (const gunichar * str,
443 glong len, 487 glong len,
444 glong * items_read, glong * items_written, GError ** error) 488 glong * items_read, glong * items_written)
445{ 489{
446 gint result_length; 490 gint result_length;
447 gchar *result = NULL; 491 gchar *result = NULL;
448 gchar *p; 492 gchar *p;
449 gint i; 493 gint i;
@@ -453,19 +497,11 @@ g_ucs4_to_utf8 (const gunichar * str,
453 { 497 {
454 if (!str[i]) 498 if (!str[i])
455 break; 499 break;
456 500
457 if (str[i] >= 0x80000000) 501 if (str[i] >= 0x80000000)
458 { 502 goto err_out;
459 if (items_read)
460 *items_read = i;
461
462 g_set_error (error, G_CONVERT_ERROR,
463 G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
464 _("Character out of range for UTF-8"));
465 goto err_out;
466 }
467 503
468 result_length += UTF8_LENGTH (str[i]); 504 result_length += UTF8_LENGTH (str[i]);
469 } 505 }
470 506
471 result = g_malloc (result_length + 1); 507 result = g_malloc (result_length + 1);
@@ -489,28 +525,49 @@ err_out:
489 return result; 525 return result;
490} 526}
491 527
492/* Code from GLIB gunidecomp.c starts here. */ 528/* Code from GLIB gunidecomp.c starts here. */
493 529
530/* decomp.c - Character decomposition.
531 *
532 * Copyright (C) 1999, 2000 Tom Tromey
533 * Copyright 2000 Red Hat, Inc.
534 *
535 * The Gnome Library is free software; you can redistribute it and/or
536 * modify it under the terms of the GNU Lesser General Public License as
537 * published by the Free Software Foundation; either version 2 of the
538 * License, or (at your option) any later version.
539 *
540 * The Gnome Library is distributed in the hope that it will be useful,
541 * but WITHOUT ANY WARRANTY; without even the implied warranty of
542 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
543 * Lesser General Public License for more details.
544 *
545 * You should have received a copy of the GNU Lesser General Public
546 * License along with the Gnome Library; see the file COPYING.LIB. If not,
547 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
548 * Boston, MA 02111-1307, USA.
549 */
550
494#include "gunidecomp.h" 551#include "gunidecomp.h"
495#include "gunicomp.h" 552#include "gunicomp.h"
496 553
497#define CC_PART1(Page, Char) \ 554#define CC_PART1(Page, Char) \
498 ((combining_class_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \ 555 ((combining_class_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
499 ? (combining_class_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX) \ 556 ? (combining_class_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX) \
500 : (cclass_data[combining_class_table_part1[Page]][Char])) 557 : (cclass_data[combining_class_table_part1[Page]][Char]))
501 558
502#define CC_PART2(Page, Char) \ 559#define CC_PART2(Page, Char) \
503 ((combining_class_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \ 560 ((combining_class_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
504 ? (combining_class_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \ 561 ? (combining_class_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \
505 : (cclass_data[combining_class_table_part2[Page]][Char])) 562 : (cclass_data[combining_class_table_part2[Page]][Char]))
506 563
507#define COMBINING_CLASS(Char) \ 564#define COMBINING_CLASS(Char) \
508 (((Char) <= G_UNICODE_LAST_CHAR_PART1) \ 565 (((Char) <= G_UNICODE_LAST_CHAR_PART1) \
509 ? CC_PART1 ((Char) >> 8, (Char) & 0xff) \ 566 ? CC_PART1 ((Char) >> 8, (Char) & 0xff) \
510 : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \ 567 : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
511 ? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \ 568 ? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
512 : 0)) 569 : 0))
513 570
514/* constants for hangul syllable [de]composition */ 571/* constants for hangul syllable [de]composition */
515#define SBase 0xAC00 572#define SBase 0xAC00
516#define LBase 0x1100 573#define LBase 0x1100
@@ -575,39 +632,26 @@ g_unicode_canonical_ordering (gunichar * string, gsize len)
575 * characters will always be big enough. */ 632 * characters will always be big enough. */
576static void 633static void
577decompose_hangul (gunichar s, gunichar * r, gsize * result_len) 634decompose_hangul (gunichar s, gunichar * r, gsize * result_len)
578{ 635{
579 gint SIndex = s - SBase; 636 gint SIndex = s - SBase;
637 gint TIndex = SIndex % TCount;
580 638
581 /* not a hangul syllable */ 639 if (r)
582 if (SIndex < 0 || SIndex >= SCount)
583 { 640 {
584 if (r) 641 r[0] = LBase + SIndex / NCount;
585 r[0] = s; 642 r[1] = VBase + (SIndex % NCount) / TCount;
586 *result_len = 1;
587 } 643 }
588 else
589 {
590 gunichar L = LBase + SIndex / NCount;
591 gunichar V = VBase + (SIndex % NCount) / TCount;
592 gunichar T = TBase + SIndex % TCount;
593 644
645 if (TIndex)
646 {
594 if (r) 647 if (r)
595 { 648 r[2] = TBase + TIndex;
596 r[0] = L; 649 *result_len = 3;
597 r[1] = V;
598 }
599
600 if (T != TBase)
601 {
602 if (r)
603 r[2] = T;
604 *result_len = 3;
605 }
606 else
607 *result_len = 2;
608 } 650 }
651 else
652 *result_len = 2;
609} 653}
610 654
611/* returns a pointer to a null-terminated UTF-8 string */ 655/* returns a pointer to a null-terminated UTF-8 string */
612static const gchar * 656static const gchar *
613find_decomposition (gunichar ch, gboolean compat) 657find_decomposition (gunichar ch, gboolean compat)
@@ -665,26 +709,26 @@ combine_hangul (gunichar a, gunichar b, gunichar * result)
665 { 709 {
666 *result = SBase + (LIndex * VCount + VIndex) * TCount; 710 *result = SBase + (LIndex * VCount + VIndex) * TCount;
667 return TRUE; 711 return TRUE;
668 } 712 }
669 else if (0 <= SIndex && SIndex < SCount && (SIndex % TCount) == 0 713 else if (0 <= SIndex && SIndex < SCount && (SIndex % TCount) == 0
670 && 0 <= TIndex && TIndex <= TCount) 714 && 0 < TIndex && TIndex < TCount)
671 { 715 {
672 *result = a + TIndex; 716 *result = a + TIndex;
673 return TRUE; 717 return TRUE;
674 } 718 }
675 719
676 return FALSE; 720 return FALSE;
677} 721}
678 722
679#define CI(Page, Char) \ 723#define CI(Page, Char) \
680 ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \ 724 ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
681 ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \ 725 ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
682 : (compose_data[compose_table[Page]][Char])) 726 : (compose_data[compose_table[Page]][Char]))
683 727
684#define COMPOSE_INDEX(Char) \ 728#define COMPOSE_INDEX(Char) \
685 ((((Char) >> 8) > (COMPOSE_TABLE_LAST)) ? 0 : CI((Char) >> 8, (Char) & 0xff)) 729 (((Char >> 8) > (COMPOSE_TABLE_LAST)) ? 0 : CI((Char) >> 8, (Char) & 0xff))
686 730
687static gboolean 731static gboolean
688combine (gunichar a, gunichar b, gunichar * result) 732combine (gunichar a, gunichar b, gunichar * result)
689{ 733{
690 gushort index_a, index_b; 734 gushort index_a, index_b;
@@ -754,11 +798,11 @@ _g_utf8_normalize_wc (const gchar * str, gssize max_len, GNormalizeMode mode)
754 while ((max_len < 0 || p < str + max_len) && *p) 798 while ((max_len < 0 || p < str + max_len) && *p)
755 { 799 {
756 const gchar *decomp; 800 const gchar *decomp;
757 gunichar wc = g_utf8_get_char (p); 801 gunichar wc = g_utf8_get_char (p);
758 802
759 if (wc >= 0xac00 && wc <= 0xd7af) 803 if (wc >= SBase && wc < SBase + SCount)
760 { 804 {
761 gsize result_len; 805 gsize result_len;
762 decompose_hangul (wc, NULL, &result_len); 806 decompose_hangul (wc, NULL, &result_len);
763 n_wc += result_len; 807 n_wc += result_len;
764 } 808 }
@@ -773,11 +817,11 @@ _g_utf8_normalize_wc (const gchar * str, gssize max_len, GNormalizeMode mode)
773 } 817 }
774 818
775 p = g_utf8_next_char (p); 819 p = g_utf8_next_char (p);
776 } 820 }
777 821
778 wc_buffer = g_new (gunichar, n_wc + 1); 822 wc_buffer = g_malloc (sizeof (gunichar) * (n_wc + 1));
779 if (!wc_buffer) 823 if (!wc_buffer)
780 return NULL; 824 return NULL;
781 825
782 last_start = 0; 826 last_start = 0;
783 n_wc = 0; 827 n_wc = 0;
@@ -787,11 +831,11 @@ _g_utf8_normalize_wc (const gchar * str, gssize max_len, GNormalizeMode mode)
787 gunichar wc = g_utf8_get_char (p); 831 gunichar wc = g_utf8_get_char (p);
788 const gchar *decomp; 832 const gchar *decomp;
789 int cc; 833 int cc;
790 gsize old_n_wc = n_wc; 834 gsize old_n_wc = n_wc;
791 835
792 if (wc >= 0xac00 && wc <= 0xd7af) 836 if (wc >= SBase && wc < SBase + SCount)
793 { 837 {
794 gsize result_len; 838 gsize result_len;
795 decompose_hangul (wc, wc_buffer + n_wc, &result_len); 839 decompose_hangul (wc, wc_buffer + n_wc, &result_len);
796 n_wc += result_len; 840 n_wc += result_len;
797 } 841 }
@@ -826,11 +870,11 @@ _g_utf8_normalize_wc (const gchar * str, gssize max_len, GNormalizeMode mode)
826 870
827 if (n_wc > 0) 871 if (n_wc > 0)
828 { 872 {
829 g_unicode_canonical_ordering (wc_buffer + last_start, 873 g_unicode_canonical_ordering (wc_buffer + last_start,
830 n_wc - last_start); 874 n_wc - last_start);
831 last_start = n_wc; 875 // dead assignment: last_start = n_wc;
832 } 876 }
833 877
834 wc_buffer[n_wc] = 0; 878 wc_buffer[n_wc] = 0;
835 879
836 /* All decomposed and reordered */ 880 /* All decomposed and reordered */
@@ -882,53 +926,53 @@ _g_utf8_normalize_wc (const gchar * str, gssize max_len, GNormalizeMode mode)
882 * @mode: the type of normalization to perform. 926 * @mode: the type of normalization to perform.
883 * 927 *
884 * Converts a string into canonical form, standardizing 928 * Converts a string into canonical form, standardizing
885 * such issues as whether a character with an accent 929 * such issues as whether a character with an accent
886 * is represented as a base character and combining 930 * is represented as a base character and combining
887 * accent or as a single precomposed character. You 931 * accent or as a single precomposed character. The
888 * should generally call g_utf8_normalize() before 932 * string has to be valid UTF-8, otherwise %NULL is
889 * comparing two Unicode strings. 933 * returned. You should generally call g_utf8_normalize()
934 * before comparing two Unicode strings.
890 * 935 *
891 * The normalization mode %G_NORMALIZE_DEFAULT only 936 * The normalization mode %G_NORMALIZE_DEFAULT only
892 * standardizes differences that do not affect the 937 * standardizes differences that do not affect the
893 * text content, such as the above-mentioned accent 938 * text content, such as the above-mentioned accent
894 * representation. %G_NORMALIZE_ALL also standardizes 939 * representation. %G_NORMALIZE_ALL also standardizes
895 * the "compatibility" characters in Unicode, such 940 * the "compatibility" characters in Unicode, such
896 * as SUPERSCRIPT THREE to the standard forms 941 * as SUPERSCRIPT THREE to the standard forms
897 * (in this case DIGIT THREE). Formatting information 942 * (in this case DIGIT THREE). Formatting information
898 * may be lost but for most text operations such 943 * may be lost but for most text operations such
899 * characters should be considered the same. 944 * characters should be considered the same.
900 * For example, g_utf8_collate() normalizes
901 * with %G_NORMALIZE_ALL as its first step.
902 * 945 *
903 * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE 946 * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
904 * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL, 947 * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
905 * but returned a result with composed forms rather 948 * but returned a result with composed forms rather
906 * than a maximally decomposed form. This is often 949 * than a maximally decomposed form. This is often
907 * useful if you intend to convert the string to 950 * useful if you intend to convert the string to
908 * a legacy encoding or pass it to a system with 951 * a legacy encoding or pass it to a system with
909 * less capable Unicode handling. 952 * less capable Unicode handling.
910 * 953 *
911 * Return value: a newly allocated string, that is the 954 * Return value: a newly allocated string, that is the
912 * normalized form of @str. 955 * normalized form of @str, or %NULL if @str is not
956 * valid UTF-8.
913 **/ 957 **/
914static gchar * 958static gchar *
915g_utf8_normalize (const gchar * str, gssize len, GNormalizeMode mode) 959g_utf8_normalize (const gchar * str, gssize len, GNormalizeMode mode)
916{ 960{
917 gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode); 961 gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
918 gchar *result; 962 gchar *result;
919 963
920 result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL, NULL); 964 result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL);
921 g_free (result_wc); 965 g_free (result_wc);
922 966
923 return result; 967 return result;
924} 968}
925 969
926/* Public Libidn API starts here. */ 970/* Public Libidn API starts here. */
927 971
928/** 972/**
929 * stringprep_utf8_to_unichar - convert UTF-8 to Unicode code point 973 * stringprep_utf8_to_unichar:
930 * @p: a pointer to Unicode character encoded as UTF-8 974 * @p: a pointer to Unicode character encoded as UTF-8
931 * 975 *
932 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character. 976 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
933 * If @p does not point to a valid UTF-8 encoded character, results are 977 * If @p does not point to a valid UTF-8 encoded character, results are
934 * undefined. 978 * undefined.
@@ -940,11 +984,11 @@ stringprep_utf8_to_unichar (const char *p)
940{ 984{
941 return g_utf8_get_char (p); 985 return g_utf8_get_char (p);
942} 986}
943 987
944/** 988/**
945 * stringprep_unichar_to_utf8 - convert Unicode code point to UTF-8 989 * stringprep_unichar_to_utf8:
946 * @c: a ISO10646 character code 990 * @c: a ISO10646 character code
947 * @outbuf: output buffer, must have at least 6 bytes of space. 991 * @outbuf: output buffer, must have at least 6 bytes of space.
948 * If %NULL, the length will be computed and returned 992 * If %NULL, the length will be computed and returned
949 * and nothing will be written to @outbuf. 993 * and nothing will be written to @outbuf.
950 * 994 *
@@ -956,33 +1000,46 @@ int
956stringprep_unichar_to_utf8 (uint32_t c, char *outbuf) 1000stringprep_unichar_to_utf8 (uint32_t c, char *outbuf)
957{ 1001{
958 return g_unichar_to_utf8 (c, outbuf); 1002 return g_unichar_to_utf8 (c, outbuf);
959} 1003}
960 1004
1005#include <unistr.h>
1006
961/** 1007/**
962 * stringprep_utf8_to_ucs4 - convert UTF-8 string to UCS-4 1008 * stringprep_utf8_to_ucs4:
963 * @str: a UTF-8 encoded string 1009 * @str: a UTF-8 encoded string
964 * @len: the maximum length of @str to use. If @len < 0, then 1010 * @len: the maximum length of @str to use. If @len < 0, then
965 * the string is nul-terminated. 1011 * the string is nul-terminated.
966 * @items_written: location to store the number of characters in the 1012 * @items_written: location to store the number of characters in the
967 * result, or %NULL. 1013 * result, or %NULL.
968 * 1014 *
969 * Convert a string from UTF-8 to a 32-bit fixed width 1015 * Convert a string from UTF-8 to a 32-bit fixed width representation
970 * representation as UCS-4, assuming valid UTF-8 input. 1016 * as UCS-4. The function now performs error checking to verify that
971 * This function does no error checking on the input. 1017 * the input is valid UTF-8 (before it was documented to not do error
1018 * checking).
972 * 1019 *
973 * Return value: a pointer to a newly allocated UCS-4 string. 1020 * Return value: a pointer to a newly allocated UCS-4 string.
974 * This value must be freed with free(). 1021 * This value must be deallocated by the caller.
975 **/ 1022 **/
976uint32_t * 1023uint32_t *
977stringprep_utf8_to_ucs4 (const char *str, ssize_t len, size_t * items_written) 1024stringprep_utf8_to_ucs4 (const char *str, ssize_t len, size_t * items_written)
978{ 1025{
1026 size_t n;
1027
1028 if (len < 0)
1029 n = strlen (str);
1030 else
1031 n = len;
1032
1033 if (u8_check ((const uint8_t *) str, n))
1034 return NULL;
1035
979 return g_utf8_to_ucs4_fast (str, (glong) len, (glong *) items_written); 1036 return g_utf8_to_ucs4_fast (str, (glong) len, (glong *) items_written);
980} 1037}
981 1038
982/** 1039/**
983 * stringprep_ucs4_to_utf8 - convert UCS-4 string to UTF-8 1040 * stringprep_ucs4_to_utf8:
984 * @str: a UCS-4 encoded string 1041 * @str: a UCS-4 encoded string
985 * @len: the maximum length of @str to use. If @len < 0, then 1042 * @len: the maximum length of @str to use. If @len < 0, then
986 * the string is terminated with a 0 character. 1043 * the string is terminated with a 0 character.
987 * @items_read: location to store number of characters read read, or %NULL. 1044 * @items_read: location to store number of characters read read, or %NULL.
988 * @items_written: location to store number of bytes written or %NULL. 1045 * @items_written: location to store number of bytes written or %NULL.
@@ -991,24 +1048,23 @@ stringprep_utf8_to_ucs4 (const char *str, ssize_t len, size_t * items_written)
991 * 1048 *
992 * Convert a string from a 32-bit fixed width representation as UCS-4. 1049 * Convert a string from a 32-bit fixed width representation as UCS-4.
993 * to UTF-8. The result will be terminated with a 0 byte. 1050 * to UTF-8. The result will be terminated with a 0 byte.
994 * 1051 *
995 * Return value: a pointer to a newly allocated UTF-8 string. 1052 * Return value: a pointer to a newly allocated UTF-8 string.
996 * This value must be freed with free(). If an 1053 * This value must be deallocated by the caller.
997 * error occurs, %NULL will be returned and 1054 * If an error occurs, %NULL will be returned.
998 * @error set.
999 **/ 1055 **/
1000char * 1056char *
1001stringprep_ucs4_to_utf8 (const uint32_t * str, ssize_t len, 1057stringprep_ucs4_to_utf8 (const uint32_t * str, ssize_t len,
1002 size_t * items_read, size_t * items_written) 1058 size_t * items_read, size_t * items_written)
1003{ 1059{
1004 return g_ucs4_to_utf8 (str, len, (glong *) items_read, 1060 return g_ucs4_to_utf8 (str, len, (glong *) items_read,
1005 (glong *) items_written, NULL); 1061 (glong *) items_written);
1006} 1062}
1007 1063
1008/** 1064/**
1009 * stringprep_utf8_nfkc_normalize - normalize Unicode string 1065 * stringprep_utf8_nfkc_normalize:
1010 * @str: a UTF-8 encoded string. 1066 * @str: a UTF-8 encoded string.
1011 * @len: length of @str, in bytes, or -1 if @str is nul-terminated. 1067 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
1012 * 1068 *
1013 * Converts a string into canonical form, standardizing 1069 * Converts a string into canonical form, standardizing
1014 * such issues as whether a character with an accent 1070 * such issues as whether a character with an accent
@@ -1032,22 +1088,22 @@ stringprep_utf8_nfkc_normalize (const char *str, ssize_t len)
1032{ 1088{
1033 return g_utf8_normalize (str, len, G_NORMALIZE_NFKC); 1089 return g_utf8_normalize (str, len, G_NORMALIZE_NFKC);
1034} 1090}
1035 1091
1036/** 1092/**
1037 * stringprep_ucs4_nfkc_normalize - normalize Unicode string 1093 * stringprep_ucs4_nfkc_normalize:
1038 * @str: a Unicode string. 1094 * @str: a Unicode string.
1039 * @len: length of @str array, or -1 if @str is nul-terminated. 1095 * @len: length of @str array, or -1 if @str is nul-terminated.
1040 * 1096 *
1041 * Converts UCS4 string into UTF-8 and runs 1097 * Converts a UCS4 string into canonical form, see
1042 * stringprep_utf8_nfkc_normalize(). 1098 * stringprep_utf8_nfkc_normalize() for more information.
1043 * 1099 *
1044 * Return value: a newly allocated Unicode string, that is the NFKC 1100 * Return value: a newly allocated Unicode string, that is the NFKC
1045 * normalized form of @str. 1101 * normalized form of @str.
1046 **/ 1102 **/
1047uint32_t * 1103uint32_t *
1048stringprep_ucs4_nfkc_normalize (uint32_t * str, ssize_t len) 1104stringprep_ucs4_nfkc_normalize (const uint32_t * str, ssize_t len)
1049{ 1105{
1050 char *p; 1106 char *p;
1051 uint32_t *result_wc; 1107 uint32_t *result_wc;
1052 1108
1053 p = stringprep_ucs4_to_utf8 (str, len, 0, 0); 1109 p = stringprep_ucs4_to_utf8 (str, len, 0, 0);