diff options
Diffstat (limited to '3rdParty/Expat/src/xmltok.c')
-rw-r--r-- | 3rdParty/Expat/src/xmltok.c | 244 |
1 files changed, 170 insertions, 74 deletions
diff --git a/3rdParty/Expat/src/xmltok.c b/3rdParty/Expat/src/xmltok.c index bf09dfc..cdf0720 100644 --- a/3rdParty/Expat/src/xmltok.c +++ b/3rdParty/Expat/src/xmltok.c @@ -1,254 +1,259 @@ /* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd See the file COPYING for copying permission. */ #include <stddef.h> -#ifdef COMPILED_FROM_DSP +#ifdef _WIN32 #include "winconfig.h" -#elif defined(MACOS_CLASSIC) -#include "macconfig.h" -#elif defined(__amigaos__) -#include "amigaconfig.h" -#elif defined(__WATCOMC__) -#include "watcomconfig.h" #else #ifdef HAVE_EXPAT_CONFIG_H #include <expat_config.h> #endif -#endif /* ndef COMPILED_FROM_DSP */ +#endif /* ndef _WIN32 */ #include "expat_external.h" #include "internal.h" #include "xmltok.h" #include "nametab.h" #ifdef XML_DTD #define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok) #else #define IGNORE_SECTION_TOK_VTABLE /* as nothing */ #endif #define VTABLE1 \ { PREFIX(prologTok), PREFIX(contentTok), \ PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \ { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \ PREFIX(sameName), \ PREFIX(nameMatchesAscii), \ PREFIX(nameLength), \ PREFIX(skipS), \ PREFIX(getAtts), \ PREFIX(charRefNumber), \ PREFIX(predefinedEntityName), \ PREFIX(updatePosition), \ PREFIX(isPublicId) #define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16) #define UCS2_GET_NAMING(pages, hi, lo) \ - (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1 << ((lo) & 0x1F))) + (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F))) /* A 2 byte UTF-8 representation splits the characters 11 bits between the bottom 5 and 6 bits of the bytes. We need 8 bits to index into pages, 3 bits to add to that index and 5 bits to generate the mask. */ #define UTF8_GET_NAMING2(pages, byte) \ (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \ + ((((byte)[0]) & 3) << 1) \ + ((((byte)[1]) >> 5) & 1)] \ - & (1 << (((byte)[1]) & 0x1F))) + & (1u << (((byte)[1]) & 0x1F))) /* A 3 byte UTF-8 representation splits the characters 16 bits between the bottom 4, 6 and 6 bits of the bytes. We need 8 bits to index into pages, 3 bits to add to that index and 5 bits to generate the mask. */ #define UTF8_GET_NAMING3(pages, byte) \ (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \ + ((((byte)[1]) >> 2) & 0xF)] \ << 3) \ + ((((byte)[1]) & 3) << 1) \ + ((((byte)[2]) >> 5) & 1)] \ - & (1 << (((byte)[2]) & 0x1F))) + & (1u << (((byte)[2]) & 0x1F))) #define UTF8_GET_NAMING(pages, p, n) \ ((n) == 2 \ ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \ : ((n) == 3 \ ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \ : 0)) /* Detection of invalid UTF-8 sequences is based on Table 3.1B of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/ with the additional restriction of not allowing the Unicode code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE). Implementation details: (A & 0x80) == 0 means A < 0x80 and (A & 0xC0) == 0xC0 means A > 0xBF */ #define UTF8_INVALID2(p) \ ((*p) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0) #define UTF8_INVALID3(p) \ (((p)[2] & 0x80) == 0 \ || \ ((*p) == 0xEF && (p)[1] == 0xBF \ ? \ (p)[2] > 0xBD \ : \ ((p)[2] & 0xC0) == 0xC0) \ || \ ((*p) == 0xE0 \ ? \ (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \ : \ ((p)[1] & 0x80) == 0 \ || \ ((*p) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0))) #define UTF8_INVALID4(p) \ (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 \ || \ ((p)[2] & 0x80) == 0 || ((p)[2] & 0xC0) == 0xC0 \ || \ ((*p) == 0xF0 \ ? \ (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \ : \ ((p)[1] & 0x80) == 0 \ || \ ((*p) == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0))) static int PTRFASTCALL -isNever(const ENCODING *enc, const char *p) +isNever(const ENCODING *UNUSED_P(enc), const char *UNUSED_P(p)) { return 0; } static int PTRFASTCALL -utf8_isName2(const ENCODING *enc, const char *p) +utf8_isName2(const ENCODING *UNUSED_P(enc), const char *p) { return UTF8_GET_NAMING2(namePages, (const unsigned char *)p); } static int PTRFASTCALL -utf8_isName3(const ENCODING *enc, const char *p) +utf8_isName3(const ENCODING *UNUSED_P(enc), const char *p) { return UTF8_GET_NAMING3(namePages, (const unsigned char *)p); } #define utf8_isName4 isNever static int PTRFASTCALL -utf8_isNmstrt2(const ENCODING *enc, const char *p) +utf8_isNmstrt2(const ENCODING *UNUSED_P(enc), const char *p) { return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p); } static int PTRFASTCALL -utf8_isNmstrt3(const ENCODING *enc, const char *p) +utf8_isNmstrt3(const ENCODING *UNUSED_P(enc), const char *p) { return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p); } #define utf8_isNmstrt4 isNever static int PTRFASTCALL -utf8_isInvalid2(const ENCODING *enc, const char *p) +utf8_isInvalid2(const ENCODING *UNUSED_P(enc), const char *p) { return UTF8_INVALID2((const unsigned char *)p); } static int PTRFASTCALL -utf8_isInvalid3(const ENCODING *enc, const char *p) +utf8_isInvalid3(const ENCODING *UNUSED_P(enc), const char *p) { return UTF8_INVALID3((const unsigned char *)p); } static int PTRFASTCALL -utf8_isInvalid4(const ENCODING *enc, const char *p) +utf8_isInvalid4(const ENCODING *UNUSED_P(enc), const char *p) { return UTF8_INVALID4((const unsigned char *)p); } struct normal_encoding { ENCODING enc; unsigned char type[256]; #ifdef XML_MIN_SIZE int (PTRFASTCALL *byteType)(const ENCODING *, const char *); int (PTRFASTCALL *isNameMin)(const ENCODING *, const char *); int (PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *); int (PTRFASTCALL *byteToAscii)(const ENCODING *, const char *); int (PTRCALL *charMatches)(const ENCODING *, const char *, int); #endif /* XML_MIN_SIZE */ int (PTRFASTCALL *isName2)(const ENCODING *, const char *); int (PTRFASTCALL *isName3)(const ENCODING *, const char *); int (PTRFASTCALL *isName4)(const ENCODING *, const char *); int (PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *); int (PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *); int (PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *); int (PTRFASTCALL *isInvalid2)(const ENCODING *, const char *); int (PTRFASTCALL *isInvalid3)(const ENCODING *, const char *); int (PTRFASTCALL *isInvalid4)(const ENCODING *, const char *); }; #define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *) (enc)) #ifdef XML_MIN_SIZE #define STANDARD_VTABLE(E) \ E ## byteType, \ E ## isNameMin, \ E ## isNmstrtMin, \ E ## byteToAscii, \ E ## charMatches, #else #define STANDARD_VTABLE(E) /* as nothing */ #endif #define NORMAL_VTABLE(E) \ E ## isName2, \ E ## isName3, \ E ## isName4, \ E ## isNmstrt2, \ E ## isNmstrt3, \ E ## isNmstrt4, \ E ## isInvalid2, \ E ## isInvalid3, \ E ## isInvalid4 +#define NULL_VTABLE \ + /* isName2 */ NULL, \ + /* isName3 */ NULL, \ + /* isName4 */ NULL, \ + /* isNmstrt2 */ NULL, \ + /* isNmstrt3 */ NULL, \ + /* isNmstrt4 */ NULL, \ + /* isInvalid2 */ NULL, \ + /* isInvalid3 */ NULL, \ + /* isInvalid4 */ NULL + static int FASTCALL checkCharRefNumber(int); #include "xmltok_impl.h" #include "ascii.h" #ifdef XML_MIN_SIZE #define sb_isNameMin isNever #define sb_isNmstrtMin isNever #endif #ifdef XML_MIN_SIZE #define MINBPC(enc) ((enc)->minBytesPerChar) #else /* minimum bytes per character */ #define MINBPC(enc) 1 #endif #define SB_BYTE_TYPE(enc, p) \ (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)]) #ifdef XML_MIN_SIZE static int PTRFASTCALL sb_byteType(const ENCODING *enc, const char *p) { return SB_BYTE_TYPE(enc, p); } #define BYTE_TYPE(enc, p) \ (AS_NORMAL_ENCODING(enc)->byteType(enc, p)) #else #define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p) @@ -291,352 +296,438 @@ sb_charMatches(const ENCODING *enc, const char *p, int c) { return *p == c; } #else /* c is an ASCII character */ #define CHAR_MATCHES(enc, p, c) (*(p) == c) #endif #define PREFIX(ident) normal_ ## ident #define XML_TOK_IMPL_C #include "xmltok_impl.c" #undef XML_TOK_IMPL_C #undef MINBPC #undef BYTE_TYPE #undef BYTE_TO_ASCII #undef CHAR_MATCHES #undef IS_NAME_CHAR #undef IS_NAME_CHAR_MINBPC #undef IS_NMSTRT_CHAR #undef IS_NMSTRT_CHAR_MINBPC #undef IS_INVALID_CHAR enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */ UTF8_cval1 = 0x00, UTF8_cval2 = 0xc0, UTF8_cval3 = 0xe0, UTF8_cval4 = 0xf0 }; -static void PTRCALL -utf8_toUtf8(const ENCODING *enc, +void +align_limit_to_full_utf8_characters(const char * from, const char ** fromLimRef) +{ + const char * fromLim = *fromLimRef; + size_t walked = 0; + for (; fromLim > from; fromLim--, walked++) { + const unsigned char prev = (unsigned char)fromLim[-1]; + if ((prev & 0xf8u) == 0xf0u) { /* 4-byte character, lead by 0b11110xxx byte */ + if (walked + 1 >= 4) { + fromLim += 4 - 1; + break; + } else { + walked = 0; + } + } else if ((prev & 0xf0u) == 0xe0u) { /* 3-byte character, lead by 0b1110xxxx byte */ + if (walked + 1 >= 3) { + fromLim += 3 - 1; + break; + } else { + walked = 0; + } + } else if ((prev & 0xe0u) == 0xc0u) { /* 2-byte character, lead by 0b110xxxxx byte */ + if (walked + 1 >= 2) { + fromLim += 2 - 1; + break; + } else { + walked = 0; + } + } else if ((prev & 0x80u) == 0x00u) { /* 1-byte character, matching 0b0xxxxxxx */ + break; + } + } + *fromLimRef = fromLim; +} + +static enum XML_Convert_Result PTRCALL +utf8_toUtf8(const ENCODING *UNUSED_P(enc), const char **fromP, const char *fromLim, char **toP, const char *toLim) { char *to; const char *from; - if (fromLim - *fromP > toLim - *toP) { - /* Avoid copying partial characters. */ - for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--) - if (((unsigned char)fromLim[-1] & 0xc0) != 0x80) - break; - } - for (to = *toP, from = *fromP; from != fromLim; from++, to++) + const char *fromLimInitial = fromLim; + + /* Avoid copying partial characters. */ + align_limit_to_full_utf8_characters(*fromP, &fromLim); + + for (to = *toP, from = *fromP; (from < fromLim) && (to < toLim); from++, to++) *to = *from; *fromP = from; *toP = to; + + if (fromLim < fromLimInitial) + return XML_CONVERT_INPUT_INCOMPLETE; + else if ((to == toLim) && (from < fromLim)) + return XML_CONVERT_OUTPUT_EXHAUSTED; + else + return XML_CONVERT_COMPLETED; } -static void PTRCALL +static enum XML_Convert_Result PTRCALL utf8_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim, unsigned short **toP, const unsigned short *toLim) { + enum XML_Convert_Result res = XML_CONVERT_COMPLETED; unsigned short *to = *toP; const char *from = *fromP; - while (from != fromLim && to != toLim) { + while (from < fromLim && to < toLim) { switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) { case BT_LEAD2: + if (fromLim - from < 2) { + res = XML_CONVERT_INPUT_INCOMPLETE; + goto after; + } *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f)); from += 2; break; case BT_LEAD3: + if (fromLim - from < 3) { + res = XML_CONVERT_INPUT_INCOMPLETE; + goto after; + } *to++ = (unsigned short)(((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f)); from += 3; break; case BT_LEAD4: { unsigned long n; - if (to + 1 == toLim) + if (toLim - to < 2) { + res = XML_CONVERT_OUTPUT_EXHAUSTED; + goto after; + } + if (fromLim - from < 4) { + res = XML_CONVERT_INPUT_INCOMPLETE; goto after; + } n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12) | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f); n -= 0x10000; to[0] = (unsigned short)((n >> 10) | 0xD800); to[1] = (unsigned short)((n & 0x3FF) | 0xDC00); to += 2; from += 4; } break; default: *to++ = *from++; break; } } + if (from < fromLim) + res = XML_CONVERT_OUTPUT_EXHAUSTED; after: *fromP = from; *toP = to; + return res; } #ifdef XML_NS static const struct normal_encoding utf8_encoding_ns = { { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, { #include "asciitab.h" #include "utf8tab.h" }, STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_) }; #endif static const struct normal_encoding utf8_encoding = { { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, { #define BT_COLON BT_NMSTRT #include "asciitab.h" #undef BT_COLON #include "utf8tab.h" }, STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_) }; #ifdef XML_NS static const struct normal_encoding internal_utf8_encoding_ns = { { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, { #include "iasciitab.h" #include "utf8tab.h" }, STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_) }; #endif static const struct normal_encoding internal_utf8_encoding = { { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, { #define BT_COLON BT_NMSTRT #include "iasciitab.h" #undef BT_COLON #include "utf8tab.h" }, STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_) }; -static void PTRCALL -latin1_toUtf8(const ENCODING *enc, +static enum XML_Convert_Result PTRCALL +latin1_toUtf8(const ENCODING *UNUSED_P(enc), const char **fromP, const char *fromLim, char **toP, const char *toLim) { for (;;) { unsigned char c; if (*fromP == fromLim) - break; + return XML_CONVERT_COMPLETED; c = (unsigned char)**fromP; if (c & 0x80) { if (toLim - *toP < 2) - break; + return XML_CONVERT_OUTPUT_EXHAUSTED; *(*toP)++ = (char)((c >> 6) | UTF8_cval2); *(*toP)++ = (char)((c & 0x3f) | 0x80); (*fromP)++; } else { if (*toP == toLim) - break; + return XML_CONVERT_OUTPUT_EXHAUSTED; *(*toP)++ = *(*fromP)++; } } } -static void PTRCALL -latin1_toUtf16(const ENCODING *enc, +static enum XML_Convert_Result PTRCALL +latin1_toUtf16(const ENCODING *UNUSED_P(enc), const char **fromP, const char *fromLim, unsigned short **toP, const unsigned short *toLim) { - while (*fromP != fromLim && *toP != toLim) + while (*fromP < fromLim && *toP < toLim) *(*toP)++ = (unsigned char)*(*fromP)++; + + if ((*toP == toLim) && (*fromP < fromLim)) + return XML_CONVERT_OUTPUT_EXHAUSTED; + else + return XML_CONVERT_COMPLETED; } #ifdef XML_NS static const struct normal_encoding latin1_encoding_ns = { { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 }, { #include "asciitab.h" #include "latin1tab.h" }, - STANDARD_VTABLE(sb_) + STANDARD_VTABLE(sb_) NULL_VTABLE }; #endif static const struct normal_encoding latin1_encoding = { { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 }, { #define BT_COLON BT_NMSTRT #include "asciitab.h" #undef BT_COLON #include "latin1tab.h" }, - STANDARD_VTABLE(sb_) + STANDARD_VTABLE(sb_) NULL_VTABLE }; -static void PTRCALL -ascii_toUtf8(const ENCODING *enc, +static enum XML_Convert_Result PTRCALL +ascii_toUtf8(const ENCODING *UNUSED_P(enc), const char **fromP, const char *fromLim, char **toP, const char *toLim) { - while (*fromP != fromLim && *toP != toLim) + while (*fromP < fromLim && *toP < toLim) *(*toP)++ = *(*fromP)++; + + if ((*toP == toLim) && (*fromP < fromLim)) + return XML_CONVERT_OUTPUT_EXHAUSTED; + else + return XML_CONVERT_COMPLETED; } #ifdef XML_NS static const struct normal_encoding ascii_encoding_ns = { { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 }, { #include "asciitab.h" /* BT_NONXML == 0 */ }, - STANDARD_VTABLE(sb_) + STANDARD_VTABLE(sb_) NULL_VTABLE }; #endif static const struct normal_encoding ascii_encoding = { { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 }, { #define BT_COLON BT_NMSTRT #include "asciitab.h" #undef BT_COLON /* BT_NONXML == 0 */ }, - STANDARD_VTABLE(sb_) + STANDARD_VTABLE(sb_) NULL_VTABLE }; static int PTRFASTCALL unicode_byte_type(char hi, char lo) { switch ((unsigned char)hi) { case 0xD8: case 0xD9: case 0xDA: case 0xDB: return BT_LEAD4; case 0xDC: case 0xDD: case 0xDE: case 0xDF: return BT_TRAIL; case 0xFF: switch ((unsigned char)lo) { case 0xFF: case 0xFE: return BT_NONXML; } break; } return BT_NONASCII; } #define DEFINE_UTF16_TO_UTF8(E) \ -static void PTRCALL \ -E ## toUtf8(const ENCODING *enc, \ +static enum XML_Convert_Result PTRCALL \ +E ## toUtf8(const ENCODING *UNUSED_P(enc), \ const char **fromP, const char *fromLim, \ char **toP, const char *toLim) \ { \ - const char *from; \ - for (from = *fromP; from != fromLim; from += 2) { \ + const char *from = *fromP; \ + fromLim = from + (((fromLim - from) >> 1) << 1); /* shrink to even */ \ + for (; from < fromLim; from += 2) { \ int plane; \ unsigned char lo2; \ unsigned char lo = GET_LO(from); \ unsigned char hi = GET_HI(from); \ switch (hi) { \ case 0: \ if (lo < 0x80) { \ if (*toP == toLim) { \ *fromP = from; \ - return; \ + return XML_CONVERT_OUTPUT_EXHAUSTED; \ } \ *(*toP)++ = lo; \ break; \ } \ /* fall through */ \ case 0x1: case 0x2: case 0x3: \ case 0x4: case 0x5: case 0x6: case 0x7: \ if (toLim - *toP < 2) { \ *fromP = from; \ - return; \ + return XML_CONVERT_OUTPUT_EXHAUSTED; \ } \ *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \ *(*toP)++ = ((lo & 0x3f) | 0x80); \ break; \ default: \ if (toLim - *toP < 3) { \ *fromP = from; \ - return; \ + return XML_CONVERT_OUTPUT_EXHAUSTED; \ } \ /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \ *(*toP)++ = ((hi >> 4) | UTF8_cval3); \ *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \ *(*toP)++ = ((lo & 0x3f) | 0x80); \ break; \ case 0xD8: case 0xD9: case 0xDA: case 0xDB: \ if (toLim - *toP < 4) { \ *fromP = from; \ - return; \ + return XML_CONVERT_OUTPUT_EXHAUSTED; \ + } \ + if (fromLim - from < 4) { \ + *fromP = from; \ + return XML_CONVERT_INPUT_INCOMPLETE; \ } \ plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \ *(*toP)++ = ((plane >> 2) | UTF8_cval4); \ *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \ from += 2; \ lo2 = GET_LO(from); \ *(*toP)++ = (((lo & 0x3) << 4) \ | ((GET_HI(from) & 0x3) << 2) \ | (lo2 >> 6) \ | 0x80); \ *(*toP)++ = ((lo2 & 0x3f) | 0x80); \ break; \ } \ } \ *fromP = from; \ + if (from < fromLim) \ + return XML_CONVERT_INPUT_INCOMPLETE; \ + else \ + return XML_CONVERT_COMPLETED; \ } #define DEFINE_UTF16_TO_UTF16(E) \ -static void PTRCALL \ -E ## toUtf16(const ENCODING *enc, \ +static enum XML_Convert_Result PTRCALL \ +E ## toUtf16(const ENCODING *UNUSED_P(enc), \ const char **fromP, const char *fromLim, \ unsigned short **toP, const unsigned short *toLim) \ { \ + enum XML_Convert_Result res = XML_CONVERT_COMPLETED; \ + fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1); /* shrink to even */ \ /* Avoid copying first half only of surrogate */ \ if (fromLim - *fromP > ((toLim - *toP) << 1) \ - && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \ + && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) { \ fromLim -= 2; \ - for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \ + res = XML_CONVERT_INPUT_INCOMPLETE; \ + } \ + for (; *fromP < fromLim && *toP < toLim; *fromP += 2) \ *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \ + if ((*toP == toLim) && (*fromP < fromLim)) \ + return XML_CONVERT_OUTPUT_EXHAUSTED; \ + else \ + return res; \ } #define SET2(ptr, ch) \ (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8))) #define GET_LO(ptr) ((unsigned char)(ptr)[0]) #define GET_HI(ptr) ((unsigned char)(ptr)[1]) DEFINE_UTF16_TO_UTF8(little2_) DEFINE_UTF16_TO_UTF16(little2_) #undef SET2 #undef GET_LO #undef GET_HI #define SET2(ptr, ch) \ (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF))) #define GET_LO(ptr) ((unsigned char)(ptr)[1]) #define GET_HI(ptr) ((unsigned char)(ptr)[0]) DEFINE_UTF16_TO_UTF8(big2_) DEFINE_UTF16_TO_UTF16(big2_) #undef SET2 #undef GET_LO #undef GET_HI #define LITTLE2_BYTE_TYPE(enc, p) \ ((p)[1] == 0 \ ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \ : unicode_byte_type((p)[1], (p)[0])) @@ -699,106 +790,106 @@ little2_isNmstrtMin(const ENCODING *enc, const char *p) #define XML_TOK_IMPL_C #include "xmltok_impl.c" #undef XML_TOK_IMPL_C #undef MINBPC #undef BYTE_TYPE #undef BYTE_TO_ASCII #undef CHAR_MATCHES #undef IS_NAME_CHAR #undef IS_NAME_CHAR_MINBPC #undef IS_NMSTRT_CHAR #undef IS_NMSTRT_CHAR_MINBPC #undef IS_INVALID_CHAR #endif /* not XML_MIN_SIZE */ #ifdef XML_NS static const struct normal_encoding little2_encoding_ns = { { VTABLE, 2, 0, #if BYTEORDER == 1234 1 #else 0 #endif }, { #include "asciitab.h" #include "latin1tab.h" }, - STANDARD_VTABLE(little2_) + STANDARD_VTABLE(little2_) NULL_VTABLE }; #endif static const struct normal_encoding little2_encoding = { { VTABLE, 2, 0, #if BYTEORDER == 1234 1 #else 0 #endif }, { #define BT_COLON BT_NMSTRT #include "asciitab.h" #undef BT_COLON #include "latin1tab.h" }, - STANDARD_VTABLE(little2_) + STANDARD_VTABLE(little2_) NULL_VTABLE }; #if BYTEORDER != 4321 #ifdef XML_NS static const struct normal_encoding internal_little2_encoding_ns = { { VTABLE, 2, 0, 1 }, { #include "iasciitab.h" #include "latin1tab.h" }, - STANDARD_VTABLE(little2_) + STANDARD_VTABLE(little2_) NULL_VTABLE }; #endif static const struct normal_encoding internal_little2_encoding = { { VTABLE, 2, 0, 1 }, { #define BT_COLON BT_NMSTRT #include "iasciitab.h" #undef BT_COLON #include "latin1tab.h" }, - STANDARD_VTABLE(little2_) + STANDARD_VTABLE(little2_) NULL_VTABLE }; #endif #define BIG2_BYTE_TYPE(enc, p) \ ((p)[0] == 0 \ ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \ : unicode_byte_type((p)[0], (p)[1])) #define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1) #define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c) #define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \ UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1]) #define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \ UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1]) #ifdef XML_MIN_SIZE static int PTRFASTCALL big2_byteType(const ENCODING *enc, const char *p) { return BIG2_BYTE_TYPE(enc, p); } static int PTRFASTCALL big2_byteToAscii(const ENCODING *enc, const char *p) { return BIG2_BYTE_TO_ASCII(enc, p); } @@ -840,132 +931,132 @@ big2_isNmstrtMin(const ENCODING *enc, const char *p) #define XML_TOK_IMPL_C #include "xmltok_impl.c" #undef XML_TOK_IMPL_C #undef MINBPC #undef BYTE_TYPE #undef BYTE_TO_ASCII #undef CHAR_MATCHES #undef IS_NAME_CHAR #undef IS_NAME_CHAR_MINBPC #undef IS_NMSTRT_CHAR #undef IS_NMSTRT_CHAR_MINBPC #undef IS_INVALID_CHAR #endif /* not XML_MIN_SIZE */ #ifdef XML_NS static const struct normal_encoding big2_encoding_ns = { { VTABLE, 2, 0, #if BYTEORDER == 4321 1 #else 0 #endif }, { #include "asciitab.h" #include "latin1tab.h" }, - STANDARD_VTABLE(big2_) + STANDARD_VTABLE(big2_) NULL_VTABLE }; #endif static const struct normal_encoding big2_encoding = { { VTABLE, 2, 0, #if BYTEORDER == 4321 1 #else 0 #endif }, { #define BT_COLON BT_NMSTRT #include "asciitab.h" #undef BT_COLON #include "latin1tab.h" }, - STANDARD_VTABLE(big2_) + STANDARD_VTABLE(big2_) NULL_VTABLE }; #if BYTEORDER != 1234 #ifdef XML_NS static const struct normal_encoding internal_big2_encoding_ns = { { VTABLE, 2, 0, 1 }, { #include "iasciitab.h" #include "latin1tab.h" }, - STANDARD_VTABLE(big2_) + STANDARD_VTABLE(big2_) NULL_VTABLE }; #endif static const struct normal_encoding internal_big2_encoding = { { VTABLE, 2, 0, 1 }, { #define BT_COLON BT_NMSTRT #include "iasciitab.h" #undef BT_COLON #include "latin1tab.h" }, - STANDARD_VTABLE(big2_) + STANDARD_VTABLE(big2_) NULL_VTABLE }; #endif #undef PREFIX static int FASTCALL streqci(const char *s1, const char *s2) { for (;;) { char c1 = *s1++; char c2 = *s2++; if (ASCII_a <= c1 && c1 <= ASCII_z) c1 += ASCII_A - ASCII_a; if (ASCII_a <= c2 && c2 <= ASCII_z) c2 += ASCII_A - ASCII_a; if (c1 != c2) return 0; if (!c1) break; } return 1; } static void PTRCALL -initUpdatePosition(const ENCODING *enc, const char *ptr, +initUpdatePosition(const ENCODING *UNUSED_P(enc), const char *ptr, const char *end, POSITION *pos) { normal_updatePosition(&utf8_encoding.enc, ptr, end, pos); } static int toAscii(const ENCODING *enc, const char *ptr, const char *end) { char buf[1]; char *p = buf; XmlUtf8Convert(enc, &ptr, end, &p, p + 1); if (p == buf) return -1; else return buf[0]; } static int FASTCALL isSpace(int c) { switch (c) { case 0x20: case 0xD: case 0xA: case 0x9: return 1; } return 0; } @@ -1261,112 +1352,117 @@ XmlSizeOfUnknownEncoding(void) } static int PTRFASTCALL unknown_isName(const ENCODING *enc, const char *p) { const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); int c = uenc->convert(uenc->userData, p); if (c & ~0xFFFF) return 0; return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF); } static int PTRFASTCALL unknown_isNmstrt(const ENCODING *enc, const char *p) { const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); int c = uenc->convert(uenc->userData, p); if (c & ~0xFFFF) return 0; return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF); } static int PTRFASTCALL unknown_isInvalid(const ENCODING *enc, const char *p) { const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); int c = uenc->convert(uenc->userData, p); return (c & ~0xFFFF) || checkCharRefNumber(c) < 0; } -static void PTRCALL +static enum XML_Convert_Result PTRCALL unknown_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim, char **toP, const char *toLim) { const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); char buf[XML_UTF8_ENCODE_MAX]; for (;;) { const char *utf8; int n; if (*fromP == fromLim) - break; + return XML_CONVERT_COMPLETED; utf8 = uenc->utf8[(unsigned char)**fromP]; n = *utf8++; if (n == 0) { int c = uenc->convert(uenc->userData, *fromP); n = XmlUtf8Encode(c, buf); if (n > toLim - *toP) - break; + return XML_CONVERT_OUTPUT_EXHAUSTED; utf8 = buf; *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP] - (BT_LEAD2 - 2)); } else { if (n > toLim - *toP) - break; + return XML_CONVERT_OUTPUT_EXHAUSTED; (*fromP)++; } do { *(*toP)++ = *utf8++; } while (--n != 0); } } -static void PTRCALL +static enum XML_Convert_Result PTRCALL unknown_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim, unsigned short **toP, const unsigned short *toLim) { const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc); - while (*fromP != fromLim && *toP != toLim) { + while (*fromP < fromLim && *toP < toLim) { unsigned short c = uenc->utf16[(unsigned char)**fromP]; if (c == 0) { c = (unsigned short) uenc->convert(uenc->userData, *fromP); *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP] - (BT_LEAD2 - 2)); } else (*fromP)++; *(*toP)++ = c; } + + if ((*toP == toLim) && (*fromP < fromLim)) + return XML_CONVERT_OUTPUT_EXHAUSTED; + else + return XML_CONVERT_COMPLETED; } ENCODING * XmlInitUnknownEncoding(void *mem, int *table, CONVERTER convert, void *userData) { int i; struct unknown_encoding *e = (struct unknown_encoding *)mem; for (i = 0; i < (int)sizeof(struct normal_encoding); i++) ((char *)mem)[i] = ((char *)&latin1_encoding)[i]; for (i = 0; i < 128; i++) if (latin1_encoding.type[i] != BT_OTHER && latin1_encoding.type[i] != BT_NONXML && table[i] != i) return 0; for (i = 0; i < 256; i++) { int c = table[i]; if (c == -1) { e->normal.type[i] = BT_MALFORM; /* This shouldn't really get used. */ e->utf16[i] = 0xFFFF; e->utf8[i][0] = 1; e->utf8[i][1] = 0; } else if (c < 0) { if (c < -4) return 0; e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2)); @@ -1476,61 +1572,61 @@ getEncodingIndex(const char *name) if (streqci(name, encodingNames[i])) return i; return UNKNOWN_ENC; } /* For binary compatibility, we store the index of the encoding specified at initialization in the isUtf16 member. */ #define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16) #define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)i) /* This is what detects the encoding. encodingTable maps from encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of the external (protocol) specified encoding; state is XML_CONTENT_STATE if we're parsing an external text entity, and XML_PROLOG_STATE otherwise. */ static int initScan(const ENCODING * const *encodingTable, const INIT_ENCODING *enc, int state, const char *ptr, const char *end, const char **nextTokPtr) { const ENCODING **encPtr; - if (ptr == end) + if (ptr >= end) return XML_TOK_NONE; encPtr = enc->encPtr; if (ptr + 1 == end) { /* only a single byte available for auto-detection */ #ifndef XML_DTD /* FIXME */ /* a well-formed document entity must have more than one byte */ if (state != XML_CONTENT_STATE) return XML_TOK_PARTIAL; #endif /* so we're parsing an external text entity... */ /* if UTF-16 was externally specified, then we need at least 2 bytes */ switch (INIT_ENC_INDEX(enc)) { case UTF_16_ENC: case UTF_16LE_ENC: case UTF_16BE_ENC: return XML_TOK_PARTIAL; } switch ((unsigned char)*ptr) { case 0xFE: case 0xFF: case 0xEF: /* possibly first byte of UTF-8 BOM */ if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE) break; /* fall through */ case 0x00: case 0x3C: return XML_TOK_PARTIAL; } } |