cataphract Mon, 11 Oct 2010 22:26:10 +0000 Revision: http://svn.php.net/viewvc?view=revision&revision=304326
Log: - PHP uses a big-endian representation when it converts code unit sequences to integers for storage in the entity maps, but the code in traverse_for_entities assumed little-endian. Fixed. (In practice this doesn't matter, because there are no unicode or entity mappings for multi-byte encodings other than UTF-8, so the relevant code was commented out for performance reasons.) Changed paths: U php/php-src/trunk/ext/standard/html.c Modified: php/php-src/trunk/ext/standard/html.c =================================================================== --- php/php-src/trunk/ext/standard/html.c 2010-10-11 20:09:25 UTC (rev 304325) +++ php/php-src/trunk/ext/standard/html.c 2010-10-11 22:26:10 UTC (rev 304326) @@ -456,7 +456,7 @@ /* }}} */ /* {{{ php_utf32_utf8 */ -size_t php_utf32_utf8(unsigned char *buf, int k) +static size_t php_utf32_utf8(unsigned char *buf, unsigned k) { size_t retval = 0; @@ -487,6 +487,47 @@ } /* }}} */ +/* {{{ php_mb2_int_to_char + * Convert back big endian int representation of sequence of one or two 8-bit code units. */ +static size_t php_mb2_int_to_char(unsigned char *buf, unsigned k) +{ + assert(k <= 0xFFFFU); + /* one or two bytes */ + if (k <= 0xFFU) { /* 1 */ + buf[0] = k; + return 1U; + } else { /* 2 */ + buf[0] = k >> 8; + buf[1] = k & 0xFFU; + return 2U; + } +} +/* }}} */ + +/* {{{ php_mb3_int_to_char + * Convert back big endian int representation of sequence of one to three 8-bit code units. + * For EUC-JP. */ +static size_t php_mb3_int_to_char(unsigned char *buf, unsigned k) +{ + assert(k <= 0xFFFFFFU); + /* one to three bytes */ + if (k <= 0xFFU) { /* 1 */ + buf[0] = k; + return 1U; + } else if (k <= 0xFFFFU) { /* 2 */ + buf[0] = k >> 8; + buf[1] = k & 0xFFU; + return 2U; + } else { + buf[0] = k >> 16; + buf[1] = (k >> 8) & 0xFFU; + buf[2] = k & 0xFFU; + return 3U; + } +} +/* }}} */ + + /* {{{ unimap_bsearc_cmp * Binary search of unicode code points in unicode <--> charset mapping. 
* Returns the code point in the target charset (whose mapping table was given) or 0 if @@ -817,21 +858,23 @@ case cs_big5hkscs: case cs_sjis: case cs_gb2312: - /* one or two bytes */ - *(q++) = (code & 0xFFU); - if (0xFF00U & code) { /* 2 */ - *(q++) = (code >> 8); - } + /* we don't have named entity or unicode mappings for these yet, + * so we're guaranteed code <= 0xFF */ +#if 0 + q += php_mb2_int_to_char((unsigned char*)q, code); +#else + assert(code <= 0xFFU); + *(q++) = code; +#endif break; case cs_eucjp: - /* one to three bytes */ - *(q++) = code & 0xFFU; - if (0xFFFF00U & code) { /* 2 */ - *(q++) = ((code >> 8) & 0xFFU); - if (0xFF0000U & code) /* 3 */ - *(q++) = (code >> 16); - } +#if 0 /* idem */ + q += php_mb2_int_to_char((unsigned char*)q, code); +#else + assert(code <= 0xFFU); + *(q++) = code; +#endif break; default:
-- PHP CVS Mailing List (http://www.php.net/) To unsubscribe, visit: http://www.php.net/unsub.php