cataphract Mon, 11 Oct 2010 22:26:10 +0000
Revision: http://svn.php.net/viewvc?view=revision&revision=304326
Log:
- PHP uses a big endian representation when it converts the
code unit sequences to integers so as to store the entity
maps. Code in traverse_for_entities assumed little
endian. Fixed.
(in practice, due to the absence of unicode and entity
mappings for multi-byte encodings -- except UTF-8 --, this
doesn't matter, so the relevant code was commented out for
performance reasons).
Changed paths:
U php/php-src/trunk/ext/standard/html.c
Modified: php/php-src/trunk/ext/standard/html.c
===================================================================
--- php/php-src/trunk/ext/standard/html.c 2010-10-11 20:09:25 UTC (rev 304325)
+++ php/php-src/trunk/ext/standard/html.c 2010-10-11 22:26:10 UTC (rev 304326)
@@ -456,7 +456,7 @@
/* }}} */
/* {{{ php_utf32_utf8 */
-size_t php_utf32_utf8(unsigned char *buf, int k)
+static size_t php_utf32_utf8(unsigned char *buf, unsigned k)
{
size_t retval = 0;
@@ -487,6 +487,47 @@
}
/* }}} */
+/* {{{ php_mb2_int_to_char
+ * Convert back big endian int representation of sequence of one or two 8-bit code units. */
+static size_t php_mb2_int_to_char(unsigned char *buf, unsigned k)
+{
+ assert(k <= 0xFFFFU);
+ /* one or two bytes */
+ if (k <= 0xFFU) { /* 1 */
+ buf[0] = k;
+ return 1U;
+ } else { /* 2 */
+ buf[0] = k >> 8;
+ buf[1] = k & 0xFFU;
+ return 2U;
+ }
+}
+/* }}} */
+
+/* {{{ php_mb3_int_to_char
+ * Convert back big endian int representation of sequence of one to three 8-bit code units.
+ * For EUC-JP. */
+static size_t php_mb3_int_to_char(unsigned char *buf, unsigned k)
+{
+ assert(k <= 0xFFFFFFU);
+ /* one to three bytes */
+ if (k <= 0xFFU) { /* 1 */
+ buf[0] = k;
+ return 1U;
+ } else if (k <= 0xFFFFU) { /* 2 */
+ buf[0] = k >> 8;
+ buf[1] = k & 0xFFU;
+ return 2U;
+ } else {
+ buf[0] = k >> 16;
+ buf[1] = (k >> 8) & 0xFFU;
+ buf[2] = k & 0xFFU;
+ return 3U;
+ }
+}
+/* }}} */
+
+
/* {{{ unimap_bsearc_cmp
* Binary search of unicode code points in unicode <--> charset mapping.
* Returns the code point in the target charset (whose mapping table was given) or 0 if
@@ -817,21 +858,23 @@
case cs_big5hkscs:
case cs_sjis:
case cs_gb2312:
- /* one or two bytes */
- *(q++) = (code & 0xFFU);
- if (0xFF00U & code) { /* 2 */
- *(q++) = (code >> 8);
- }
+ /* we don't have named entity or unicode mappings for these yet,
+ * so we're guaranteed code <= 0xFF */
+#if 0
+ q += php_mb2_int_to_char((unsigned char*)q, code);
+#else
+ assert(code <= 0xFFU);
+ *(q++) = code;
+#endif
break;
case cs_eucjp:
- /* one to three bytes */
- *(q++) = code & 0xFFU;
- if (0xFFFF00U & code) { /* 2 */
- *(q++) = ((code >> 8) & 0xFFU);
- if (0xFF0000U & code) /* 3 */
- *(q++) = (code >> 16);
- }
+#if 0 /* idem */
+ q += php_mb2_int_to_char((unsigned char*)q, code);
+#else
+ assert(code <= 0xFFU);
+ *(q++) = code;
+#endif
break;
default:
--
PHP CVS Mailing List (http://www.php.net/)
To unsubscribe, visit: http://www.php.net/unsub.php