From:             ben at csgb dot de
Operating system: possibly all
PHP version:      Irrelevant
PHP Bug Type:     Strings related
Bug description:  partially incorrect utf8 to htmlentities mapping

Description:
------------
While double-checking after Bug #28042 was closed, I discovered some
more mistakes in that file. I only checked the UTF-8 tables; I don't know
whether the other charsets are wrong, too.

In Bug #28042, we forgot two letters of the Greek table, 'upsih' and
'piv', which are spelled with an 'i' (as in "ice") instead of a '1'.

Also, NULLs are missing at several points. This causes
htmlentities(,,"UTF-8") to convert UTF-8 encoded characters into the wrong
HTML entity, or into none at all, because the mappings are shifted. For
example, U+202F is mapped to ‰ (permil), which should be U+2030.

Here is my diff of php5-cvs/ext/standard/html.c. The same
modifications should be made in php-4.3; please double-check.

--- html.c      2004-04-18 02:30:24.000000000 +0200
+++ html.c.fixed        2004-04-19 18:44:47.949012992 +0200
@@ -114,13 +114,13 @@
        /* 354 - 375 */
        NULL, NULL, NULL, NULL, NULL, NULL, NULL,
        NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-       NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+       NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
        /* 376 */
        "Yuml",
        /* 377 - 401 */
        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-       NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+       NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
        /* 402 */
        "fnof"
 };
@@ -130,7 +130,7 @@
        "circ",
        /* 711 - 731 */
        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-       NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+       NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
        /* 732 */
        "tilde",
 };
@@ -147,9 +147,9 @@
        "sigmaf", "sigma", "tau", "upsilon", "phi", "chi", "psi",
"omega",
        /* 970 - 976 are not mapped */
        NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-       "thetasym", "ups1h",
+       "thetasym", "upsih",
        NULL, NULL, NULL,
-       "p1v"
+       "piv"
 };

 static entity_table_t ent_uni_punct[] = {
@@ -158,7 +158,7 @@
        "thinsp", NULL, NULL, "zwnj", "zwj", "lrm", "rlm",
        NULL, NULL, NULL, "ndash", "mdash", NULL, NULL, NULL,
        "lsquo", "rsquo", "sbquo", NULL, "ldquo", "rdquo", "bdquo",
-       "dagger", "Dagger",     "bull", NULL, NULL, NULL, "hellip",
+       NULL, "dagger", "Dagger", "bull", NULL, NULL, NULL, "hellip",
        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "permil",
NULL,
        "prime", "Prime", NULL, NULL, NULL, NULL, NULL, "lsaquo",
"rsaquo",
        NULL, NULL, NULL, "oline", NULL, NULL, NULL, NULL, NULL,
@@ -191,7 +191,7 @@
        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
        /* 8624 (0x21b0) */
-       NULL, NULL, NULL, NULL, "crarr", NULL, NULL, NULL,
+       NULL, NULL, NULL, NULL, NULL, "crarr", NULL, NULL,
        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
        /* 8640 (0x21c0) */
        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
@@ -206,9 +206,9 @@
        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
        /* 8704 (0x2200) */
        "forall", "comp", "part", "exist", "nexist", "empty", NULL,
"nabla",
-       "isin", "notin", "epsis", NULL, "ni", "bepsi", NULL, "prod",
+       "isin", "notin", "epsis", "ni", NULL, "bepsi", NULL, "prod",
        /* 8720 (0x2210) */
-       "coprod", "sum", "minus", "mnplus", "plusdo", NULL, "setmn",
NULL,
+       "coprod", "sum", "minus", "mnplus", "plusdo", NULL, "setmn",
"lowast",
        "compfn", NULL, "radic", NULL, NULL, "prop", "infin", "ang90",
        /* 8736 (0x2220) */
        "ang", "angmsd", "angsph", "mid", "nmid", "par", "npar", "and",
@@ -232,17 +232,19 @@
        "npr", "nsc", "sub", "sup", "nsub", "nsup", "sube", "supe",
        /* 8840 - 8852 */
        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL,
+    NULL,
        /* 8853 */
        "oplus", NULL, "otimes",
        /* 8856 - 8868 */
        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL,
+    NULL,
        /* 8869 */
        "perp",
        /* 8870 - 8901 */
        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-       NULL,
+       NULL,
        /* 8901 */
        "sdot",
        /* 8902 - 8967 */
@@ -252,14 +254,13 @@
        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-       NULL, NULL, NULL, NULL, NULL,
+       NULL, NULL, NULL, NULL, NULL, NULL,
        /* 8968 */
        "lceil", "rceil", "lfloor", "rfloor",
        /* 8969 - 9000 */
        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
        NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-       NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-       NULL,
+       NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
        /* 9001 */
        "lang", "rang",
 };

 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
        /* 9001 */
        "lang", "rang",
 };


Additionally, I wonder whether it is wise to map those high Unicode chars
(most of ent_uni_8592_9002, but not all) to entities that are not
HTML-standards compliant. Not all browsers may interpret them correctly.
It would probably be better to implement a function that maps them to
numeric character references like &#8661; (hexadecimal: &#x21D5;) instead
of ⇕. (These entities are in the ISO and SGML standards but not in HTML.)


-- 
Edit bug report at http://bugs.php.net/?id=28067&edit=1
-- 
Try a CVS snapshot (php4):  http://bugs.php.net/fix.php?id=28067&r=trysnapshot4
Try a CVS snapshot (php5):  http://bugs.php.net/fix.php?id=28067&r=trysnapshot5
Fixed in CVS:               http://bugs.php.net/fix.php?id=28067&r=fixedcvs
Fixed in release:           http://bugs.php.net/fix.php?id=28067&r=alreadyfixed
Need backtrace:             http://bugs.php.net/fix.php?id=28067&r=needtrace
Need Reproduce Script:      http://bugs.php.net/fix.php?id=28067&r=needscript
Try newer version:          http://bugs.php.net/fix.php?id=28067&r=oldversion
Not developer issue:        http://bugs.php.net/fix.php?id=28067&r=support
Expected behavior:          http://bugs.php.net/fix.php?id=28067&r=notwrong
Not enough info:            http://bugs.php.net/fix.php?id=28067&r=notenoughinfo
Submitted twice:            http://bugs.php.net/fix.php?id=28067&r=submittedtwice
register_globals:           http://bugs.php.net/fix.php?id=28067&r=globals
PHP 3 support discontinued: http://bugs.php.net/fix.php?id=28067&r=php3
Daylight Savings:           http://bugs.php.net/fix.php?id=28067&r=dst
IIS Stability:              http://bugs.php.net/fix.php?id=28067&r=isapi
Install GNU Sed:            http://bugs.php.net/fix.php?id=28067&r=gnused
Floating point limitations: http://bugs.php.net/fix.php?id=28067&r=float

Reply via email to