From: ben at csgb dot de Operating system: possibly all PHP version: Irrelevant PHP Bug Type: Strings related Bug description: partially incorrect utf8 to htmlentities mapping
Description: ------------ While double-checking after Bug #28042 was closed, I discovered some more mistakes in that file. I only checked the UTF-8 tables; I don't know whether the other charsets are wrong, too. In Bug #28042, we forgot two letters of the Greek table, 'upsih' and 'piv', which are spelled with an 'i' as in ice instead of a '1'. Also, some NULLs are missing at several points. This causes htmlentities(,,"UTF-8") to convert UTF-8 encoded chars into the wrong HTML entities, or into none at all, since the mappings are shifted. For example, U+202F is mapped to &permil; (‰), which should be mapped from U+2030. Here is my diff of php5-cvs/ext/standard/html.c; the same modifications should be made in php-4.3. Please double-check. --- html.c 2004-04-18 02:30:24.000000000 +0200 +++ html.c.fixed 2004-04-19 18:44:47.949012992 +0200 @@ -114,13 +114,13 @@ /* 354 - 375 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 376 */ "Yuml", /* 377 - 401 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 402 */ "fnof" }; @@ -130,7 +130,7 @@ "circ", /* 711 - 731 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 732 */ "tilde", }; @@ -147,9 +147,9 @@ "sigmaf", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega", /* 970 - 976 are not mapped */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, - "thetasym", "ups1h", + "thetasym", "upsih", NULL, NULL, NULL, - "p1v" + "piv" }; static entity_table_t ent_uni_punct[] = { @@ -158,7 +158,7 @@ "thinsp", NULL, NULL, "zwnj", "zwj", "lrm", "rlm", NULL, NULL, NULL, "ndash", "mdash", NULL, NULL, NULL, "lsquo", "rsquo", "sbquo", NULL, "ldquo", 
"rdquo", "bdquo", - "dagger", "Dagger", "bull", NULL, NULL, NULL, "hellip", + NULL, "dagger", "Dagger", "bull", NULL, NULL, NULL, "hellip", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "permil", NULL, "prime", "Prime", NULL, NULL, NULL, NULL, NULL, "lsaquo", "rsaquo", NULL, NULL, NULL, "oline", NULL, NULL, NULL, NULL, NULL, @@ -191,7 +191,7 @@ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 8624 (0x21b0) */ - NULL, NULL, NULL, NULL, "crarr", NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, "crarr", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 8640 (0x21c0) */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, @@ -206,9 +206,9 @@ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 8704 (0x2200) */ "forall", "comp", "part", "exist", "nexist", "empty", NULL, "nabla", - "isin", "notin", "epsis", NULL, "ni", "bepsi", NULL, "prod", + "isin", "notin", "epsis", "ni", NULL, "bepsi", NULL, "prod", /* 8720 (0x2210) */ - "coprod", "sum", "minus", "mnplus", "plusdo", NULL, "setmn", NULL, + "coprod", "sum", "minus", "mnplus", "plusdo", NULL, "setmn", "lowast", "compfn", NULL, "radic", NULL, NULL, "prop", "infin", "ang90", /* 8736 (0x2220) */ "ang", "angmsd", "angsph", "mid", "nmid", "par", "npar", "and", @@ -232,17 +232,19 @@ "npr", "nsc", "sub", "sup", "nsub", "nsup", "sube", "supe", /* 8840 - 8852 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, /* 8853 */ "oplus", NULL, "otimes", /* 8856 - 8868 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, /* 8869 */ "perp", /* 8870 - 8901 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, + NULL, /* 8901 */ "sdot", /* 8902 - 8967 */ @@ -252,14 +254,13 @@ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, /* 8968 */ "lceil", "rceil", "lfloor", "rfloor", /* 8969 - 9000 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 9001 */ "lang", "rang", };eck. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 9001 */ "lang", "rang", }; Additionally, I wonder whether it is wise to map those high Unicode chars (most of ent_uni_8592_9002, but not all) to entities that are not HTML-standards compliant. Not all browsers might interpret them correctly. It would probably be better to implement a function that maps them to numeric character references such as &#8661; (hexadecimal: &#x21D5;) instead of &vArr; (⇕). (These entities are defined in the ISO and SGML standards but not in HTML.) -- Edit bug report at http://bugs.php.net/?id=28067&edit=1 -- Try a CVS snapshot (php4): http://bugs.php.net/fix.php?id=28067&r=trysnapshot4 Try a CVS snapshot (php5): http://bugs.php.net/fix.php?id=28067&r=trysnapshot5 Fixed in CVS: http://bugs.php.net/fix.php?id=28067&r=fixedcvs Fixed in release: http://bugs.php.net/fix.php?id=28067&r=alreadyfixed Need backtrace: http://bugs.php.net/fix.php?id=28067&r=needtrace Need Reproduce Script: http://bugs.php.net/fix.php?id=28067&r=needscript Try newer version: http://bugs.php.net/fix.php?id=28067&r=oldversion Not developer issue: http://bugs.php.net/fix.php?id=28067&r=support Expected behavior: http://bugs.php.net/fix.php?id=28067&r=notwrong Not enough info: http://bugs.php.net/fix.php?id=28067&r=notenoughinfo Submitted twice: http://bugs.php.net/fix.php?id=28067&r=submittedtwice register_globals: http://bugs.php.net/fix.php?id=28067&r=globals PHP 3 support discontinued: http://bugs.php.net/fix.php?id=28067&r=php3 Daylight Savings: http://bugs.php.net/fix.php?id=28067&r=dst IIS Stability: 
http://bugs.php.net/fix.php?id=28067&r=isapi Install GNU Sed: http://bugs.php.net/fix.php?id=28067&r=gnused Floating point limitations: http://bugs.php.net/fix.php?id=28067&r=float