ID: 28067 User updated by: ben at csgb dot de Reported By: ben at csgb dot de Status: Open Bug Type: Strings related Operating System: possibly all PHP Version: Irrelevant New Comment:
sorry, please be careful when using the diff, have to learn to copy and paste correctly )-; the diff ends after the first: + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 9001 */ "lang", "rang", }; without the "eck" Previous Comments: ------------------------------------------------------------------------ [2004-04-19 20:46:26] ben at csgb dot de Description: ------------ During some doublecheck after Bug #28042 was closed, I discovered some more mistakes in that file. I just checked the UTF-8 tables, don't know if the other charsets are wrong, too. In Bug #28042, We forgot two letters of the greek table, 'upsih' and 'piv', which are spelled with an 'i' as in ice instead of '1'. Also there are some NULLs missing at several points. This causes htmlentities(,,"UTF-8") to convert UTF-8 encoded chars into the wrong or into no HTML-Entities since the mappings are shifted. For example U+202F is mapped to ‰ which should be U+2030. Here is my diff of the php5-cvs/ext/standard/html.c, the same modifications should be made in php-4.3, please double check --- html.c 2004-04-18 02:30:24.000000000 +0200 +++ html.c.fixed 2004-04-19 18:44:47.949012992 +0200 @@ -114,13 +114,13 @@ /* 354 - 375 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 376 */ "Yuml", /* 377 - 401 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 402 */ "fnof" }; @@ -130,7 +130,7 @@ "circ", /* 711 - 731 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 732 */ "tilde", }; @@ -147,9 +147,9 @@ "sigmaf", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega", /* 970 - 976 are not mapped */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, - "thetasym", "ups1h", + "thetasym", "upsih", NULL, NULL, NULL, - "p1v" + "piv" }; static entity_table_t ent_uni_punct[] = { @@ -158,7 +158,7 @@ "thinsp", NULL, NULL, "zwnj", "zwj", "lrm", "rlm", NULL, NULL, NULL, "ndash", "mdash", NULL, NULL, NULL, "lsquo", "rsquo", "sbquo", NULL, "ldquo", "rdquo", "bdquo", - "dagger", "Dagger", "bull", NULL, NULL, NULL, "hellip", + NULL, "dagger", "Dagger", "bull", NULL, NULL, NULL, "hellip", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "permil", NULL, "prime", "Prime", NULL, NULL, NULL, NULL, NULL, "lsaquo", "rsaquo", NULL, NULL, NULL, "oline", NULL, NULL, NULL, NULL, NULL, @@ -191,7 +191,7 @@ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 8624 (0x21b0) */ - NULL, NULL, NULL, NULL, "crarr", NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, "crarr", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 8640 (0x21c0) */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, @@ -206,9 +206,9 @@ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 8704 (0x2200) */ "forall", "comp", "part", "exist", "nexist", "empty", NULL, "nabla", - "isin", "notin", "epsis", NULL, "ni", "bepsi", NULL, "prod", + "isin", "notin", "epsis", "ni", NULL, "bepsi", NULL, "prod", /* 8720 (0x2210) */ - "coprod", "sum", "minus", "mnplus", "plusdo", NULL, "setmn", NULL, + "coprod", "sum", "minus", "mnplus", "plusdo", NULL, "setmn", "lowast", "compfn", NULL, "radic", NULL, NULL, "prop", "infin", "ang90", /* 8736 (0x2220) */ "ang", "angmsd", "angsph", "mid", "nmid", "par", "npar", "and", @@ -232,17 +232,19 @@ "npr", "nsc", "sub", "sup", "nsub", "nsup", "sube", "supe", /* 8840 - 8852 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, /* 8853 */ "oplus", NULL, "otimes", /* 8856 - 8868 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, /* 8869 */ "perp", /* 8870 - 8901 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, + NULL, /* 8901 */ "sdot", /* 8902 - 8967 */ @@ -252,14 +254,13 @@ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, /* 8968 */ "lceil", "rceil", "lfloor", "rfloor", /* 8969 - 9000 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 9001 */ "lang", "rang", };eck. NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 9001 */ "lang", "rang", }; Additionally I wonder, if it's wise to map those high (Most of the ent_uni_8592_9002, but not all) Unicode chars to non HTML-standards compliant entities. Not all browsers might interpret them correct. Probably it would be better to implement a function, which maps them to hexadecimal values like &x8661; instead of ⇕ (These are in ISO and SGML-Standards but not in HTML) ------------------------------------------------------------------------ -- Edit this bug report at http://bugs.php.net/?id=28067&edit=1