ID: 28067 Updated by: php-bugs@lists.php.net Reported By: ben at csgb dot de -Status: Feedback +Status: No Feedback Bug Type: Strings related Operating System: possibly all PHP Version: 4, 5, who knows Assigned To: derick New Comment:
No feedback was provided for this bug for over a week, so it is being suspended automatically. If you are able to provide the information that was originally requested, please do so and change the status of the bug back to "Open". Previous Comments: ------------------------------------------------------------------------ [2005-02-21 20:21:22] [EMAIL PROTECTED] Please try using this CVS snapshot: http://snaps.php.net/php4-STABLE-latest.tar.gz For Windows: http://snaps.php.net/win32/php4-win32-STABLE-latest.zip ------------------------------------------------------------------------ [2004-04-20 17:50:06] [EMAIL PROTECTED] received the patch, but it doesn't look 100% correct so I need to do some investigations. ------------------------------------------------------------------------ [2004-04-20 09:10:06] [EMAIL PROTECTED] Hello, can you please mail this patch to me, as the bug system garbled it a bit. regards, Derick ------------------------------------------------------------------------ [2004-04-19 20:51:01] ben at csgb dot de sorry, please be careful when using the diff, have to learn to copy and paste correctly )-; the diff ends after the first: + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 9001 */ "lang", "rang", }; without the "eck" ------------------------------------------------------------------------ [2004-04-19 20:46:26] ben at csgb dot de Description: ------------ During some doublecheck after Bug #28042 was closed, I discovered some more mistakes in that file. I just checked the UTF-8 tables; I don't know if the other charsets are wrong, too. In Bug #28042, we forgot two letters of the Greek table, 'upsih' and 'piv', which are spelled with an 'i' as in ice instead of '1'. Also there are some NULLs missing at several points. This causes htmlentities(,,"UTF-8") to convert UTF-8 encoded chars into the wrong HTML entities, or into none at all, since the mappings are shifted. For example, U+202F is mapped to ‰, which should be the entity for U+2030. 
Here is my diff of the php5-cvs/ext/standard/html.c, the same modifications should be made in php-4.3, please double check --- html.c 2004-04-18 02:30:24.000000000 +0200 +++ html.c.fixed 2004-04-19 18:44:47.949012992 +0200 @@ -114,13 +114,13 @@ /* 354 - 375 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 376 */ "Yuml", /* 377 - 401 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 402 */ "fnof" }; @@ -130,7 +130,7 @@ "circ", /* 711 - 731 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 732 */ "tilde", }; @@ -147,9 +147,9 @@ "sigmaf", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega", /* 970 - 976 are not mapped */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, - "thetasym", "ups1h", + "thetasym", "upsih", NULL, NULL, NULL, - "p1v" + "piv" }; static entity_table_t ent_uni_punct[] = { @@ -158,7 +158,7 @@ "thinsp", NULL, NULL, "zwnj", "zwj", "lrm", "rlm", NULL, NULL, NULL, "ndash", "mdash", NULL, NULL, NULL, "lsquo", "rsquo", "sbquo", NULL, "ldquo", "rdquo", "bdquo", - "dagger", "Dagger", "bull", NULL, NULL, NULL, "hellip", + NULL, "dagger", "Dagger", "bull", NULL, NULL, NULL, "hellip", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "permil", NULL, "prime", "Prime", NULL, NULL, NULL, NULL, NULL, "lsaquo", "rsaquo", NULL, NULL, NULL, "oline", NULL, NULL, NULL, NULL, NULL, @@ -191,7 +191,7 @@ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 8624 (0x21b0) */ - NULL, NULL, NULL, NULL, "crarr", NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, "crarr", NULL, NULL, NULL, NULL, NULL, 
NULL, NULL, NULL, NULL, NULL, /* 8640 (0x21c0) */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, @@ -206,9 +206,9 @@ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 8704 (0x2200) */ "forall", "comp", "part", "exist", "nexist", "empty", NULL, "nabla", - "isin", "notin", "epsis", NULL, "ni", "bepsi", NULL, "prod", + "isin", "notin", "epsis", "ni", NULL, "bepsi", NULL, "prod", /* 8720 (0x2210) */ - "coprod", "sum", "minus", "mnplus", "plusdo", NULL, "setmn", NULL, + "coprod", "sum", "minus", "mnplus", "plusdo", NULL, "setmn", "lowast", "compfn", NULL, "radic", NULL, NULL, "prop", "infin", "ang90", /* 8736 (0x2220) */ "ang", "angmsd", "angsph", "mid", "nmid", "par", "npar", "and", @@ -232,17 +232,19 @@ "npr", "nsc", "sub", "sup", "nsub", "nsup", "sube", "supe", /* 8840 - 8852 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, /* 8853 */ "oplus", NULL, "otimes", /* 8856 - 8868 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, /* 8869 */ "perp", /* 8870 - 8901 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, + NULL, /* 8901 */ "sdot", /* 8902 - 8967 */ @@ -252,14 +254,13 @@ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, /* 8968 */ "lceil", "rceil", "lfloor", "rfloor", /* 8969 - 9000 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 9001 */ "lang", "rang", };eck. 
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* 9001 */ "lang", "rang", }; Additionally, I wonder whether it's wise to map those high Unicode chars (most of the ent_uni_8592_9002, but not all) to entities that are not HTML-standard compliant. Not all browsers might interpret them correctly. It would probably be better to implement a function which maps them to hexadecimal numeric character references like &#x21D5; (decimal &#8661;) instead of ⇕. (These are in the ISO and SGML standards but not in HTML.) ------------------------------------------------------------------------ -- Edit this bug report at http://bugs.php.net/?id=28067&edit=1