[PHP-DEV] (#9392) Re: htmlspecial chars & htmlentities do not handle double byte charactersets

Wez Furlong Tue, 21 Aug 2001 07:05:27 -0700

Alan,

I've simplified your patch, but have not committed it yet,
because I cannot test it, so I would be grateful if you could
test it.

We need to use the charset name, not the locale
name; nl_langinfo(CODESET) returns it, so please let me know the output of the
following PHP script (assuming that you have nl_langinfo), so that I can use the
correct names to map the charsets to the cs_XXX enum values.

<?php
setlocale(LC_ALL,"zh_TW");
echo nl_langinfo(CODESET);
setlocale(LC_ALL,"zh_CN");
echo nl_langinfo(CODESET);
setlocale(LC_ALL,"zh_HK");
echo nl_langinfo(CODESET);
?>

BTW: unsigned short is 16 bits, not 8, so there is no
need to make it a long :-)

If you have nl_langinfo on your system, this PHP code should work
with Chinese characters:
<?php
setlocale(LC_ALL,"zh_TW");
echo htmlentities("chinese text", ENT_COMPAT, null);
?>

If you don't have nl_langinfo, or you don't have that locale
installed, then use this code instead:
<?php
echo htmlentities("chinese text", ENT_COMPAT, "big5");
?>

--Wez.

Index: html.c
===================================================================
RCS file: /repository/php4/ext/standard/html.c,v
retrieving revision 1.31
diff -u -r1.31 html.c
--- html.c      8 Aug 2001 20:00:09 -0000       1.31
+++ html.c      21 Aug 2001 13:58:18 -0000
@@ -35,7 +35,7 @@
    Defaults to ISO-8859-1 for now. */
 
 enum entity_charset { cs_terminator, cs_8859_1, cs_cp1252,
-       cs_8859_15, cs_utf_8 };
+       cs_8859_15, cs_utf_8, cs_big5, cs_gb2312, cs_big5hkscs };
 typedef const char * entity_table_t;
 
 /* codepage 1252 is a Windows extension to iso-8859-1. */
@@ -91,11 +91,14 @@
 };
 
 static const struct html_entity_map entity_map[] = {
-       { cs_cp1252,    0x80, 0x9f, ent_cp_1252 },
-       { cs_cp1252,    0xa0, 0xff, ent_iso_8859_1 },
+       { cs_cp1252,            0x80, 0x9f, ent_cp_1252 },
+       { cs_cp1252,            0xa0, 0xff, ent_iso_8859_1 },
        { cs_8859_1,            0xa0, 0xff, ent_iso_8859_1 },
        { cs_8859_15,           0xa0, 0xff, ent_iso_8859_15 },
        { cs_utf_8,             0xa0, 0xff, ent_iso_8859_1 },
+       { cs_big5,                      0xa0, 0xff, ent_iso_8859_1 },
+       { cs_gb2312,            0xa0, 0xff, ent_iso_8859_1 },
+       { cs_big5hkscs,         0xa0, 0xff, ent_iso_8859_1 },
        { cs_terminator }
 };
 
@@ -107,6 +110,8 @@
        { "ISO-8859-15",        cs_8859_15 },
        { "utf-8",                      cs_utf_8 },
        { "cp1252",             cs_cp1252 },
+       { "big5",                       cs_big5 },
+       { "GB2312",                     cs_gb2312 },
        { NULL }
 };
 
@@ -125,86 +130,111 @@
        
        mbseq[mbpos++] = (unsigned char)this_char;
        
-       if (charset == cs_utf_8)        {
-               unsigned long utf = 0;
-               int stat = 0;
-               int more = 1;
+       switch(charset) {
+               case cs_utf_8:
+                       {
+                               unsigned long utf = 0;
+                               int stat = 0;
+                               int more = 1;
 
-               /* unpack utf-8 encoding into a wide char.
-                * Code stolen from the mbstring extension */
-               
-               do {
-                       if (this_char < 0x80)   {
-                               more = 0;
-                               break;
-                       }
-                       else if (this_char < 0xc0)      {
-                               switch(stat)    {
-                                       case 0x10:      /* 2, 2nd */
-                                       case 0x21:      /* 3, 3rd */
-                                       case 0x32:      /* 4, 4th */
-                                       case 0x43:      /* 5, 5th */
-                                       case 0x54:      /* 6, 6th */
-                                               /* last byte in sequence */
+                               /* unpack utf-8 encoding into a wide char.
+                                * Code stolen from the mbstring extension */
+
+                               do {
+                                       if (this_char < 0x80)   {
                                                more = 0;
-                                               utf |= (this_char & 0x3f);
-                                               this_char = (unsigned short)utf;
                                                break;
-                                       case 0x20:      /* 3, 2nd */
-                                       case 0x31:      /* 4, 3rd */
-                                       case 0x42:      /* 5, 4th */
-                                       case 0x53:      /* 6, 5th */
-                                               /* penultimate char */
-                                               utf |= ((this_char & 0x3f) << 6);
-                                               stat++;
-                                               break;
-                                       case 0x30:      /* 4, 2nd */
-                                       case 0x41:      /* 5, 3rd */
-                                       case 0x52:      /* 6, 4th */
-                                               utf |= ((this_char & 0x3f) << 12);
-                                               stat++;
-                                               break;
-                                       case 0x40:      /* 5, 2nd */
-                                       case 0x51:
-                                               utf |= ((this_char & 0x3f) << 18);
-                                               stat++;
-                                               break;
-                                       case 0x50:      /* 6, 2nd */
-                                               utf |= ((this_char & 0x3f) << 24);
-                                               stat++;
-                                       default:
-                                               /* invalid */
+                                       }
+                                       else if (this_char < 0xc0)      {
+                                               switch(stat)    {
+                                                       case 0x10:      /* 2, 2nd */
+                                                       case 0x21:      /* 3, 3rd */
+                                                       case 0x32:      /* 4, 4th */
+                                                       case 0x43:      /* 5, 5th */
+                                                       case 0x54:      /* 6, 6th */
+                                                               /* last byte in 
+sequence */
+                                                               more = 0;
+                                                               utf |= (this_char & 
+0x3f);
+                                                               this_char = (unsigned 
+short)utf;
+                                                               break;
+                                                       case 0x20:      /* 3, 2nd */
+                                                       case 0x31:      /* 4, 3rd */
+                                                       case 0x42:      /* 5, 4th */
+                                                       case 0x53:      /* 6, 5th */
+                                                               /* penultimate char */
+                                                               utf |= ((this_char & 
+0x3f) << 6);
+                                                               stat++;
+                                                               break;
+                                                       case 0x30:      /* 4, 2nd */
+                                                       case 0x41:      /* 5, 3rd */
+                                                       case 0x52:      /* 6, 4th */
+                                                               utf |= ((this_char & 
+0x3f) << 12);
+                                                               stat++;
+                                                               break;
+                                                       case 0x40:      /* 5, 2nd */
+                                                       case 0x51:
+                                                               utf |= ((this_char & 
+0x3f) << 18);
+                                                               stat++;
+                                                               break;
+                                                       case 0x50:      /* 6, 2nd */
+                                                               utf |= ((this_char & 
+0x3f) << 24);
+                                                               stat++;
+                                                       default:
+                                                               /* invalid */
+                                                               more = 0;
+                                               }
+                                       }
+                                       /* lead byte */
+                                       else if (this_char < 0xe0) {
+                                               stat = 0x10;    /* 2 byte */
+                                               utf = (this_char & 0x1f) << 6;
+                                       } else if (this_char < 0xf0)    {
+                                               stat = 0x20;    /* 3 byte */
+                                               utf = (this_char & 0xf) << 12;
+                                       } else if (this_char < 0xf8) {
+                                               stat = 0x30;    /* 4 byte */
+                                               utf = (this_char & 0x7) << 18;
+                                       } else if (this_char < 0xfc)    {
+                                               stat = 0x40;    /* 5 byte */
+                                               utf = (this_char & 0x3) << 24;
+                                       } else if (this_char < 0xfe)    {
+                                               stat = 0x50;    /* 6 byte */
+                                               utf = (this_char & 0x1) << 30;
+                                       }
+                                       else    {
+                                               /* invalid; bail */
                                                more = 0;
-                               }
-                       }
-                       /* lead byte */
-                       else if (this_char < 0xe0) {
-                               stat = 0x10;    /* 2 byte */
-                               utf = (this_char & 0x1f) << 6;
-                       } else if (this_char < 0xf0)    {
-                               stat = 0x20;    /* 3 byte */
-                               utf = (this_char & 0xf) << 12;
-                       } else if (this_char < 0xf8) {
-                               stat = 0x30;    /* 4 byte */
-                               utf = (this_char & 0x7) << 18;
-                       } else if (this_char < 0xfc)    {
-                               stat = 0x40;    /* 5 byte */
-                               utf = (this_char & 0x3) << 24;
-                       } else if (this_char < 0xfe)    {
-                               stat = 0x50;    /* 6 byte */
-                               utf = (this_char & 0x1) << 30;
-                       }
-                       else    {
-                               /* invalid; bail */
-                               more = 0;
-                               break;
+                                               break;
+                                       }
+                                       if (more)
+                                       {
+                                               this_char = str[pos++];
+                                               mbseq[mbpos++] = (unsigned 
+char)this_char;
+                                       }
+                               } while(more);
                        }
-                       if (more)
+                       break;
+               case cs_big5:
+               case cs_gb2312:
+               case cs_big5hkscs:
                        {
-                               this_char = str[pos++];
-                               mbseq[mbpos++] = (unsigned char)this_char;
+                               /* check if this is the first of a 2-byte sequence */
+                               if (this_char >= 0xa1 && this_char <= 0xf9)     {
+                                       /* peek at the next char */
+                                       unsigned char next_char = str[pos];
+                                       if ((next_char >= 0x40 && next_char <= 0x73) ||
+                                                       (next_char >= 0xa1 && 
+next_char <= 0xfe))
+                                       {
+                                               /* yes, this a wide char */
+                                               this_char <<= 8;
+                                               mbseq[mbpos++] = next_char;
+                                               this_char |= next_char;
+                                               pos++;
+                                       }
+                                       
+                               }
+                               break;
                        }
-               } while(more);
        }
        *newpos = pos;
        mbseq[mbpos] = '\0';
@@ -300,7 +330,7 @@
                                                                                       
                 in a multibyte sequence
                                                                                       
                 it should be more than enough.. */
                unsigned short this_char = get_next_char(charset, old, &i, mbsequence, 
&mbseqlen);
-               int matches_map = 0;
+               int matches_map;
                
                if (len + 9 > maxlen)
                        new = erealloc (new, maxlen += 128);
@@ -309,7 +339,9 @@
                        /* look for a match in the maps for this charset */
                        int j;
                        unsigned char * rep;
-               
+       
+                       matches_map = 0;
+
                        for (j=0; entity_map[j].charset != cs_terminator; j++)  {
                                if (entity_map[j].charset == charset
                                                && this_char >= entity_map[j].basechar

-- 
PHP Development Mailing List <http://www.php.net/>
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]
To contact the list administrators, e-mail: [EMAIL PROTECTED]

[PHP-DEV] (#9392) Re: htmlspecial chars & htmlentities do not handle double byte charactersets

Reply via email to