Alan, I've simplified your patch, but have not committed it yet, because I cannot test it, so I would be grateful if you could test it. We need to use the charset name, not the locale name; nl_langinfo(CODESET) returns it, so please let me know the results of the following PHP script (assuming that you have nl_langinfo), so that I can use the correct names to map the charsets to the cs_XXX enum values. <?php setlocale(LC_ALL,"zh_TW"); echo nl_langinfo(CODESET); setlocale(LC_ALL,"zh_CN"); echo nl_langinfo(CODESET); setlocale(LC_ALL,"zh_HK"); echo nl_langinfo(CODESET); ?> BTW: unsigned short is 16 bits, not 8, so there is no need to make it a long :-) If you have nl_langinfo on your system, this PHP code should work with the Chinese characters: <?php setlocale(LC_ALL,"zh_TW"); echo htmlentities("chinese text", ENT_COMPAT, null); ?> If you don't have nl_langinfo, or you don't have that locale installed, then use this code: <?php echo htmlentities("chinese text", ENT_COMPAT, "big5"); ?> --Wez.
Index: html.c =================================================================== RCS file: /repository/php4/ext/standard/html.c,v retrieving revision 1.31 diff -u -r1.31 html.c --- html.c 8 Aug 2001 20:00:09 -0000 1.31 +++ html.c 21 Aug 2001 13:58:18 -0000 @@ -35,7 +35,7 @@ Defaults to ISO-8859-1 for now. */ enum entity_charset { cs_terminator, cs_8859_1, cs_cp1252, - cs_8859_15, cs_utf_8 }; + cs_8859_15, cs_utf_8, cs_big5, cs_gb2312, cs_big5hkscs }; typedef const char * entity_table_t; /* codepage 1252 is a Windows extension to iso-8859-1. */ @@ -91,11 +91,14 @@ }; static const struct html_entity_map entity_map[] = { - { cs_cp1252, 0x80, 0x9f, ent_cp_1252 }, - { cs_cp1252, 0xa0, 0xff, ent_iso_8859_1 }, + { cs_cp1252, 0x80, 0x9f, ent_cp_1252 }, + { cs_cp1252, 0xa0, 0xff, ent_iso_8859_1 }, { cs_8859_1, 0xa0, 0xff, ent_iso_8859_1 }, { cs_8859_15, 0xa0, 0xff, ent_iso_8859_15 }, { cs_utf_8, 0xa0, 0xff, ent_iso_8859_1 }, + { cs_big5, 0xa0, 0xff, ent_iso_8859_1 }, + { cs_gb2312, 0xa0, 0xff, ent_iso_8859_1 }, + { cs_big5hkscs, 0xa0, 0xff, ent_iso_8859_1 }, { cs_terminator } }; @@ -107,6 +110,8 @@ { "ISO-8859-15", cs_8859_15 }, { "utf-8", cs_utf_8 }, { "cp1252", cs_cp1252 }, + { "big5", cs_big5 }, + { "GB2312", cs_gb2312 }, { NULL } }; @@ -125,86 +130,111 @@ mbseq[mbpos++] = (unsigned char)this_char; - if (charset == cs_utf_8) { - unsigned long utf = 0; - int stat = 0; - int more = 1; + switch(charset) { + case cs_utf_8: + { + unsigned long utf = 0; + int stat = 0; + int more = 1; - /* unpack utf-8 encoding into a wide char. - * Code stolen from the mbstring extension */ - - do { - if (this_char < 0x80) { - more = 0; - break; - } - else if (this_char < 0xc0) { - switch(stat) { - case 0x10: /* 2, 2nd */ - case 0x21: /* 3, 3rd */ - case 0x32: /* 4, 4th */ - case 0x43: /* 5, 5th */ - case 0x54: /* 6, 6th */ - /* last byte in sequence */ + /* unpack utf-8 encoding into a wide char. 
+ * Code stolen from the mbstring extension */ + + do { + if (this_char < 0x80) { more = 0; - utf |= (this_char & 0x3f); - this_char = (unsigned short)utf; break; - case 0x20: /* 3, 2nd */ - case 0x31: /* 4, 3rd */ - case 0x42: /* 5, 4th */ - case 0x53: /* 6, 5th */ - /* penultimate char */ - utf |= ((this_char & 0x3f) << 6); - stat++; - break; - case 0x30: /* 4, 2nd */ - case 0x41: /* 5, 3rd */ - case 0x52: /* 6, 4th */ - utf |= ((this_char & 0x3f) << 12); - stat++; - break; - case 0x40: /* 5, 2nd */ - case 0x51: - utf |= ((this_char & 0x3f) << 18); - stat++; - break; - case 0x50: /* 6, 2nd */ - utf |= ((this_char & 0x3f) << 24); - stat++; - default: - /* invalid */ + } + else if (this_char < 0xc0) { + switch(stat) { + case 0x10: /* 2, 2nd */ + case 0x21: /* 3, 3rd */ + case 0x32: /* 4, 4th */ + case 0x43: /* 5, 5th */ + case 0x54: /* 6, 6th */ + /* last byte in +sequence */ + more = 0; + utf |= (this_char & +0x3f); + this_char = (unsigned +short)utf; + break; + case 0x20: /* 3, 2nd */ + case 0x31: /* 4, 3rd */ + case 0x42: /* 5, 4th */ + case 0x53: /* 6, 5th */ + /* penultimate char */ + utf |= ((this_char & +0x3f) << 6); + stat++; + break; + case 0x30: /* 4, 2nd */ + case 0x41: /* 5, 3rd */ + case 0x52: /* 6, 4th */ + utf |= ((this_char & +0x3f) << 12); + stat++; + break; + case 0x40: /* 5, 2nd */ + case 0x51: + utf |= ((this_char & +0x3f) << 18); + stat++; + break; + case 0x50: /* 6, 2nd */ + utf |= ((this_char & +0x3f) << 24); + stat++; + default: + /* invalid */ + more = 0; + } + } + /* lead byte */ + else if (this_char < 0xe0) { + stat = 0x10; /* 2 byte */ + utf = (this_char & 0x1f) << 6; + } else if (this_char < 0xf0) { + stat = 0x20; /* 3 byte */ + utf = (this_char & 0xf) << 12; + } else if (this_char < 0xf8) { + stat = 0x30; /* 4 byte */ + utf = (this_char & 0x7) << 18; + } else if (this_char < 0xfc) { + stat = 0x40; /* 5 byte */ + utf = (this_char & 0x3) << 24; + } else if (this_char < 0xfe) { + stat = 0x50; /* 6 byte */ + utf = (this_char & 0x1) << 30; 
+ } + else { + /* invalid; bail */ more = 0; - } - } - /* lead byte */ - else if (this_char < 0xe0) { - stat = 0x10; /* 2 byte */ - utf = (this_char & 0x1f) << 6; - } else if (this_char < 0xf0) { - stat = 0x20; /* 3 byte */ - utf = (this_char & 0xf) << 12; - } else if (this_char < 0xf8) { - stat = 0x30; /* 4 byte */ - utf = (this_char & 0x7) << 18; - } else if (this_char < 0xfc) { - stat = 0x40; /* 5 byte */ - utf = (this_char & 0x3) << 24; - } else if (this_char < 0xfe) { - stat = 0x50; /* 6 byte */ - utf = (this_char & 0x1) << 30; - } - else { - /* invalid; bail */ - more = 0; - break; + break; + } + if (more) + { + this_char = str[pos++]; + mbseq[mbpos++] = (unsigned +char)this_char; + } + } while(more); } - if (more) + break; + case cs_big5: + case cs_gb2312: + case cs_big5hkscs: { - this_char = str[pos++]; - mbseq[mbpos++] = (unsigned char)this_char; + /* check if this is the first of a 2-byte sequence */ + if (this_char >= 0xa1 && this_char <= 0xf9) { + /* peek at the next char */ + unsigned char next_char = str[pos]; + if ((next_char >= 0x40 && next_char <= 0x73) || + (next_char >= 0xa1 && +next_char <= 0xfe)) + { + /* yes, this a wide char */ + this_char <<= 8; + mbseq[mbpos++] = next_char; + this_char |= next_char; + pos++; + } + + } + break; } - } while(more); } *newpos = pos; mbseq[mbpos] = '\0'; @@ -300,7 +330,7 @@ in a multibyte sequence it should be more than enough.. */ unsigned short this_char = get_next_char(charset, old, &i, mbsequence, &mbseqlen); - int matches_map = 0; + int matches_map; if (len + 9 > maxlen) new = erealloc (new, maxlen += 128); @@ -309,7 +339,9 @@ /* look for a match in the maps for this charset */ int j; unsigned char * rep; - + + matches_map = 0; + for (j=0; entity_map[j].charset != cs_terminator; j++) { if (entity_map[j].charset == charset && this_char >= entity_map[j].basechar
-- PHP Development Mailing List <http://www.php.net/> To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED] To contact the list administrators, e-mail: [EMAIL PROTECTED]