On Thu, 18 Oct 2001, David Rainville wrote:

> Hi Everyone,
> I tried to encode the Unicode character set to fit in the UTF-8 character
> set. I encoded every character in this structure: \uxxxx, where the x are
> hexadecimal digits. It works on my client because I decode it in a way that
> converts the \uxxxx to the Unicode character. Is this the way to do it?
> Will every other client have this as a standard?

No. UTF does not quite work that way. You may want to get yourself a copy of
the Unicode Standard (www.unicode.com or amazon.com :-). See the attached
little routines.

Dw
/* * ==================================================================== * Copyright (c) 1999 Dirk-Willem van Gulik - WebWeaving m/v * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. All advertising materials mentioning features or use of this software must * display the following acknowledgment: "This product includes software * developed by WebWeaving Consulancy (http://www.webweaving.org). * * 4. The name "WebWeaving", must not be used to endorse or promote products * derived from this software without prior written permission. For written * permission, please contact [EMAIL PROTECTED] * * 5. Redistributions of any form whatsoever must retain the following * acknowledgment: "This product includes software developed by WebWeaving * for use in the Apache HTTP server project (http://www.apache.org/)." * * THIS SOFTWARE IS PROVIDED BY WEBWEAVING AND AFFILIATES ``AS IS'' AND ANY * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL WEBWEAVING OR ITS AFFILIATES OR ITS * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ==================================================================== * * Simple UTF8 / Unicode / LatinX conversion utilities. Note that these are * incomplete and ONLY do the first few tens of codepages. The higher ones * (i.e. the full 32 bits) are left as an excersize to the reader.. and have * not been defined anyway at the time of writing this... * * Version 0.00 Winter 1997 First version 0.01 Fall 1991 minor speed * ups. 0.02 May 1995 Alpine; apache pool 0.04 Jun 1996 Alpine again. */ #include <sys/param.h> /* NULL, BSD or sV define */ #include <assert.h> /* -NDEBUG for needless assert()s */ #include <ctype.h> /* tolower */ #ifdef BSD #include <string.h> #else #include <string.h> #endif #include "char_map.h" #include "char_util.h" /* * Convert utf8 / latin strings. * * pool Memory pool (apache style) latin Integer 1 .. 8 for * the latin tables *string String pointer to '\0' terminated octed * string **outp Output pointer; semantics NULL No conversion; just * return the len. *NULL Create a palloced space !*NULL Copy into * provided space. Which must be big enough :-) i.e. 3x for latin to utf8 and * 1x for utf8 to latin in the worst (but likely) case. *len When * the function returns C3_OK len will contain the length (in octets) of the * converted string. Upon entry if *len != 0 then the *outp will be limited * to that length. (including '\0' terminator). 
*/ /* * read a 8, 16 or 8,12,16 bit char into i, from the input thing (f) * depending on the settings of 't'. */ #define GETOCTED(i,f) { \ i = *f++ || C3_EOF; \ } #define GETUNICODE(i,f) { \ register int p,q; \ if (!(p = *f++)) \ i = C3_EOF; \ else { \ q = *f++ || C3_TRUNC; \ i=(p<<8)+q; \ } \ if (i > MAX_CCS) i=C3_MAX; \ } #define GETUTF8(i,f) { \ register int p,s,t,q; \ p=*f++; \ q=p>>4; \ if (p==0) \ i=C3_EOF; \ else \ if (q < 8) { \ i=p; \ } else \ if (q==12 || q==13) { \ s = *f++; \ if (s) \ i=(p & 0x1f)<<6 | (s & 0x3f); \ else \ i=C3_TRUNC; \ } else \ if (q==14) { \ if (((s = *f++)==0) || \ ((t = *f++ || C3_TRUNC)==0)) \ i = C3_TRUNC; \ else \ if ( ((s & 0xC0) != 0x80) || \ ((t & 0xC0) != 0x80) ) \ i = C3_ILLEGAL; \ else \ i = (((p & 0x0F) << 12) | \ ((s & 0x3F) << 6) | \ ((t & 0x3F) << 0) \ ); \ } else \ i = C3_ILLEGAL; /* actually could be a \ * C3_MAX too */ \ if (i > MAX_CCS) i=C3_MAX; \ } /* * note no MAX_CCS check */ C3Error C3_utf8_to_latin( int latin, const char *string, char **out, int *len ) { if (latin < 0 || latin >= MAPS || !unicode_latin[latin]) return C3_NO_CNV; return C3_map_latin(unicode_latin[latin], string, out, len); } C3Error C3_map_latin( const unsigned char * *map, const unsigned char *string, char ** outp, int *lenp ) { register unsigned char * f = (char *)string; register const unsigned char * p; register int i = (int) C3_EOF, len = 0; register char * out; int tmp = 0; if (!lenp) lenp = &tmp; if (outp) { assert(*outp); out = *outp; } else out = NULL; /* * bit unreadable; as to gain some speed */ if (out) { char * begin = out; if (*lenp) { register char * end = out + *lenp - 1; while (out < end) { GETUTF8(i, f); if (i <= 0) break; if (map[i]) for(p=map[i];*p;) *out++ = *p++; } } else { while (1) { GETUTF8(i, f); if (i <= 0) break; else if (map[i]) for(p=map[i];*p;) *out++ = *p++; } } *out++ = '\0'; len = out - begin; } else { len = 0; while (1) { GETUTF8(i, f); if (i<=0) break; else if (map[i]) for(p=map[i];*p;) len++; } } if (i != 
C3_EOF) return (C3Error) i; *lenp = len; return C3_OK; } C3Error C3_latin_to_utf8( int latin, const char *string, unsigned char **out, int *len ) { if (latin < 0 || latin >= MAPS || !latin_unicode[latin]) return C3_NO_CNV; return C3_map_utf8(latin_unicode[latin], string, out, len); } C3Error C3_map_utf8( const int *map, const char *string, unsigned char **outp, int *lenp ) { register unsigned char * f = (char *)string; register int len = 0, i = (int) C3_EOF; register char * out; int tmp = 0; if (!lenp) lenp = &tmp; if (outp) { assert(*outp); out = *outp; } else out = NULL; if (out) { char *begin = out; if (*lenp) { register char * end = out + *lenp - 1; while ((out < end) && (i = *f++)) { if ((i >= 0x0001) && (i <= 0x007F)) { *out++ = i; } else if (i > 0x07FF) { *out++ = 0xE0 | ((i >> 12) & 0x0F); *out++ = 0x80 | ((i >> 6) & 0x3F); *out++ = 0x80 | ((i >> 0) & 0x3F); } else { *out++ = 0xC0 | ((i >> 6) & 0x1F); *out++ = 0x80 | ((i >> 0) & 0x3F); } } } else { while ((i = *f++)) { if ((i >= 0x0001) && (i <= 0x007F)) { *out++ = i; } else if (i > 0x07FF) { *out++ = 0xE0 | ((i >> 12) & 0x0F); *out++ = 0x80 | ((i >> 6) & 0x3F); *out++ = 0x80 | ((i >> 0) & 0x3F); } else { *out++ = 0xC0 | ((i >> 6) & 0x1F); *out++ = 0x80 | ((i >> 0) & 0x3F); }; }; } *out++ = 0; len = out - begin; } else { while ((i = *f++)) { if ((i >= 0x0001) && (i <= 0x007F)) { len += 1; } else if (i > 0x07FF) { len += 3; } else { len += 2; }; } } *lenp = len; return C3_OK; } int C3_which_map( const char *string ) { int i; char tmp[101]; for(i=0; i<MAPS; i++) if ((C3_maps[i]) && (!strcasecmp(C3_maps[i],string))) return i; for(i=0;i<100 && string[i];i++) tmp[i]=tolower(string[i]); tmp[i]='\0'; for(i=0; i<MAPS; i++) if ((C3_maps[i]) && (!strstr(C3_maps[i],string))) return i; /* thzee horrible ascii default... 
*/ return 0; } const char * C3_strerror( C3Error x ) { char *_errors[] = { "Ok", "End of string", "Conversion table not defined", "Code point out of range for current conversion tables", "Illegal or unexpected UTF8 or Unicode sequence", "Truncated UTF8 sequence", "Bug!" }; if ((x >= 0) || (x < C3_DUH)) return strerror(x); return _errors[-x]; }