On Wed, May 28, 2003 at 11:56:07PM +0200, Peter Eisentraut wrote: > There is a standard interface (SUSv2) for detecting the character set > based on the locale settings. I suggest we use this (if available) in > applications like psql and pg_dump by default unless it is overridden by > the usual mechanisms. If the character set name obtained this way is not > recognized by PostgreSQL, we fall back to SQL_ASCII. > > Here's a piece of code that shows how this would work: > > #include <stdio.h> > #include <locale.h> > #include <langinfo.h> > > int > main(int argc, char *argv[]) > { > setlocale(LC_ALL, ""); > printf("%s\n", nl_langinfo(CODESET)); > return 0; > } > > (LC_CTYPE is the governing category for this.) > > Comments?
That isn't enough on every OS. Please look at glib or libcharset, which
deal with this problem:

    http://www.haible.de/bruno/packages-libcharset.html

In my project I use the following code, which is a simplification of
libcharset (the main function is mp_locale_charset()). Maybe it will help
you :-)

/* Determine a canonical name for the current locale's character encoding.
 *
 * mp_locale_charset() is inspired by libcharset:
 *
 * Copyright (C) 2000-2002 Free Software Foundation, Inc.
 * Written by Bruno Haible <[EMAIL PROTECTED]>.
 *
 * $Id: charset.c,v 1.2 2003/01/24 14:02:01 zakkr Exp $
 */
#include "mape.h"

#if HAVE_STDDEF_H
# include <stddef.h>
#endif

#include <stdio.h>
#if HAVE_STRING_H
# include <string.h>
#else
# include <strings.h>
#endif
#if HAVE_STDLIB_H
# include <stdlib.h>
#endif

#if defined _WIN32 || defined __WIN32__
# undef WIN32 /* avoid warning on mingw32 */
# define WIN32
#endif

#if defined __EMX__
/* Assume EMX program runs on OS/2, even if compiled under DOS. */
# define OS2
#endif

#if !defined WIN32
# if HAVE_LANGINFO_CODESET
#  include <langinfo.h>
# else
#  if HAVE_SETLOCALE
#   include <locale.h>
#  endif
# endif
#elif defined WIN32
# define WIN32_LEAN_AND_MEAN
# include <windows.h>
#endif
#if defined OS2
# define INCL_DOS
# include <os2.h>
#endif

typedef struct MpCharsetAlias {
	char *alias, *name;
} MpCharsetAlias;

extern mpbool mp_locale_charset (char **charset);

/*
 * libcharset loads everything from an external text file, which is a
 * strange and slow solution; we prefer array(s) compiled into the source.
 * With a "good" libc this is not needed at all -- for example on Linux.
 *
 * Please put only exotic aliases into this function. The libc 'iconv' knows
 * a lot of basic aliases (check it first with iconv -l).
* */ static const char * mp_charset_aliases (const char *name) { MpCharsetAlias *a; #if defined WIN32 MpCharsetAlias aliases[] = { { "CP936", "GBK" }, { "CP1361", "JOHAB" }, { "CP20127", "ASCII" }, { "CP20866", "KOI8-R" }, { "CP21866", "KOI8-RU" }, { "CP28591", "ISO-8859-1" }, { "CP28592", "ISO-8859-2" }, { "CP28593", "ISO-8859-3" }, { "CP28594", "ISO-8859-4" }, { "CP28595", "ISO-8859-5" }, { "CP28596", "ISO-8859-6" }, { "CP28597", "ISO-8859-7" }, { "CP28598", "ISO-8859-8" }, { "CP28599", "ISO-8859-9" }, { "CP28605", "ISO-8859-15" }, { NULL, NULL } }; #elif PORTNAME == aix MpCharsetAlias aliases[] = { { "IBM-850", "CP850" }, { "IBM-856", "CP856" }, { "IBM-921", "ISO-8859-13" }, { "IBM-922", "CP922" }, { "IBM-932", "CP932" }, { "IBM-943", "CP943" }, { "IBM-1046", "CP1046" }, { "IBM-1124", "CP1124" }, { "IBM-1129", "CP1129" }, { "IBM-1252", "CP1252" }, { "IBM-EUCCN", "GB2312" }, { "IBM-EUCJP", "EUC-JP" }, { "IBM-EUCKR", "EUC-KR" }, { "IBM-EUCTW", "EUC-TW" }, { NULL, NULL } }; #elif PORTNAME == hpux MpCharsetAlias aliases[] = { { "ROMAN8", "HP-ROMAN8" }, { "ARABIC8", "HP-ARABIC8" }, { "GREEK8", "HP-GREEK8" }, { "HEBREW8", "HP-HEBREW8" }, { "TURKISH8", "HP-TURKISH8" }, { "KANA8", "HP-KANA8" }, { "HP15CN", "GB2312" }, { NULL, NULL } }; #elif (PORTNAME == irix || PORTNAME == irix5) MpCharsetAlias aliases[] = { { "EUCCN", "GB2312" }, { NULL, NULL } }; #elif PORTNAME == osf MpCharsetAlias aliases[] = { { "KSC5601", "CP949" }, { "SDECKANJI", "EUC-JP" }, { "TACTIS", "TIS-620" }, { NULL, NULL } }; #elif (PORTNAME == solaris || PORTNAME == solaris_sparc || POSRTNAME == solaris_i386) MpCharsetAlias aliases[] = { { "646", "ASCII" }, { "CNS11643", "EUC-TW" }, { "5601", "EUC-KR" }, { "JOHAP92", "JOHAB" }, { "PCK", "SHIFT_JIS" }, { "2533", "TIS-620" }, { NULL, NULL } }; #elif PORTNAME == netbsd MpCharsetAlias aliases[] = { { "646", " ASCII" }, { "EUCCN", "GB2312" }, { NULL, NULL } }; #else return name; #endif if (aliases) { for (a = aliases; a->alias; a++) if (strcasecmp (a->alias, 
name) == 0) return a->name; } /* we return original name beacuse iconv() probably will know * something better about name if we don't know it :-) */ return name; } /* Returns charset from "[EMAIL PROTECTED]" string */ #ifndef HAVE_LANGINFO_CODESET static char * mp_encoding_from_locale(char *locale) { char *dot = strchr (locale, '.'); if (dot != NULL) { const char *modifier; static char buf[2 + 10 + 1]; dot++; /* Look for the possible @... trailer and remove it, if any. */ modifier = strchr (dot, '@'); if (modifier == NULL) return dot; if (modifier - dot < sizeof (buf)) { memcpy (buf, dot, modifier - dot); buf [modifier - dot] = '\0'; return buf; } } return locale; } #endif mpbool mp_locale_charset (char **charset) { const char *codeset; #if !(defined WIN32 || defined OS2) # if HAVE_LANGINFO_CODESET /* Most systems support nl_langinfo (CODESET) nowadays. */ codeset = nl_langinfo (CODESET); # else /* On old systems which lack it, use setlocale or getenv. */ const char *locale = NULL; /* But most old systems don't have a complete set of locales. Some * (like SunOS 4 or DJGPP) have only the C locale. Therefore we don't * use setlocale here; it would return "C" when it doesn't support the * locale name the user has set. */ # if HAVE_SETLOCALE && 0 locale = setlocale (LC_CTYPE, NULL); # endif if (locale == NULL || locale[0] == '\0') { locale = getenv ("LC_ALL"); if (locale == NULL || locale[0] == '\0') { locale = getenv ("LC_CTYPE"); if (locale == NULL || locale[0] == '\0') locale = getenv ("LANG"); } } /* On some old systems, one used to set locale = "iso8859_1". On others, * you set it to "language_COUNTRY.charset". In any case, we resolve it * through the charset.alias file. */ codeset = mp_encoding_from_locale(locale); # endif /* HAVE_LANGINFO_CODESET */ #elif defined WIN32 static char buf[2 + 10 + 1]; /* Woe32 has a function returning the locale's codepage as a number. 
*/ sprintf (buf, "CP%u", GetACP ()); codeset = buf; #elif defined OS2 const char *locale; static char buf[2 + 10 + 1]; ULONG cp[3]; ULONG cplen; /* Allow user to override the codeset, as set in the operating system, * with standard language environment variables. */ locale = getenv ("LC_ALL"); if (locale == NULL || locale[0] == '\0') { locale = getenv ("LC_CTYPE"); if (locale == NULL || locale[0] == '\0') locale = getenv ("LANG"); } if (locale != NULL && locale[0] != '\0') codeset = mp_encoding_from_locale(locale); else { /* OS/2 has a function returning the locale's codepage as a number. */ if (DosQueryCp (sizeof (cp), cp, &cplen)) codeset = ""; else { sprintf (buf, "CP%u", cp[0]); codeset = buf; } } #endif if (codeset == NULL) /* The canonical name cannot be determined. */ codeset = ""; else codeset = mp_charset_aliases (codeset); /* Don't return an empty string. GNU libc and GNU libiconv interpret * the empty string as denoting "the locale's character encoding", * thus GNU libiconv would call this function a second time. */ if (codeset[0] == '\0') { /* * Last possibility is 'CHARSET' enviroment variable */ if (!(codeset = getenv ("CHARSET"))) codeset = "ASCII"; } if (charset) *charset = (char *) codeset; if (strcasecmp(codeset, "UTF8")==0 || strcasecmp(codeset, "UTF-8")==0) return TRUE; return FALSE; } autoconf part: ------------- AC_DEFUN(jm_LANGINFO_CODESET, [ AC_CHECK_HEADERS(langinfo.h) AC_CHECK_FUNCS(nl_langinfo) AC_CACHE_CHECK([for nl_langinfo and CODESET], jm_cv_langinfo_codeset, [AC_TRY_LINK([#include <langinfo.h>], [char* cs = nl_langinfo(CODESET);], jm_cv_langinfo_codeset=yes, jm_cv_langinfo_codeset=no) ]) if test $jm_cv_langinfo_codeset = yes; then AC_DEFINE(HAVE_LANGINFO_CODESET, 1, [Define if you have <langinfo.h> and nl_langinfo(CODESET).]) fi ]) -- Karel Zak <[EMAIL PROTECTED]> http://home.zf.jcu.cz/~zakkr/ ---------------------------(end of broadcast)--------------------------- TIP 1: subscribe and unsubscribe commands go to [EMAIL PROTECTED]