Re: [HACKERS] Automatic detection of client encoding

Karel Zak Thu, 29 May 2003 19:38:29 -0700

On Wed, May 28, 2003 at 11:56:07PM +0200, Peter Eisentraut wrote:
> There is a standard interface (SUSv2) for detecting the character set
> based on the locale settings.  I suggest we use this (if available) in
> applications like psql and pg_dump by default unless it is overridden by
> the usual mechanisms.  If the character set name obtained this way is not
> recognized by PostgreSQL, we fall back to SQL_ASCII.
> 
> Here's a piece of code that shows how this would work:
> 
> #include <stdio.h>
> #include <locale.h>
> #include <langinfo.h>
> 
> int
> main(int argc, char *argv[])
> {
>         setlocale(LC_ALL, "");
>         printf("%s\n", nl_langinfo(CODESET));
>         return 0;
> }
> 
> (LC_CTYPE is the governing category for this.)
> 
> Comments?


 That isn't enough on every OS. Please look at how glib or libcharset handle
 this problem.

   http://www.haible.de/bruno/packages-libcharset.html

 In my project I use the following code, which is a simplification of
 libcharset (the main function is mp_locale_charset()).
 Maybe it will help you :-)



/* Determine a canonical name for the current locale's character encoding.
 *
 * mp_locale_charset() is inspired by libcharset:
 *  
 *      Copyright (C) 2000-2002 Free Software Foundation, Inc.
 *      Written by Bruno Haible <[EMAIL PROTECTED]>.  
 *
 * $Id: charset.c,v 1.2 2003/01/24 14:02:01 zakkr Exp $
 */

#include "mape.h"

#if HAVE_STDDEF_H
# include <stddef.h>
#endif

#include <stdio.h>
#if HAVE_STRING_H
# include <string.h>
#else
# include <strings.h>
#endif
#if HAVE_STDLIB_H
# include <stdlib.h>
#endif

#if defined _WIN32 || defined __WIN32__
# undef WIN32   /* avoid warning on mingw32 */
# define WIN32
#endif

#if defined __EMX__
/* Assume EMX program runs on OS/2, even if compiled under DOS.  */
# define OS2
#endif

#if !defined WIN32
# if HAVE_LANGINFO_CODESET
#  include <langinfo.h>
# else
#  if HAVE_SETLOCALE
#   include <locale.h>
#  endif
# endif
#elif defined WIN32
# define WIN32_LEAN_AND_MEAN
# include <windows.h>
#endif
#if defined OS2
# define INCL_DOS
# include <os2.h>
#endif

/*
 * One entry of a charset alias table.  The tables in this file are
 * terminated by an entry whose members are both NULL.
 */
typedef struct MpCharsetAlias
{
        char    *alias;         /* name as reported by the platform */
        char    *name;          /* name returned in its place */
} MpCharsetAlias;

/* Prototype of the entry point defined later in this file. */
extern mpbool mp_locale_charset (char **charset);

/*
 * The libcharset load all from external text file, but it's strange and
 * slow solution, we rather use array(s) compiled into source. In the 
 * "good" libc this is not needful -- for example in linux.
 * 
 * Please, put to this funtion exotic aliases only. The libc 'iconv' knows
 * a lot of basic aliases (check it first by iconv -l).
 * 
 */
static const char *
mp_charset_aliases (const char *name)
{
        MpCharsetAlias  *a;

#if defined WIN32
        MpCharsetAlias aliases[] = 
        {
                { "CP936",      "GBK" },
                { "CP1361",     "JOHAB" },
                { "CP20127",    "ASCII" },
                { "CP20866",    "KOI8-R" },
                { "CP21866",    "KOI8-RU" },
                { "CP28591",    "ISO-8859-1" },
                { "CP28592",    "ISO-8859-2" },
                { "CP28593",    "ISO-8859-3" },
                { "CP28594",    "ISO-8859-4" },
                { "CP28595",    "ISO-8859-5" },
                { "CP28596",    "ISO-8859-6" },
                { "CP28597",    "ISO-8859-7" },
                { "CP28598",    "ISO-8859-8" },
                { "CP28599",    "ISO-8859-9" },
                { "CP28605",    "ISO-8859-15" },
                { NULL,         NULL }
        };
#elif PORTNAME == aix
        MpCharsetAlias aliases[] =
        {
                { "IBM-850",    "CP850" },
                { "IBM-856",    "CP856" },
                { "IBM-921",    "ISO-8859-13" },
                { "IBM-922",    "CP922" },
                { "IBM-932",    "CP932" },
                { "IBM-943",    "CP943" },
                { "IBM-1046",   "CP1046" },
                { "IBM-1124",   "CP1124" },
                { "IBM-1129",   "CP1129" },
                { "IBM-1252",   "CP1252" },
                { "IBM-EUCCN",  "GB2312" },
                { "IBM-EUCJP",  "EUC-JP" },
                { "IBM-EUCKR",  "EUC-KR" },
                { "IBM-EUCTW",  "EUC-TW" },
                { NULL, NULL }
        };
#elif PORTNAME == hpux 
        MpCharsetAlias aliases[] =
        {
                { "ROMAN8",     "HP-ROMAN8" },
                { "ARABIC8",    "HP-ARABIC8" },
                { "GREEK8",     "HP-GREEK8" },
                { "HEBREW8",    "HP-HEBREW8" },
                { "TURKISH8",   "HP-TURKISH8" },
                { "KANA8",      "HP-KANA8" },
                { "HP15CN",     "GB2312" },
                { NULL, NULL }
        };
#elif (PORTNAME == irix || PORTNAME == irix5)
        MpCharsetAlias aliases[] =
        {
                { "EUCCN",      "GB2312" },
                { NULL, NULL }
        };
#elif PORTNAME == osf 
        MpCharsetAlias aliases[] =
        {
                { "KSC5601",    "CP949" },
                { "SDECKANJI",  "EUC-JP" },
                { "TACTIS",     "TIS-620" },
                { NULL, NULL }
        };
#elif (PORTNAME == solaris || PORTNAME == solaris_sparc || POSRTNAME == solaris_i386)
        MpCharsetAlias aliases[] =
        {
                { "646",        "ASCII" },
                { "CNS11643",   "EUC-TW" },
                { "5601",       "EUC-KR" },
                { "JOHAP92",    "JOHAB" },
                { "PCK",        "SHIFT_JIS" },
                { "2533",       "TIS-620" },
                { NULL, NULL }
        };
#elif PORTNAME == netbsd
        MpCharsetAlias aliases[] =
        {
                { "646", " ASCII" },
                { "EUCCN", "GB2312" },
                { NULL, NULL }
        };
#else
        return name;
#endif
        
        if (aliases)
        {
                for (a = aliases; a->alias; a++)
                        if (strcasecmp (a->alias, name) == 0)
                                return a->name;
        }
      
        /* we return original name beacuse iconv() probably will know
         * something better about name if we don't know it :-)
         */
        return name;
}

/*
 * Extract the charset part of a locale name of the form
 * "language_COUNTRY.charset@modifier".
 *
 * locale: locale name, typically taken from the environment; may be NULL.
 *
 * Returns
 *   - NULL if 'locale' is NULL;
 *   - a pointer into 'locale' just after the first '.', when there is no
 *     "@modifier" trailer;
 *   - a pointer to a static buffer holding the text between the '.' and the
 *     '@', when a trailer is present and the text fits into the buffer;
 *   - 'locale' itself otherwise (no '.' at all, or the charset is too long).
 *
 * Because of the static buffer the result is only valid until the next call
 * and the function is not reentrant.
 *
 * The function is compiled whenever nl_langinfo(CODESET) is not available
 * and also on OS/2, where mp_locale_charset() always calls it.
 */
#if !HAVE_LANGINFO_CODESET || defined OS2
static char *
mp_encoding_from_locale (const char *locale)
{
        /* Large enough for long names such as "ANSI_X3.4-1968". */
        static char buf[64];
        const char *dot;
        const char *modifier;
        size_t len;

        if (locale == NULL)
                return NULL;

        dot = strchr (locale, '.');
        if (dot != NULL)
        {
                dot++;

                /* Look for the possible @... trailer and remove it, if any.  */
                modifier = strchr (dot, '@');
                if (modifier == NULL)
                        return (char *) dot;

                len = (size_t) (modifier - dot);
                if (len < sizeof (buf))
                {
                        memcpy (buf, dot, len);
                        buf[len] = '\0';
                        return buf;
                }
        }
        return (char *) locale;
}
#endif

/*
 * Determine a name for the current locale's character encoding.
 *
 * The name comes from nl_langinfo(CODESET) where available, otherwise from
 * the LC_ALL, LC_CTYPE and LANG environment variables (in that order), or
 * from the active code page on Windows and OS/2.  It is then passed through
 * mp_charset_aliases().  If nothing usable is found, the CHARSET environment
 * variable is consulted, and finally "ASCII" is used.
 *
 * charset: if not NULL, *charset receives a pointer to the name.  The name
 *          is never an empty string.  It may point to a static buffer, to
 *          the environment, to storage owned by nl_langinfo() or to a string
 *          literal, so the caller must neither modify nor free it.
 *
 * Returns TRUE if the name is "UTF8" or "UTF-8" (compared without regard to
 * case), FALSE otherwise.
 */
mpbool
mp_locale_charset (char **charset)
{
        const char *codeset = NULL;

#if !(defined WIN32 || defined OS2)

# if HAVE_LANGINFO_CODESET
        /* Most systems support nl_langinfo (CODESET) nowadays.  */
        codeset = nl_langinfo (CODESET);
# else
        /* On old systems which lack it, use getenv.
         *
         * Most old systems don't have a complete set of locales.  Some
         * (like SunOS 4 or DJGPP) have only the C locale.  Therefore we don't
         * use setlocale here; it would return "C" when it doesn't support the
         * locale name the user has set.
         */
        const char *locale;

        locale = getenv ("LC_ALL");
        if (locale == NULL || locale[0] == '\0')
        {
                locale = getenv ("LC_CTYPE");
                if (locale == NULL || locale[0] == '\0')
                        locale = getenv ("LANG");
        }

        /* On some old systems, one used to set locale = "iso8859_1". On others,
         * you set it to "language_COUNTRY.charset". In any case the result is
         * resolved through mp_charset_aliases() below.  When no variable is
         * set there is nothing to parse and codeset stays NULL.
         */
        if (locale != NULL && locale[0] != '\0')
                codeset = mp_encoding_from_locale ((char *) locale);
# endif /* HAVE_LANGINFO_CODESET */

#elif defined WIN32

        /* "CP" + up to 10 digits of a 32-bit number + terminating NUL */
        static char buf[2 + 10 + 1];

        /* Woe32 has a function returning the locale's codepage as a number.  */
        sprintf (buf, "CP%u", (unsigned int) GetACP ());
        codeset = buf;

#elif defined OS2

        const char *locale;
        /* "CP" + up to 10 digits of a 32-bit number + terminating NUL */
        static char buf[2 + 10 + 1];
        ULONG cp[3];
        ULONG cplen;

        /* Allow user to override the codeset, as set in the operating system,
         * with standard language environment variables.
         */
        locale = getenv ("LC_ALL");
        if (locale == NULL || locale[0] == '\0')
        {
                locale = getenv ("LC_CTYPE");
                if (locale == NULL || locale[0] == '\0')
                        locale = getenv ("LANG");
        }
        if (locale != NULL && locale[0] != '\0')
                codeset = mp_encoding_from_locale ((char *) locale);
        else
        {
                /* OS/2 has a function returning the locale's codepage as a number.  */
                if (DosQueryCp (sizeof (cp), cp, &cplen))
                        codeset = "";
                else
                {
                        /* cast so that the argument matches %lu exactly */
                        sprintf (buf, "CP%lu", (unsigned long) cp[0]);
                        codeset = buf;
                }
        }
#endif

        if (codeset == NULL)
                /* The canonical name cannot be determined.  */
                codeset = "";
        else
                codeset = mp_charset_aliases (codeset);

        /* Don't return an empty string.  GNU libc and GNU libiconv interpret
         * the empty string as denoting "the locale's character encoding",
         * thus GNU libiconv would call this function a second time.
         */
        if (codeset[0] == '\0')
        {
                /*
                 * Last possibility is the 'CHARSET' environment variable;
                 * an empty value counts as unset.
                 */
                codeset = getenv ("CHARSET");
                if (codeset == NULL || codeset[0] == '\0')
                        codeset = "ASCII";
        }

        if (charset)
                *charset = (char *) codeset;

        if (strcasecmp (codeset, "UTF8") == 0 || strcasecmp (codeset, "UTF-8") == 0)
                return TRUE;

        return FALSE;
}




 autoconf part:
 -------------

dnl jm_LANGINFO_CODESET
dnl -------------------
dnl Check whether a program that includes <langinfo.h> and calls
dnl nl_langinfo(CODESET) links.  The result is cached in
dnl jm_cv_langinfo_codeset; on success HAVE_LANGINFO_CODESET is defined to 1.
dnl
dnl The macro name and all arguments are quoted so that they are not expanded
dnl prematurely (an unquoted name in AC_DEFUN is reported by aclocal as an
dnl "underquoted definition").
AC_DEFUN([jm_LANGINFO_CODESET],
[
  AC_CHECK_HEADERS([langinfo.h])
  AC_CHECK_FUNCS([nl_langinfo])

  AC_CACHE_CHECK([for nl_langinfo and CODESET], [jm_cv_langinfo_codeset],
    [AC_TRY_LINK([#include <langinfo.h>],
      [char* cs = nl_langinfo(CODESET);],
      [jm_cv_langinfo_codeset=yes],
      [jm_cv_langinfo_codeset=no])
    ])
  if test "$jm_cv_langinfo_codeset" = yes; then
    AC_DEFINE([HAVE_LANGINFO_CODESET], [1],
      [Define if you have <langinfo.h> and nl_langinfo(CODESET).])
  fi
])


-- 
 Karel Zak  <[EMAIL PROTECTED]>
 http://home.zf.jcu.cz/~zakkr/

---------------------------(end of broadcast)---------------------------
TIP 1: subscribe and unsubscribe commands go to [EMAIL PROTECTED]

Re: [HACKERS] Automatic detection of client encoding

Reply via email to