Author: kevlo
Date: Fri Nov 18 03:05:20 2011
New Revision: 227650
URL: http://svn.freebsd.org/changeset/base/227650

Log:
  Add unicode support to msdosfs and smbfs; original patches from imura,
  bug fixes by Kuan-Chung Chiu <buganini at gmail dot com>.
  
  Tested by me in production for several days at work.

Added:
  head/sys/libkern/iconv_ucs.c   (contents, props changed)
Modified:
  head/lib/libkiconv/xlat16_iconv.c
  head/sys/conf/files
  head/sys/fs/msdosfs/msdosfs_conv.c
  head/sys/fs/smbfs/smbfs_smb.c
  head/sys/fs/smbfs/smbfs_subr.c
  head/sys/kern/subr_mchain.c
  head/sys/libkern/iconv.c
  head/sys/modules/libiconv/Makefile
  head/sys/modules/libmchain/Makefile
  head/sys/netsmb/smb_conn.c
  head/sys/netsmb/smb_conn.h
  head/sys/netsmb/smb_smb.c
  head/sys/netsmb/smb_subr.c
  head/sys/sys/iconv.h
  head/sys/sys/mchain.h

Modified: head/lib/libkiconv/xlat16_iconv.c
==============================================================================
--- head/lib/libkiconv/xlat16_iconv.c   Fri Nov 18 02:25:54 2011        (r227649)
+++ head/lib/libkiconv/xlat16_iconv.c   Fri Nov 18 03:05:20 2011        (r227650)
@@ -74,6 +74,18 @@ kiconv_add_xlat16_cspair(const char *toc
        struct xlat16_table xt;
        void *data;
        char *p;
+       const char unicode[] = ENCODING_UNICODE;
+
+       if ((flag & KICONV_WCTYPE) == 0 &&
+           strcmp(unicode, tocode) != 0 &&
+           strcmp(unicode, fromcode) != 0 &&
+           kiconv_lookupconv(unicode) == 0) {
+               error = kiconv_add_xlat16_cspair(unicode, fromcode, flag);
+               if (error)
+                       return (-1);
+               error = kiconv_add_xlat16_cspair(tocode, unicode, flag);
+               return (error);
+       }
 
        if (kiconv_lookupcs(tocode, fromcode) == 0)
                return (0);

Modified: head/sys/conf/files
==============================================================================
--- head/sys/conf/files Fri Nov 18 02:25:54 2011        (r227649)
+++ head/sys/conf/files Fri Nov 18 03:05:20 2011        (r227650)
@@ -2557,6 +2557,7 @@ libkern/fnmatch.c         standard
 libkern/gets.c                 standard
 libkern/iconv.c                        optional libiconv
 libkern/iconv_converter_if.m   optional libiconv
+libkern/iconv_ucs.c            optional libiconv
 libkern/iconv_xlat.c           optional libiconv
 libkern/iconv_xlat16.c         optional libiconv
 libkern/index.c                        standard

Modified: head/sys/fs/msdosfs/msdosfs_conv.c
==============================================================================
--- head/sys/fs/msdosfs/msdosfs_conv.c  Fri Nov 18 02:25:54 2011        (r227649)
+++ head/sys/fs/msdosfs/msdosfs_conv.c  Fri Nov 18 03:05:20 2011        (r227650)
@@ -61,9 +61,9 @@
 extern struct iconv_functions *msdosfs_iconv;
 
 static int mbsadjpos(const char **, size_t, size_t, int, int, void *handle);
-static u_int16_t dos2unixchr(const u_char **, size_t *, int, struct 
msdosfsmount *);
+static u_char * dos2unixchr(const u_char **, size_t *, int, struct 
msdosfsmount *);
 static u_int16_t unix2doschr(const u_char **, size_t *, struct msdosfsmount *);
-static u_int16_t win2unixchr(u_int16_t, struct msdosfsmount *);
+static u_char * win2unixchr(u_int16_t, struct msdosfsmount *);
 static u_int16_t unix2winchr(const u_char **, size_t *, int, struct 
msdosfsmount *);
 
 /*
@@ -242,7 +242,7 @@ dos2unixfn(dn, un, lower, pmp)
 {
        size_t i;
        int thislong = 0;
-       u_int16_t c;
+       u_char *c;
 
        /*
         * If first char of the filename is SLOT_E5 (0x05), then the real
@@ -259,12 +259,10 @@ dos2unixfn(dn, un, lower, pmp)
        for (i = 8; i > 0 && *dn != ' ';) {
                c = dos2unixchr((const u_char **)&dn, &i, lower & LCASE_BASE,
                    pmp);
-               if (c & 0xff00) {
-                       *un++ = c >> 8;
+               while (*c != '\0') {
+                       *un++ = *c++;
                        thislong++;
                }
-               *un++ = c;
-               thislong++;
        }
        dn += i;
 
@@ -278,12 +276,10 @@ dos2unixfn(dn, un, lower, pmp)
                for (i = 3; i > 0 && *dn != ' ';) {
                        c = dos2unixchr((const u_char **)&dn, &i,
                            lower & LCASE_EXT, pmp);
-                       if (c & 0xff00) {
-                               *un++ = c >> 8;
+                       while (*c != '\0') {
+                               *un++ = *c++;
                                thislong++;
                        }
-                       *un++ = c;
-                       thislong++;
                }
        }
        *un++ = 0;
@@ -652,8 +648,9 @@ win2unixfn(nbp, wep, chksum, pmp)
        int chksum;
        struct msdosfsmount *pmp;
 {
+       u_char *c;
        u_int8_t *cp;
-       u_int8_t *np, name[WIN_CHARS * 2 + 1];
+       u_int8_t *np, name[WIN_CHARS * 3 + 1];
        u_int16_t code;
        int i;
 
@@ -686,10 +683,9 @@ win2unixfn(nbp, wep, chksum, pmp)
                        *np = '\0';
                        return -1;
                default:
-                       code = win2unixchr(code, pmp);
-                       if (code & 0xff00)
-                               *np++ = code >> 8;
-                       *np++ = code;
+                       c = win2unixchr(code, pmp);
+                       while (*c != '\0')
+                               *np++ = *c++;
                        break;
                }
                cp += 2;
@@ -705,10 +701,9 @@ win2unixfn(nbp, wep, chksum, pmp)
                        *np = '\0';
                        return -1;
                default:
-                       code = win2unixchr(code, pmp);
-                       if (code & 0xff00)
-                               *np++ = code >> 8;
-                       *np++ = code;
+                       c = win2unixchr(code, pmp);
+                       while (*c != '\0')
+                               *np++ = *c++;
                        break;
                }
                cp += 2;
@@ -724,10 +719,9 @@ win2unixfn(nbp, wep, chksum, pmp)
                        *np = '\0';
                        return -1;
                default:
-                       code = win2unixchr(code, pmp);
-                       if (code & 0xff00)
-                               *np++ = code >> 8;
-                       *np++ = code;
+                       c = win2unixchr(code, pmp);
+                       while (*c != '\0')
+                               *np++ = *c++;
                        break;
                }
                cp += 2;
@@ -817,24 +811,22 @@ mbsadjpos(const char **instr, size_t inl
 /*
  * Convert DOS char to Local char
  */
-static u_int16_t
+static u_char *
 dos2unixchr(const u_char **instr, size_t *ilen, int lower, struct msdosfsmount 
*pmp)
 {
-       u_char c;
-       char *outp, outbuf[3];
-       u_int16_t wc;
+       u_char c, *outp, outbuf[5];
        size_t len, olen;
 
+       outp = outbuf;
        if (pmp->pm_flags & MSDOSFSMNT_KICONV && msdosfs_iconv) {
-               olen = len = 2;
-               outp = outbuf;
+               olen = len = 4;
 
                if (lower & (LCASE_BASE | LCASE_EXT))
                        msdosfs_iconv->convchr_case(pmp->pm_d2u, (const char 
**)instr,
-                                                 ilen, &outp, &olen, 
KICONV_LOWER);
+                                                 ilen, (char **)&outp, &olen, 
KICONV_LOWER);
                else
                        msdosfs_iconv->convchr(pmp->pm_d2u, (const char 
**)instr,
-                                            ilen, &outp, &olen);
+                                            ilen, (char **)&outp, &olen);
                len -= olen;
 
                /*
@@ -843,21 +835,21 @@ dos2unixchr(const u_char **instr, size_t
                if (len == 0) {
                        (*ilen)--;
                        (*instr)++;
-                       return ('?');
+                       *outp++ = '?';
                }
-
-               wc = 0;
-               while(len--)
-                       wc |= (*(outp - len - 1) & 0xff) << (len << 3);
-               return (wc);
+       } else {
+               (*ilen)--;
+               c = *(*instr)++;
+               c = dos2unix[c];
+               if (lower & (LCASE_BASE | LCASE_EXT))
+                       c = u2l[c];
+               *outp++ = c;
+               outbuf[1] = '\0';
        }
 
-       (*ilen)--;
-       c = *(*instr)++;
-       c = dos2unix[c];
-       if (lower & (LCASE_BASE | LCASE_EXT))
-               c = u2l[c];
-       return ((u_int16_t)c);
+       *outp = '\0';
+       outp = outbuf;
+       return (outp);
 }
 
 /*
@@ -940,23 +932,21 @@ unix2doschr(const u_char **instr, size_t
 /*
  * Convert Windows char to Local char
  */
-static u_int16_t
+static u_char *
 win2unixchr(u_int16_t wc, struct msdosfsmount *pmp)
 {
-       u_char *inp, *outp, inbuf[3], outbuf[3];
+       u_char *inp, *outp, inbuf[3], outbuf[5];
        size_t ilen, olen, len;
 
-       if (wc == 0)
-               return (0);
-
+       outp = outbuf;
        if (pmp->pm_flags & MSDOSFSMNT_KICONV && msdosfs_iconv) {
                inbuf[0] = (u_char)(wc>>8);
                inbuf[1] = (u_char)wc;
                inbuf[2] = '\0';
 
-               ilen = olen = len = 2;
+               ilen = 2;
+               olen = len = 4;
                inp = inbuf;
-               outp = outbuf;
                msdosfs_iconv->convchr(pmp->pm_w2u, (const char **)&inp, &ilen,
                                     (char **)&outp, &olen);
                len -= olen;
@@ -964,21 +954,15 @@ win2unixchr(u_int16_t wc, struct msdosfs
                /*
                 * return '?' if failed to convert
                 */
-               if (len == 0) {
-                       wc = '?';
-                       return (wc);
-               }
-
-               wc = 0;
-               while(len--)
-                       wc |= (*(outp - len - 1) & 0xff) << (len << 3);
-               return (wc);
+               if (len == 0)
+                       *outp++ = '?';
+       } else {
+               *outp++ = (wc & 0xff00) ? '?' : (u_char)(wc & 0xff);
        }
 
-       if (wc & 0xff00)
-               wc = '?';
-
-       return (wc);
+       *outp = '\0';
+       outp = outbuf;
+       return (outp);
 }
 
 /*

Modified: head/sys/fs/smbfs/smbfs_smb.c
==============================================================================
--- head/sys/fs/smbfs/smbfs_smb.c       Fri Nov 18 02:25:54 2011        (r227649)
+++ head/sys/fs/smbfs/smbfs_smb.c       Fri Nov 18 03:05:20 2011        (r227650)
@@ -34,6 +34,7 @@
 #include <sys/vnode.h>
 #include <sys/mbuf.h>
 #include <sys/mount.h>
+#include <sys/endian.h>
 
 #ifdef USE_MD5_HASH
 #include <sys/md5.h>
@@ -393,6 +394,10 @@ smbfs_smb_setpattr(struct smbnode *np, u
                if (error)
                        break;
                mb_put_uint8(mbp, SMB_DT_ASCII);
+               if (SMB_UNICODE_STRINGS(SSTOVC(ssp))) {
+                       mb_put_padbyte(mbp);
+                       mb_put_uint8(mbp, 0);   /* 1st byte of NULL Unicode 
char */
+               }
                mb_put_uint8(mbp, 0);
                smb_rq_bend(rqp);
                error = smb_rq_simple(rqp);
@@ -909,6 +914,10 @@ smbfs_smb_search(struct smbfs_fctx *ctx)
                mb_put_uint16le(mbp, 0);        /* context length */
                ctx->f_flags &= ~SMBFS_RDD_FINDFIRST;
        } else {
+               if (SMB_UNICODE_STRINGS(vcp)) {
+                       mb_put_padbyte(mbp);
+                       mb_put_uint8(mbp, 0);
+               }
                mb_put_uint8(mbp, 0);   /* file name length */
                mb_put_uint8(mbp, SMB_DT_VARIABLE);
                mb_put_uint16le(mbp, SMB_SKEYLEN);
@@ -1069,7 +1078,7 @@ smbfs_smb_trans2find2(struct smbfs_fctx 
                mb_put_uint32le(mbp, 0);                /* resume key */
                mb_put_uint16le(mbp, flags);
                if (ctx->f_rname)
-                       mb_put_mem(mbp, ctx->f_rname, strlen(ctx->f_rname) + 1, 
MB_MSYSTEM);
+                       mb_put_mem(mbp, ctx->f_rname, ctx->f_rnamelen + 1, 
MB_MSYSTEM);
                else
                        mb_put_uint8(mbp, 0);   /* resume file name */
 #if 0
@@ -1152,7 +1161,10 @@ static int
 smbfs_findopenLM2(struct smbfs_fctx *ctx, struct smbnode *dnp,
        const char *wildcard, int wclen, int attr, struct smb_cred *scred)
 {
-       ctx->f_name = malloc(SMB_MAXFNAMELEN, M_SMBFSDATA, M_WAITOK);
+       if (SMB_UNICODE_STRINGS(SSTOVC(ctx->f_ssp))) {
+               ctx->f_name = malloc(SMB_MAXFNAMELEN * 2, M_SMBFSDATA, 
M_WAITOK);
+       } else
+               ctx->f_name = malloc(SMB_MAXFNAMELEN, M_SMBFSDATA, M_WAITOK);
        if (ctx->f_name == NULL)
                return ENOMEM;
        ctx->f_infolevel = SMB_DIALECT(SSTOVC(ctx->f_ssp)) < 
SMB_DIALECT_NTLM0_12 ?
@@ -1231,7 +1243,10 @@ smbfs_findnextLM2(struct smbfs_fctx *ctx
                SMBERROR("unexpected info level %d\n", ctx->f_infolevel);
                return EINVAL;
        }
-       nmlen = min(size, SMB_MAXFNAMELEN);
+       if (SMB_UNICODE_STRINGS(SSTOVC(ctx->f_ssp))) {
+               nmlen = min(size, SMB_MAXFNAMELEN * 2);
+       } else
+               nmlen = min(size, SMB_MAXFNAMELEN);
        cp = ctx->f_name;
        error = md_get_mem(mbp, cp, nmlen, MB_MSYSTEM);
        if (error)
@@ -1245,8 +1260,12 @@ smbfs_findnextLM2(struct smbfs_fctx *ctx
                        return EBADRPC;
                }
        }
-       if (nmlen && cp[nmlen - 1] == 0)
-               nmlen--;
+       if (SMB_UNICODE_STRINGS(SSTOVC(ctx->f_ssp))) {
+               if (nmlen > 1 && cp[nmlen - 1] == 0 && cp[nmlen - 2] == 0)
+                       nmlen -= 2;
+       } else
+               if (nmlen && cp[nmlen - 1] == 0)
+                       nmlen--;
        if (nmlen == 0)
                return EBADRPC;
 
@@ -1330,10 +1349,17 @@ smbfs_findnext(struct smbfs_fctx *ctx, i
                        error = smbfs_findnextLM2(ctx, limit);
                if (error)
                        return error;
-               if ((ctx->f_nmlen == 1 && ctx->f_name[0] == '.') ||
-                   (ctx->f_nmlen == 2 && ctx->f_name[0] == '.' &&
-                    ctx->f_name[1] == '.'))
-                       continue;
+               if (SMB_UNICODE_STRINGS(SSTOVC(ctx->f_ssp))) {
+                       if ((ctx->f_nmlen == 2 &&
+                            *(u_int16_t *)ctx->f_name == htole16(0x002e)) ||
+                           (ctx->f_nmlen == 4 &&
+                            *(u_int32_t *)ctx->f_name == htole32(0x002e002e)))
+                               continue;
+               } else
+                       if ((ctx->f_nmlen == 1 && ctx->f_name[0] == '.') ||
+                           (ctx->f_nmlen == 2 && ctx->f_name[0] == '.' &&
+                            ctx->f_name[1] == '.'))
+                               continue;
                break;
        }
        smbfs_fname_tolocal(SSTOVC(ctx->f_ssp), ctx->f_name, &ctx->f_nmlen,

Modified: head/sys/fs/smbfs/smbfs_subr.c
==============================================================================
--- head/sys/fs/smbfs/smbfs_subr.c      Fri Nov 18 02:25:54 2011        (r227649)
+++ head/sys/fs/smbfs/smbfs_subr.c      Fri Nov 18 03:05:20 2011        (r227650)
@@ -130,7 +130,10 @@ smb_fphelp(struct mbchain *mbp, struct s
                return smb_put_dmem(mbp, vcp, "\\", 2, caseopt);*/
        while (i--) {
                np = *--npp;
-               error = mb_put_uint8(mbp, '\\');
+               if (SMB_UNICODE_STRINGS(vcp))
+                       error = mb_put_uint16le(mbp, '\\');
+               else
+                       error = mb_put_uint8(mbp, '\\');
                if (error)
                        break;
                error = smb_put_dmem(mbp, vcp, np->n_name, np->n_nmlen, 
caseopt);
@@ -148,6 +151,11 @@ smbfs_fullpath(struct mbchain *mbp, stru
        int caseopt = SMB_CS_NONE;
        int error;
 
+       if (SMB_UNICODE_STRINGS(vcp)) {
+               error = mb_put_padbyte(mbp);
+               if (error)
+                       return error;
+       }
        if (SMB_DIALECT(vcp) < SMB_DIALECT_LANMAN1_0)
                caseopt |= SMB_CS_UPPER;
        if (dnp != NULL) {
@@ -156,7 +164,10 @@ smbfs_fullpath(struct mbchain *mbp, stru
                        return error;
        }
        if (name) {
-               error = mb_put_uint8(mbp, '\\');
+               if (SMB_UNICODE_STRINGS(vcp))
+                       error = mb_put_uint16le(mbp, '\\');
+               else
+                       error = mb_put_uint8(mbp, '\\');
                if (error)
                        return error;
                error = smb_put_dmem(mbp, vcp, name, nmlen, caseopt);
@@ -164,6 +175,8 @@ smbfs_fullpath(struct mbchain *mbp, stru
                        return error;
        }
        error = mb_put_uint8(mbp, 0);
+       if (SMB_UNICODE_STRINGS(vcp) && error == 0)
+               error = mb_put_uint8(mbp, 0);
        return error;
 }
 
@@ -191,6 +204,17 @@ smbfs_fname_tolocal(struct smb_vc *vcp, 
 
                error = iconv_conv_case
                        (vcp->vc_tolocal, (const char **)&ibuf, &ilen, &obuf, 
&olen, copt);
+               if (error && SMB_UNICODE_STRINGS(vcp)) {
+                       /*
+                        * If using unicode, leaving a file name as it was when
+                        * convert fails will cause a problem because the file 
name
+                        * will contain NULL.
+                        * Here, put '?' and give converted file name.
+                        */
+                       *obuf = '?';
+                       olen--;
+                       error = 0;
+               }
                if (!error) {
                        *nmlen = sizeof(outbuf) - olen;
                        memcpy(name, outbuf, *nmlen);

Modified: head/sys/kern/subr_mchain.c
==============================================================================
--- head/sys/kern/subr_mchain.c Fri Nov 18 02:25:54 2011        (r227649)
+++ head/sys/kern/subr_mchain.c Fri Nov 18 03:05:20 2011        (r227650)
@@ -128,6 +128,21 @@ mb_reserve(struct mbchain *mbp, int size
 }
 
 int
+mb_put_padbyte(struct mbchain *mbp)
+{
+       caddr_t dst;
+       char x = 0;
+
+       dst = mtod(mbp->mb_cur, caddr_t) + mbp->mb_cur->m_len;
+
+       /* only add padding if address is odd */
+       if ((unsigned long)dst & 1)
+               return mb_put_mem(mbp, (caddr_t)&x, 1, MB_MSYSTEM);
+       else
+       return 0;
+}
+
+int
 mb_put_uint8(struct mbchain *mbp, uint8_t x)
 {
        return mb_put_mem(mbp, (caddr_t)&x, sizeof(x), MB_MSYSTEM);

Modified: head/sys/libkern/iconv.c
==============================================================================
--- head/sys/libkern/iconv.c    Fri Nov 18 02:25:54 2011        (r227649)
+++ head/sys/libkern/iconv.c    Fri Nov 18 03:05:20 2011        (r227650)
@@ -377,6 +377,18 @@ iconv_sysctl_cslist(SYSCTL_HANDLER_ARGS)
 SYSCTL_PROC(_kern_iconv, OID_AUTO, cslist, CTLFLAG_RD | CTLTYPE_OPAQUE,
            NULL, 0, iconv_sysctl_cslist, "S,xlat", "registered charset pairs");
 
+int
+iconv_add(const char *converter, const char *to, const char *from)
+{
+       struct iconv_converter_class *dcp;
+       struct iconv_cspair *csp;
+
+       if (iconv_lookupconv(converter, &dcp) != 0)
+               return EINVAL;
+
+       return iconv_register_cspair(to, from, dcp, NULL, &csp);
+}
+
 /*
  * Add new charset pair
  */

Added: head/sys/libkern/iconv_ucs.c
==============================================================================
--- /dev/null   00:00:00 1970   (empty, because file is newly added)
+++ head/sys/libkern/iconv_ucs.c        Fri Nov 18 03:05:20 2011        (r227650)
@@ -0,0 +1,540 @@
+/*-
+ * Copyright (c) 2003, 2005 Ryuichiro Imura
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/iconv.h>
+
+#include "iconv_converter_if.h"
+
+/*
+ * "UCS" converter
+ */
+
+#define        KICONV_UCS_COMBINE      0x1
+#define        KICONV_UCS_FROM_UTF8    0x2
+#define        KICONV_UCS_TO_UTF8      0x4
+#define        KICONV_UCS_FROM_LE      0x8
+#define        KICONV_UCS_TO_LE        0x10
+#define        KICONV_UCS_FROM_UTF16   0x20
+#define        KICONV_UCS_TO_UTF16     0x40
+#define        KICONV_UCS_UCS4         0x80
+
+#define        ENCODING_UTF16  "UTF-16BE"
+#define        ENCODING_UTF8   "UTF-8"
+
+static struct {
+       const char *name;
+       int from_flag, to_flag;
+} unicode_family[] = {
+       { "UTF-8",      KICONV_UCS_FROM_UTF8,   KICONV_UCS_TO_UTF8 },
+       { "UCS-2LE",    KICONV_UCS_FROM_LE,     KICONV_UCS_TO_LE },
+       { "UTF-16BE",   KICONV_UCS_FROM_UTF16,  KICONV_UCS_TO_UTF16 },
+       { "UTF-16LE",   KICONV_UCS_FROM_UTF16|KICONV_UCS_FROM_LE,
+           KICONV_UCS_TO_UTF16|KICONV_UCS_TO_LE },
+       { NULL,         0,      0 }
+};
+
+static uint32_t utf8_to_ucs4(const char *src, size_t *utf8width, size_t 
srclen);
+static u_char *ucs4_to_utf8(uint32_t ucs4, char * dst, size_t *utf8width, 
size_t dstlen);
+static uint32_t encode_surrogate(uint32_t code);
+static uint32_t decode_surrogate(const u_char *ucs);
+
+#ifdef MODULE_DEPEND
+MODULE_DEPEND(iconv_ucs, libiconv, 2, 2, 2);
+#endif
+
+/*
+ * UCS converter instance
+ */
+struct iconv_ucs {
+       KOBJ_FIELDS;
+       int                     convtype;
+       struct iconv_cspair *   d_csp;
+       struct iconv_cspair *   d_cspf;
+       void *                  f_ctp;
+       void *                  t_ctp;
+       void *                  ctype;
+};
+
+static int
+iconv_ucs_open(struct iconv_converter_class *dcp,
+       struct iconv_cspair *csp, struct iconv_cspair *cspf, void **dpp)
+{
+       struct iconv_ucs *dp;
+       int i;
+       const char *from, *to;
+
+       dp = (struct iconv_ucs *)kobj_create((struct kobj_class*)dcp, M_ICONV, 
M_WAITOK);
+       to = csp->cp_to;
+       from = cspf ? cspf->cp_from : csp->cp_from;
+
+       dp->convtype = 0;
+
+       if (cspf)
+               dp->convtype |= KICONV_UCS_COMBINE;
+       for (i = 0; unicode_family[i].name; i++) {
+               if (strcmp(from, unicode_family[i].name) == 0)
+                       dp->convtype |= unicode_family[i].from_flag;
+               if (strcmp(to, unicode_family[i].name) == 0)
+                       dp->convtype |= unicode_family[i].to_flag;
+       }
+       if (strcmp(ENCODING_UNICODE, ENCODING_UTF16) == 0)
+               dp->convtype |= KICONV_UCS_UCS4;
+       else
+               dp->convtype &= ~KICONV_UCS_UCS4;
+
+       dp->f_ctp = dp->t_ctp = NULL;
+       if (dp->convtype & KICONV_UCS_COMBINE) {
+               if ((dp->convtype & KICONV_UCS_FROM_UTF8) == 0 &&
+                   (dp->convtype & KICONV_UCS_FROM_LE) == 0) {
+                       iconv_open(ENCODING_UNICODE, from, &dp->f_ctp);
+               }
+               if ((dp->convtype & KICONV_UCS_TO_UTF8) == 0 &&
+                   (dp->convtype & KICONV_UCS_TO_LE) == 0) {
+                       iconv_open(to, ENCODING_UNICODE, &dp->t_ctp);
+               }
+       }
+
+       dp->ctype = NULL;
+       if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_TO_UTF8))
+               iconv_open(KICONV_WCTYPE_NAME, ENCODING_UTF8, &dp->ctype);
+
+       dp->d_csp = csp;
+       if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE)) {
+               if (cspf) {
+                       dp->d_cspf = cspf;
+                       cspf->cp_refcount++;
+               } else
+                       csp->cp_refcount++;
+       }
+       if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
+               csp->cp_refcount++;
+       *dpp = (void*)dp;
+       return 0;
+}
+
+static int
+iconv_ucs_close(void *data)
+{
+       struct iconv_ucs *dp = data;
+
+       if (dp->f_ctp)
+               iconv_close(dp->f_ctp);
+       if (dp->t_ctp)
+               iconv_close(dp->t_ctp);
+       if (dp->ctype)
+               iconv_close(dp->ctype);
+       if (dp->d_cspf)
+               dp->d_cspf->cp_refcount--;
+       else if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE))
+               dp->d_csp->cp_refcount--;
+       if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
+               dp->d_csp->cp_refcount--;
+       kobj_delete((struct kobj*)data, M_ICONV);
+       return 0;
+}
+
+static int
+iconv_ucs_conv(void *d2p, const char **inbuf,
+       size_t *inbytesleft, char **outbuf, size_t *outbytesleft,
+       int convchar, int casetype)
+{
+       struct iconv_ucs *dp = (struct iconv_ucs*)d2p;
+       int ret = 0, i;
+       size_t in, on, ir, or, inlen, outlen, ucslen;
+       const char *src, *p;
+       char *dst;
+       u_char ucs[4], *q;
+       uint32_t code;
+
+       if (inbuf == NULL || *inbuf == NULL || outbuf == NULL || *outbuf == 
NULL)
+               return 0;
+       ir = in = *inbytesleft;
+       or = on = *outbytesleft;
+       src = *inbuf;
+       dst = *outbuf;
+
+       while (ir > 0 && or > 0) {
+
+               /*
+                * The first half of conversion.
+                * (convert any code into ENCODING_UNICODE)
+                */
+               code = 0;
+               p = src;
+               if (dp->convtype & KICONV_UCS_FROM_UTF8) {
+                       /* convert UTF-8 to ENCODING_UNICODE */
+                       inlen = 0;
+                       code = utf8_to_ucs4(p, &inlen, ir);
+                       if (code == 0) {
+                               ret = -1;
+                               break;
+                       }
+
+                       if (casetype == KICONV_FROM_LOWER && dp->ctype) {
+                               code = towlower(code, dp->ctype);
+                       } else if (casetype == KICONV_FROM_UPPER && dp->ctype) {
+                               code = towupper(code, dp->ctype);
+                       }
+
+                       if ((code >= 0xd800 && code < 0xe000) || code >= 
0x110000 ) {
+                               /* reserved for utf-16 surrogate pair */
+                               /* invalid unicode */
+                               ret = -1;
+                               break;
+                       }
+
+                       if (inlen == 4) {
+                               if (dp->convtype & KICONV_UCS_UCS4) {
+                                       ucslen = 4;
+                                       code = encode_surrogate(code);
+                               } else {
+                                       /* can't handle with ucs-2 */
+                                       ret = -1;
+                                       break;
+                               }
+                       } else {
+                               ucslen = 2;
+                       }
+
+                       /* save UCS-4 into ucs[] */
+                       for (q = ucs, i = ucslen - 1 ; i >= 0 ; i--)
+                               *q++ = (code >> (i << 3)) & 0xff;
+
+               } else if (dp->convtype & KICONV_UCS_COMBINE && dp->f_ctp) {
+                       /* convert local code to ENCODING_UNICODE */
+                       ucslen = 4;
+                       inlen = ir;
+                       q = ucs;
+                       ret = iconv_convchr_case(dp->f_ctp, &p, &inlen, (char 
**)&q,
+                           &ucslen, casetype & (KICONV_FROM_LOWER | 
KICONV_FROM_UPPER));
+                       if (ret)
+                               break;
+                       inlen = ir - inlen;
+                       ucslen = 4 - ucslen;
+
+               } else {
+                       /* src code is a proper subset of ENCODING_UNICODE */
+                       q = ucs;
+                       if (dp->convtype & KICONV_UCS_FROM_LE) {
+                               *q = *(p + 1);
+                               *(q + 1) = *p;
+                               p += 2;
+                       } else {
+                               *q = *p++;
+                               *(q + 1) = *p++;
+                       }
+                       if ((*q & 0xfc) == 0xd8) {
+                               if (dp->convtype & KICONV_UCS_UCS4 &&
+                                   dp->convtype & KICONV_UCS_FROM_UTF16) {
+                                       inlen = ucslen = 4;
+                               } else {
+                                       /* invalid unicode */
+                                       ret = -1;
+                                       break;
+                               }
+                       } else {
+                               inlen = ucslen = 2;
+                       }
+                       if (ir < inlen) {
+                               ret = -1;
+                               break;
+                       }
+                       if (ucslen == 4) {
+                               q += 2;
+                               if (dp->convtype & KICONV_UCS_FROM_LE) {
+                                       *q = *(p + 1);
+                                       *(q + 1) = *p;
+                               } else {
+                                       *q = *p++;
+                                       *(q + 1) = *p;
+                               }
+                               if ((*q & 0xfc) != 0xdc) {
+                                       /* invalid unicode */
+                                       ret = -1;
+                                       break;
+                               }
+                       }
+               }
+
+               /*
+                * The second half of conversion.
+                * (convert ENCODING_UNICODE into any code)
+                */
+               p = ucs;
+               if (dp->convtype & KICONV_UCS_TO_UTF8) {
+                       q = (u_char *)dst;
+                       if (ucslen == 4 && dp->convtype & KICONV_UCS_UCS4) {
+                               /* decode surrogate pair */
+                               code = decode_surrogate(p);
+                       } else {
+                               code = (ucs[0] << 8) | ucs[1];
+                       }
+
+                       if (casetype == KICONV_LOWER && dp->ctype) {
+                               code = towlower(code, dp->ctype);
+                       } else if (casetype == KICONV_UPPER && dp->ctype) {
+                               code = towupper(code, dp->ctype);
+                       }
+
+                       outlen = 0;
+                       if (ucs4_to_utf8(code, q, &outlen, or) == NULL) {
+                               ret = -1;
+                               break;
+                       }
+
+                       src += inlen;
+                       ir -= inlen;
+                       dst += outlen;
+                       or -= outlen;
+
+               } else if (dp->convtype & KICONV_UCS_COMBINE && dp->t_ctp) {
+                       ret = iconv_convchr_case(dp->t_ctp, &p, &ucslen, &dst,
+                           &or, casetype & (KICONV_LOWER | KICONV_UPPER));
+                       if (ret)
+                               break;
+
+                       src += inlen;
+                       ir -= inlen;
+
+               } else {
+                       /* dst code is a proper subset of ENCODING_UNICODE */
+                       if (or < ucslen) {
+                               ret = -1;
+                               break;
+                       }
+                       src += inlen;
+                       ir -= inlen;
+                       or -= ucslen;
+                       if (dp->convtype & KICONV_UCS_TO_LE) {
+                               *dst++ = *(p + 1);
+                               *dst++ = *p;
+                               p += 2;
+                       } else {
+                               *dst++ = *p++;
+                               *dst++ = *p++;
+                       }
+                       if (ucslen == 4) {
+                               if ((dp->convtype & KICONV_UCS_UCS4) == 0 ||
+                                   (dp->convtype & KICONV_UCS_TO_UTF16) == 0) {
+                                       ret = -1;
+                                       break;
+                               }
+                               if (dp->convtype & KICONV_UCS_TO_LE) {
+                                       *dst++ = *(p + 1);
+                                       *dst++ = *p;
+                               } else {
+                                       *dst++ = *p++;
+                                       *dst++ = *p;
+                               }
+                       }
+               }
+
+               if (convchar == 1)
+                       break;
+       }
+
+       *inbuf += in - ir;
+       *outbuf += on - or;
+       *inbytesleft -= in - ir;
+       *outbytesleft -= on - or;
+       return (ret);
+}
+
+/*
+ * Class init hook: register the two conversions this converter offers,
+ * UTF-8 -> ENCODING_UNICODE and ENCODING_UNICODE -> UTF-8, via iconv_add().
+ * Returns 0 when both registrations succeed, otherwise the first non-zero
+ * error reported by iconv_add(); the second registration is skipped when
+ * the first one fails.
+ */
+static int
+iconv_ucs_init(struct iconv_converter_class *dcp)
+{
+       int rc;
+
+       rc = iconv_add(ENCODING_UNICODE, ENCODING_UNICODE, ENCODING_UTF8);
+       if (rc == 0) {
+               rc = iconv_add(ENCODING_UNICODE, ENCODING_UTF8,
+                   ENCODING_UNICODE);
+       }
+       return (rc);
+}
+
+/*
+ * Class teardown hook.  Nothing is released here; always returns 0.
+ */
+static int
+iconv_ucs_done(struct iconv_converter_class *dcp)
+{
+       (void)dcp;      /* the class argument is not consulted */
+
+       return (0);
+}
+
+/*
+ * Report the name of this converter class, which is ENCODING_UNICODE.
+ */
+static const char *
+iconv_ucs_name(struct iconv_converter_class *dcp)
+{
+       (void)dcp;      /* the class argument is not consulted */
+
+       return (ENCODING_UNICODE);
+}
+
+/*
+ * Method table for the "ucs" converter: binds each iconv_converter_*
+ * interface method to the iconv_ucs_* implementation in this file.
+ * The {0, 0} entry terminates the table.
+ */
+static kobj_method_t iconv_ucs_methods[] = {
+       KOBJMETHOD(iconv_converter_open,        iconv_ucs_open),
+       KOBJMETHOD(iconv_converter_close,       iconv_ucs_close),
+       KOBJMETHOD(iconv_converter_conv,        iconv_ucs_conv),
+       KOBJMETHOD(iconv_converter_init,        iconv_ucs_init),
+       KOBJMETHOD(iconv_converter_done,        iconv_ucs_done),
+       KOBJMETHOD(iconv_converter_name,        iconv_ucs_name),
+       {0, 0}
+};
+
+/*
+ * NOTE(review): presumably declares the "ucs" converter class from the
+ * method table above, with struct iconv_ucs as the per-instance size;
+ * confirm against the KICONV_CONVERTER definition in sys/iconv.h.
+ */
+KICONV_CONVERTER(ucs, sizeof(struct iconv_ucs));
+
+/*
+ * Decode one UTF-8 sequence starting at src into a UCS-4 code point.
+ *
+ *  src        input bytes; at most srclen of them are examined
+ *  utf8width  on success receives the number of bytes consumed (1..4);
+ *             left untouched on failure
+ *  srclen     number of bytes available at src
+ *
+ * Returns the decoded code point.  Returns 0 when the input is empty or
+ * truncated, has an illegal lead or continuation byte, is an overlong
+ * encoding, or decodes to a value above U+10FFFF (outside the UTF-16
+ * range).  A literal NUL byte also decodes to 0, with *utf8width set
+ * to 1.  Surrogate code points (U+D800..U+DFFF) are not filtered here.
+ */
+static uint32_t
+utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen)
+{
+       const u_char *s = (const u_char *)src;
+       size_t i, w;
+       uint32_t ucs4, min;
+
+       /* never look at the lead byte of an empty buffer */
+       if (srclen == 0)
+               return (0);
+
+       /*
+        * get leading 1 byte from utf-8; "min" is the smallest code point
+        * that legitimately needs a sequence of this width
+        */
+       if ((s[0] & 0x80) == 0) {
+               /*
+                * leading 1 bit is "0"
+                *  utf-8: 0xxxxxxx
+                *  ucs-4: 00000000 00000000 00000000 0xxxxxxx
+                */
+               w = 1;
+               min = 0;
+               /* get trailing 7 bits */
+               ucs4 = s[0] & 0x7f;
+       } else if ((s[0] & 0xe0) == 0xc0) {
+               /*
+                * leading 3 bits are "110"
+                *  utf-8: 110xxxxx 10yyyyyy
+                *  ucs-4: 00000000 00000000 00000xxx xxyyyyyy
+                */
+               w = 2;
+               min = 0x80;
+               /* get trailing 5 bits */
+               ucs4 = s[0] & 0x1f;
+       } else if ((s[0] & 0xf0) == 0xe0) {
+               /*
+                * leading 4 bits are "1110"
+                *  utf-8: 1110xxxx 10yyyyyy 10zzzzzz
+                *  ucs-4: 00000000 00000000 xxxxyyyy yyzzzzzz
+                */
+               w = 3;
+               min = 0x800;
+               /* get trailing 4 bits */
+               ucs4 = s[0] & 0x0f;
+       } else if ((s[0] & 0xf8) == 0xf0) {
+               /*
+                * leading 5 bits are "11110"
+                *  utf-8: 11110www 10xxxxxx 10yyyyyy 10zzzzzz
+                *  ucs-4: 00000000 000wwwxx xxxxyyyy yyzzzzzz
+                */
+               w = 4;
+               min = 0x10000;
+               /* get trailing 3 bits */
+               ucs4 = s[0] & 0x07;
+       } else {
+               /* out of utf-16 range or having illegal bits */
+               return (0);
+       }
+
+       /* the whole sequence must be present in the input */
+       if (srclen < w)
+               return (0);
+
+       /*
+        * get left parts from utf-8
+        */
+       for (i = 1 ; i < w ; i++) {
+               if ((s[i] & 0xc0) != 0x80) {
+                       /* invalid: leading 2 bits are not "10" */
+                       return (0);
+               }
+               /* concatenate trailing 6 bits into ucs4 */
+               ucs4 <<= 6;
+               ucs4 |= s[i] & 0x3f;
+       }
+
+       /*
+        * Reject overlong forms (a value that fits a shorter sequence,
+        * e.g. C0 AF for '/') and values past the last code point that
+        * UTF-16 can represent.
+        */
+       if (ucs4 < min || ucs4 > 0x10ffff)
+               return (0);
+
+       *utf8width = w;
+       return (ucs4);
+}
+
+/*
+ * Encode the code point ucs4 as a UTF-8 sequence stored at dst.
+ *
+ *  ucs4       code point to encode
+ *  dst        output buffer
+ *  utf8width  on success receives the number of bytes written (1..4);
+ *             left untouched on failure
+ *  dstlen     number of bytes available at dst
+ *
+ * Returns dst (as u_char *) on success.  Returns NULL when ucs4 is above
+ * U+10FFFF, the last code point UTF-8 is allowed to carry, or when dst is
+ * too small for the sequence; nothing is written in either case.
+ */
+static u_char *
+ucs4_to_utf8(uint32_t ucs4, char *dst, size_t *utf8width, size_t dstlen)
+{
+       u_char lead, *p;
+       size_t i, w;
+
+       /*
+        * determine utf-8 width and leading bits
+        */
+       if (ucs4 < 0x80) {
+               w = 1;
+               lead = 0;       /* "0" */
+       } else if (ucs4 < 0x800) {
+               w = 2;
+               lead = 0xc0;    /* "110" */
+       } else if (ucs4 < 0x10000) {
+               w = 3;
+               lead = 0xe0;    /* "1110" */
+       } else if (ucs4 <= 0x10ffff) {
+               w = 4;
+               lead = 0xf0;    /* "11110" */
+       } else {
+               /* beyond the Unicode code space */
+               return (NULL);
+       }
+
+       if (dstlen < w)
+               return (NULL);
+
+       /*
+        * construct utf-8, filling continuation bytes from the last one
+        * backwards so that only the lead bits remain in ucs4 at the end
+        */
+       p = (u_char *)dst;
+       for (i = w - 1 ; i >= 1 ; i--) {
+               /* get trailing 6 bits and put it with leading bits "10" */
+               *(p + i) = (ucs4 & 0x3f) | 0x80;
+               ucs4 >>= 6;
+       }
+       *p = ucs4 | lead;
+
+       *utf8width = w;
+
+       return (p);
+}

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
_______________________________________________
svn-src-head@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"

Reply via email to