[ntfs-3g-devel] patch for utf-8 support

Nikos Mavrogiannopoulos Wed, 24 Dec 2008 09:48:56 -0800

Hello,
 I attach a patch that adds a configure option "--enable-utf8-mode" that
can be used to enable UTF-8 characters on systems (e.g. embedded ones)
whose libc does not support them. It is based on the Xutf8
implementation by Jean-Marc Lienher.


regards,
Nikos

Index: src/ntfs-3g.c
===================================================================
--- a/src/ntfs-3g.c	(revision 733)
+++ b/src/ntfs-3g.c	(working copy)
@@ -160,8 +160,13 @@
 
 static const char *usage_msg = 
 "\n"
-"%s %s %s %d - Third Generation NTFS Driver\n"
+"%s %s %s %d - Third Generation NTFS Driver"
+#ifdef NTFS_UTF8
+" (UTF8 mode)\n"
+#else
 "\n"
+#endif
+"\n"
 "Copyright (C) 2006-2008 Szabolcs Szakacsits\n"
 "Copyright (C) 2005-2007 Yura Pakhuchiy\n"
 "\n"
@@ -358,7 +363,9 @@
 	stream_name_len = ntfs_fuse_parse_path(org_path, &path, &stream_name);
 	if (stream_name_len < 0)
 		return stream_name_len;
+
 	memset(stbuf, 0, sizeof(struct stat));
+
 	ni = ntfs_pathname_to_inode(ctx->vol, NULL, path);
 	if (!ni) {
 		res = -errno;
Index: config.h.in
===================================================================
--- a/config.h.in	(revision 733)
+++ b/config.h.in	(working copy)
@@ -270,6 +270,9 @@
 /* Don't use default IO ops */
 #undef NO_NTFS_DEVICE_DEFAULT_IO_OPS
 
+/* Define to 1 if utf8 mode enabled */
+#undef NTFS_UTF8
+
 /* Name of package */
 #undef PACKAGE
 
Index: configure.ac
===================================================================
--- a/configure.ac	(revision 733)
+++ b/configure.ac	(working copy)
@@ -53,6 +53,13 @@
 )
 
 AC_ARG_ENABLE(
+	[utf8-mode],
+	[AS_HELP_STRING([--enable-utf8-mode],[enable UTF8 mode of operation (avoid using locale)])],
+	[utf8_mode="yes"],
+	[utf8_mode="no"]
+)
+
+AC_ARG_ENABLE(
 	[pedantic],
 	[AS_HELP_STRING([--enable-pedantic],[enable compile pedantic mode])],
 	,
@@ -302,6 +309,14 @@
 	)
 fi
 
+if test "${utf8_mode}" = "yes"; then
+	AC_DEFINE(
+		[NTFS_UTF8],
+		[1],
+		[Define to 1 if utf8 mode enabled]
+	)
+fi
+
 test "${enable_device_default_io_ops}" = "no" && AC_DEFINE(
 	[NO_NTFS_DEVICE_DEFAULT_IO_OPS],
 	[1],
Index: libntfs-3g/Xutf8.c
===================================================================
--- a/libntfs-3g/Xutf8.c	(revision 0)
+++ b/libntfs-3g/Xutf8.c	(revision 0)
@@ -0,0 +1,186 @@
+/*
+ * "$Id:  $"
+ *
+ * Unicode to UTF-8 conversion functions.
+ *
+ *      Copyright (c) 2000,2001 by O'ksi'D.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * O'KSI'D BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Except as contained in this notice, the name of the O'ksi'D shall
+ * not be used in advertising or otherwise to promote the sale, use or other
+ * dealings in this Software without prior written authorization from O'ksi'D.
+ *
+ *
+ *  Author: Jean-Marc Lienher ( http://oksid.ch )
+ */
+
+#include "Xutf8.h"
+
+/*** NOTE : all functions are LIMITED to 24 bits Unicode values !!! ***/
+
+/* 
+ * Decodes the first UTF-8 char of buf (at most len bytes) into *ucs and
+ * returns its byte length (1 to 5). Returns 0 if buf or ucs is a null pointer.
+ * Returns -1 and stores '?' in *ucs for any sequence it does not accept.
+ */
+int
+XConvertUtf8ToUcs(
+        const unsigned char     *buf,
+        int                     len,
+        unsigned int          	*ucs)
+{
+    if (buf == 0 || ucs == 0) return 0;
+    
+    if (buf[0] & 0x80) {
+	if (buf[0] & 0x40) {
+	  if (buf[0] & 0x20) {
+	    if (buf[0] & 0x10) {
+	      if (buf[0] & 0x08) {
+		if (buf[0] & 0x04) {
+		  if (buf[0] & 0x02) {
+			/* lead byte 0xFE/0xFF: bad UTF-8 string, falls through to -1 */
+		  } else {
+			/* 0x04000000 - 0x7FFFFFFF: 6-byte form is not decoded, falls through to -1 */
+		  }	
+		} else if (len > 4 
+				&& (buf[1] & 0xC0) == 0x80
+				&& (buf[2] & 0xC0) == 0x80
+				&& (buf[3] & 0xC0) == 0x80
+				&& (buf[4] & 0xC0) == 0x80) 
+		{
+		  /* 0x00200000 - 0x03FFFFFF, but only values below 0x01000000 are accepted */
+                  *ucs =  ((buf[0] & ~0xF8) << 24) +
+                          ((buf[1] & ~0x80) << 18) +
+                          ((buf[2] & ~0x80) << 12) +
+                          ((buf[3] & ~0x80) << 6) +
+                           (buf[4] & ~0x80);
+		  if (*ucs > 0x001FFFFF && *ucs < 0x01000000) return 5;
+		}
+              } else if (len > 3 
+				&& (buf[1] & 0xC0) == 0x80
+				&& (buf[2] & 0xC0) == 0x80
+				&& (buf[3] & 0xC0) == 0x80) 
+	      {
+		/* 0x00010000 - 0x001FFFFF; smaller values in this form are rejected below */
+                *ucs =  ((buf[0] & ~0xF0) << 18) +
+                        ((buf[1] & ~0x80) << 12) +
+                        ((buf[2] & ~0x80) << 6) +
+                         (buf[3] & ~0x80);
+	        if (*ucs > 0x0000FFFF) return 4;
+              }
+	    } else if (len > 2 && 
+			(buf[1] & 0xC0) == 0x80 && 
+			(buf[2] & 0xC0) == 0x80) 
+	    {
+	      /* 0x00000800 - 0x0000FFFF; smaller values in this form are rejected below */
+              *ucs =  ((buf[0] & ~0xE0) << 12) +
+               	      ((buf[1] & ~0x80) << 6) +
+                       (buf[2] & ~0x80);
+              if (*ucs > 0x000007FF) return 3;
+	    }	
+	  } else if (len > 1 && (buf[1] & 0xC0) == 0x80) {
+	    /* 0x00000080 - 0x000007FF; smaller values in this form are rejected below */
+	    *ucs = ((buf[0] & ~0xC0) << 6) +
+		    (buf[1] & ~0x80);
+	    if (*ucs > 0x0000007F) return 2;
+	  }
+	}
+      } else if (len > 0) {
+	/* 0x00000000 - 0x0000007F: single byte, copied as is (a NUL byte also yields 1) */
+	*ucs = buf[0];
+	return 1;
+      } 
+
+      *ucs = (unsigned int) '?'; /* bad utf-8 string (or len too short for the sequence) */
+      return -1;
+}
+
+/* 
+ * Encodes ucs as UTF-8 into buf (no terminator is written); buf must hold 5 bytes.
+ * Returns bytes written (1 to 5), 0 if buf is null, -1 with buf[0] = '?' if ucs >= 0x01000000.
+ */
+int 
+XConvertUcsToUtf8(
+	unsigned int 	ucs, 
+	char 		*buf)
+{
+        if (buf == 0) return 0;
+
+	if (ucs < 0x000080) {
+		buf[0] = ucs;
+		return 1;
+	} else if (ucs < 0x000800) {
+		buf[0] = 0xC0 | (ucs >> 6);
+		buf[1] = 0x80 | (ucs & 0x3F);
+		return 2;
+	} else if (ucs < 0x010000) { 
+		buf[0] = 0xE0 | (ucs >> 12);
+		buf[1] = 0x80 | ((ucs >> 6) & 0x3F);
+		buf[2] = 0x80 | (ucs & 0x3F);
+		return 3;
+	} else if (ucs < 0x00200000) {
+		buf[0] = 0xF0 | (ucs >> 18);
+		buf[1] = 0x80 | ((ucs >> 12) & 0x3F);
+		buf[2] = 0x80 | ((ucs >> 6) & 0x3F);
+		buf[3] = 0x80 | (ucs & 0x3F);
+		return 4;
+	} else if (ucs < 0x01000000) {
+		buf[0] = 0xF8 | (ucs >> 24);
+		buf[1] = 0x80 | ((ucs >> 18) & 0x3F);
+		buf[2] = 0x80 | ((ucs >> 12) & 0x3F);
+		buf[3] = 0x80 | ((ucs >> 6) & 0x3F);
+		buf[4] = 0x80 | (ucs & 0x3F);
+		return 5;
+	}
+	buf[0] = '?'; /* value does not fit in 24 bits: emit a placeholder byte */
+	return -1;
+}
+
+/* 
+ * Returns the byte length of the first UTF-8 char in buf (at most len bytes),
+ * as reported by XConvertUtf8ToUcs(): -1 if not valid, 0 if buf is null.
+ */
+int
+        XUtf8CharByteLen(
+                         const unsigned char     *buf,
+                         int                     len)
+{
+    unsigned int ucs; /* decoded value is discarded; only the length is returned */
+    return XConvertUtf8ToUcs(buf, len, &ucs);
+}
+
+/*
+ * Returns the number of chars in the first len bytes of buf (0 if len <= 0).
+ */
+int 
+        XCountUtf8Char(
+                       const unsigned char 	*buf, 
+                       int 			len)
+{
+    int i = 0;
+    int nbc = 0;
+    while (i < len) {
+        int cl = XUtf8CharByteLen(buf + i, len - i);
+        if (cl < 1) cl = 1; /* undecodable byte: count it as one char and skip it */
+        nbc++;
+        i += cl;
+    }
+    return nbc;
+}
Index: libntfs-3g/Xutf8.h
===================================================================
--- a/libntfs-3g/Xutf8.h	(revision 0)
+++ b/libntfs-3g/Xutf8.h	(revision 0)
@@ -0,0 +1,3 @@
+int XConvertUtf8ToUcs(const unsigned char     *buf, int len, unsigned int *ucs); /* decode one UTF-8 char; returns its byte length, -1 if invalid */
+int XConvertUcsToUtf8(unsigned int 	ucs, char *buf); /* encode one value; buf must hold 5 bytes; returns bytes written, -1 if out of range */
+int XCountUtf8Char(const unsigned char 	*buf, int len); /* number of chars in the first len bytes of buf */
Index: libntfs-3g/unistr.c
===================================================================
--- a/libntfs-3g/unistr.c	(revision 733)
+++ b/libntfs-3g/unistr.c	(working copy)
@@ -46,6 +46,7 @@
 #include "debug.h"
 #include "logging.h"
 #include "misc.h"
+#include <assert.h>
 
 /*
  * IMPORTANT
@@ -373,6 +374,20 @@
 			err_val, ic, upcase, upcase_len);
 }
 
+#ifdef NTFS_UTF8
+# undef HAVE_MBSINIT /* select the mbtowc()/wctomb() branches below, not the mbstate_t ones */
+# define wchar_t unsigned int /* same type the Xutf8 functions use for a decoded value */
+
+#include "Xutf8.h"
+/* Route the libc multibyte calls used in this file to the Xutf8 functions. */
+#define mbtowc(a,b,c) XConvertUtf8ToUcs( b, c, a)
+#define wctomb(x,y) XConvertUcsToUtf8(y, x)
+#define mbstowcs(a, b, c) XCountUtf8Char(b, strlen(b)) /* a and c are ignored: only counts chars */
+/* NOTE(review): libc mbtowc() returns 0 on a NUL char; XConvertUtf8ToUcs() returns 1 */
+/* for it when len > 0 and -1 when len == 0 -- confirm the callers' `if (!cnt)` checks. */
+
+#endif /* NTFS_UTF8*/
+
 /**
  * ntfs_ucstombs - convert a little endian Unicode string to a multibyte string
  * @ins:	input Unicode string buffer
@@ -439,8 +454,9 @@
 				return -1;
 			}
 			tc = ntfs_malloc((mbs_len + 64) & ~63);
-			if (!tc)
+			if (!tc) {
 				goto err_out;
+                        }
 			memcpy(tc, mbs, mbs_len);
 			mbs_len = (mbs_len + 64) & ~63;
 			free(mbs);
@@ -454,10 +470,11 @@
 #ifdef HAVE_MBSINIT
 		cnt = wcrtomb(mbs + o, wc, &mbstate);
 #else
-		cnt = wctomb(mbs + o, wc);
+                cnt = wctomb(mbs + o, wc);
 #endif
-		if (cnt == -1)
+		if (cnt == -1) {
 			goto err_out;
+                }
 		if (cnt <= 0) {
 			ntfs_log_debug("Eeek. cnt <= 0, cnt = %i\n", cnt);
 			errno = EINVAL;
@@ -539,7 +556,7 @@
 	}
 #endif
 #elif !defined(DJGPP)
-	ins_len = mbstowcs(NULL, s, 0);
+        ins_len = mbstowcs(NULL, s, 0);
 #else
 	/* Eeek!!! DJGPP has broken mbstowcs() implementation!!! */
 	ins_len = strlen(ins);
@@ -580,7 +597,7 @@
 #ifdef HAVE_MBSINIT
 		cnt = mbrtowc(&wc, ins + i, ins_size - i, &mbstate);
 #else
-		cnt = mbtowc(&wc, ins + i, ins_size - i);
+                cnt = mbtowc(&wc, ins + i, ins_size - i);
 #endif
 		if (!cnt)
 			break;
@@ -594,6 +611,7 @@
 		/* Make sure we are not overflowing the NTFS Unicode set. */
 		if ((unsigned long)wc >= (unsigned long)(1 <<
 				(8 * sizeof(ntfschar)))) {
+                        ntfs_log_debug("Eeek. cnt = %d %x\n", wc, wc);
 			errno = EILSEQ;
 			goto err_out;
 		}
Index: libntfs-3g/Makefile.am
===================================================================
--- a/libntfs-3g/Makefile.am	(revision 733)
+++ b/libntfs-3g/Makefile.am	(working copy)
@@ -40,7 +40,8 @@
 	security.c 	\
 	unistr.c 	\
 	version.c 	\
-	volume.c
+	volume.c	\
+	Xutf8.c
 
 if NTFS_DEVICE_DEFAULT_IO_OPS
 if WINDOWS

------------------------------------------------------------------------------

_______________________________________________
ntfs-3g-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/ntfs-3g-devel

[ntfs-3g-devel] patch for utf-8 support

Reply via email to