Update UTF-8 locale ctype data (was: Re: ls(1) multibyte support)

2011-01-14 Thread Stefan Sperling
On Thu, Jan 06, 2011 at 07:52:19PM +0300, Alexander Polakov wrote:
> * Alexander Polakov  [110105 17:20]:
> > Hi,
> > 
> > here's an updated version.
> > 
> > 1) en_US.UTF-8.src updates from FreeBSD

Let's start with those.

These changes are all fine, I checked them against Unicode 5.2.
http://www.unicode.org/Public/5.2.0/charts/CodeCharts-noHan.pdf

The diff below (from Alexander) brings us up to par with FreeBSD.
Many updates could be made to this file to support additional
characters listed in Unicode 5.2.0 (or even 6.0.0).
But that can be done later.

Can someone ok this? Thanks in advance.

Index: share/locale/ctype/en_US.UTF-8.src
===
RCS file: /OpenBSD/src/share/locale/ctype/en_US.UTF-8.src,v
retrieving revision 1.1
diff -u -r1.1 en_US.UTF-8.src
--- share/locale/ctype/en_US.UTF-8.src  7 Aug 2005 10:03:45 -   1.1
+++ share/locale/ctype/en_US.UTF-8.src  6 Jan 2011 16:24:39 -
@@ -491,9 +491,9 @@
  * U+0300 - U+036F : Combining Diacritical Marks
  */
 
-GRAPH 0x0300 - 0x034f  0x0360 - 0x036f
-PRINT 0x0300 - 0x034f  0x0360 - 0x036f
-SWIDTH1   0x0300 - 0x034f  0x0360 - 0x036f
+GRAPH 0x0300 - 0x034e  0x0350 - 0x036f
+PRINT 0x0300 - 0x034e  0x0350 - 0x036f
+SWIDTH0   0x0300 - 0x034e  0x0350 - 0x036f
 
 MAPUPPER  < 0x0345 0x0399 >
 
@@ -583,7 +583,7 @@
 LOWER 0x04c8  0x04ca  0x04cc  0x04ce  0x04d1  0x04d3  0x04d5
 LOWER 0x04d7  0x04d9  0x04db  0x04dd  0x04df  0x04e1  0x04e3
 LOWER 0x04e5  0x04e7  0x04e9  0x04eb  0x04ed  0x04ef  0x04f1
-LOWER 0x04f3  0x04f5  0x04f9
+LOWER 0x04f3  0x04f5  0x04f7  0x04f9
 PUNCT 0x0482
 UPPER 0x0400 - 0x042f  0x0460  0x0462  0x0464  0x0466  0x0468
 UPPER 0x046a  0x046c  0x046e  0x0470  0x0472  0x0474  0x0476
@@ -595,9 +595,10 @@
 UPPER 0x04c5  0x04c7  0x04c9  0x04cb  0x04cd  0x04d0  0x04d2
 UPPER 0x04d4  0x04d6  0x04d8  0x04da  0x04dc  0x04de  0x04e0
 UPPER 0x04e2  0x04e4  0x04e6  0x04e8  0x04ea  0x04ec  0x04ee
-UPPER 0x04f0  0x04f2  0x04f4  0x04f8
-PRINT 0x0400 - 0x0486  0x0488 - 0x04ce  0x04d0 - 0x04f5  0x04f8  0x04f9
-SWIDTH1   0x0400 - 0x0486  0x0488 - 0x04ce  0x04d0 - 0x04f5  0x04f8  0x04f9
+UPPER 0x04f0  0x04f2  0x04f4  0x04f6  0x04f8
+PRINT 0x0400 - 0x0486  0x0488 - 0x04ce  0x04d0 - 0x04f9
+SWIDTH0   0x0483 - 0x0486  0x0488 - 0x0489
+SWIDTH1   0x0400 - 0x0482  0x048a - 0x04ce  0x04d0 - 0x04f9
 
 MAPUPPER  < 0x0430 - 0x044f : 0x0410 >
 MAPUPPER  < 0x0450 - 0x045f : 0x0400 >
@@ -671,6 +672,7 @@
 MAPUPPER  < 0x04f1 0x04f0 >
 MAPUPPER  < 0x04f3 0x04f2 >
 MAPUPPER  < 0x04f5 0x04f4 >
+MAPUPPER  < 0x04f7 0x04f6 >
 MAPUPPER  < 0x04f9 0x04f8 >
 MAPLOWER  < 0x0400 - 0x040f : 0x0450 >
 MAPLOWER  < 0x0410 - 0x042f : 0x0430 >
@@ -744,6 +746,7 @@
 MAPLOWER  < 0x04f0 0x04f1 >
 MAPLOWER  < 0x04f2 0x04f3 >
 MAPLOWER  < 0x04f4 0x04f5 >
+MAPLOWER  < 0x04f6 0x04f7 >
 MAPLOWER  < 0x04f8 0x04f9 >
 
 
@@ -1052,7 +1055,8 @@
 GRAPH 0x0e01 - 0x0e3a  0x0e3f - 0x0e5b
 PUNCT 0x0e3f  0x0e4f  0x0e5a  0x0e5b
 PRINT 0x0e01 - 0x0e3a  0x0e3f - 0x0e5b
-SWIDTH1   0x0e01 - 0x0e3a  0x0e3f - 0x0e5b
+SWIDTH0   0x0e31   0x0e34 - 0x0e3a  0x0e47 - 0x0e4e
+SWIDTH1   0x0e01 - 0x0e30  0x0e32 - 0x0e33  0x0e3f - 0x0e46  0x0e4f - 0x0e5b
 
 TODIGIT   < 0x0e50 - 0x0e59 : 0x0000 >
 
@@ -1283,6 +1287,14 @@
 
 TODIGIT   < 0x1810 - 0x1819 : 0x0000 >
 
+/*
+ * U+1DC0 - U+1DFF : Combining Diacritical Marks Supplement
+ */
+
+GRAPH 0x1DC0 - 0x1DC3
+PRINT 0x1DC0 - 0x1DC3
+SWIDTH0   0x1DC0 - 0x1DC3
+
 
 /*
  * U+1E00 - U+1EFF : Latin Extended Additional
@@ -1672,7 +1684,8 @@
 BLANK 0x2000 - 0x200b  0x202f  0x205f
 PRINT 0x2000 - 0x200b  0x2010 - 0x2029  0x202f - 0x2052  0x2057
 PRINT 0x205f
-SWIDTH1   0x2000 - 0x200b  0x2010 - 0x2029  0x202f - 0x2052  0x2057
+SWIDTH1   0x2000 - 0x200a  0x2010 - 0x2029  0x202f - 0x2052  0x2057
+SWIDTH0   0x200b - 0x200d
 SWIDTH1   0x205f
 
 
@@ -1707,9 +1720,9 @@
  * U+20D0 - U+20FF : Combining Diacritical Marks for Symbols
  */
 
-GRAPH 0x20d0 - 0x20ea
-PRINT 0x20d0 - 0x20ea
-SWIDTH1   0x20d0 - 0x20ea
+GRAPH 0x20d0 - 0x20eb
+PRINT 0x20d0 - 0x20eb
+SWIDTH0   0x20d0 - 0x20eb
 
 
 /*
@@ -1987,7 +2000,8 @@
 PUNCT 0x309b  0x309c
 PRINT 0x3041 - 0x3096  0x3099 - 0x309f
 PHONOGRAM 0x3041 - 0x3096  0x309f
-SWIDTH2   0x3041 - 0x3096  0x3099 - 0x309f
+SWIDTH0   0x3099 - 0x309a
+SWIDTH2   0x3041 - 0x3096  0x309b - 0x309f
 
 
 /*
@@ -2211,7 +2225,7 @@
 
 GRAPH 0xfe20 - 0xfe23
 PRINT 0xfe20 - 0xfe23
-SWIDTH1   0xfe20 - 0xfe23
+SWIDTH0   0xfe20 - 0xfe23
 
 
 /*
@@ -2333,8 +2347,13 @@
 GRAPH 0x1d100 - 0x1d126  0x1d12a - 0x1d172  0x1d17b - 0x1d1dd
 PUNCT 0x1d100 - 0x1d126  0x1d12a - 0x1d164  0x1d16a - 0x1d16c
 PUNCT 0x1d183  0x1d184  0x1d18c - 0x1d1a9  0x1d1ae - 0x1d1dd
-PRINT 0x1d100 - 0x1d126  0x1d12a - 0x1d172  0x1d17b - 0x1d1dd
-SWIDTH1   0x1d100 - 0x1d126  0x1d12a - 0x1d172  0x1d17b - 0x1d1dd
+PRINT 0x1d100 - 0x1d126  0x1d12a - 0x1d158  0x1d15a - 0x1d172
+PRINT 0x1d17b - 0

Re: Update UTF-8 locale ctype data (was: Re: ls(1) multibyte support)

2011-03-04 Thread Stefan Sperling
On Sat, Jan 15, 2011 at 12:44:51AM +0100, Stefan Sperling wrote:
> On Fri, Jan 14, 2011 at 05:21:46PM +0100, Stefan Sperling wrote:
> > On Thu, Jan 06, 2011 at 07:52:19PM +0300, Alexander Polakov wrote:
> > > * Alexander Polakov  [110105 17:20]:
> > > > Hi,
> > > > 
> > > > here's an updated version.
> > > > 
> > > > 1) en_US.UTF-8.src updates from FreeBSD
> > 
> > Let's start with those.
> > 
> > These changes are all fine, I checked them against Unicode 5.2.
> > http://www.unicode.org/Public/5.2.0/charts/CodeCharts-noHan.pdf
> > 
> > The diff below (from Alexander) brings us up to par with FreeBSD.
> > Many updates could be made to this file to support additional
> > characters listed in Unicode 5.2.0 (or even 6.0.0).
> > But that can be done later.
> > 
> > Can someone ok this? Thanks in advance.
> 
> Before the ctype changes can go in, we'll need to commit this part from
> Alexander's diff to fix mklocale (caught by nicm@, thanks!)

Can this go in now?
Any OKs?

Index: lib/libc/locale/runetype.h
===
RCS file: /cvs/src/lib/libc/locale/runetype.h,v
retrieving revision 1.5
diff -u -p -r1.5 runetype.h
--- lib/libc/locale/runetype.h  8 Oct 2007 08:17:15 -   1.5
+++ lib/libc/locale/runetype.h  14 Jan 2011 23:34:28 -
@@ -69,9 +69,9 @@ typedef uint32_t _RuneType;
 #define	_RUNETYPE_I	0x00080000U	/* Ideogram */
 #define	_RUNETYPE_T	0x00100000U	/* Special */
 #define	_RUNETYPE_Q	0x00200000U	/* Phonogram */
-#define	_RUNETYPE_SWM	0xc0000000U	/* Mask to get screen width data */
+#define	_RUNETYPE_SWM	0xe0000000U	/* Mask to get screen width data */
 #define	_RUNETYPE_SWS	30		/* Bits to shift to get width */
-#define	_RUNETYPE_SW0	0x00000000U	/* 0 width character */
+#define	_RUNETYPE_SW0	0x20000000U	/* 0 width character */
 #define	_RUNETYPE_SW1	0x40000000U	/* 1 width character */
 #define	_RUNETYPE_SW2	0x80000000U	/* 2 width character */
 #define	_RUNETYPE_SW3	0xc0000000U	/* 3 width character */
Index: share/locale/ctype/en_US.UTF-8.src
===
RCS file: /cvs/src/share/locale/ctype/en_US.UTF-8.src,v
retrieving revision 1.1
diff -u -p -r1.1 en_US.UTF-8.src
--- share/locale/ctype/en_US.UTF-8.src  7 Aug 2005 10:03:45 -   1.1
+++ share/locale/ctype/en_US.UTF-8.src  15 Jan 2011 15:49:26 -
@@ -491,9 +491,9 @@ SWIDTH1   0x02b0 - 0x02ee
  * U+0300 - U+036F : Combining Diacritical Marks
  */
 
-GRAPH 0x0300 - 0x034f  0x0360 - 0x036f
-PRINT 0x0300 - 0x034f  0x0360 - 0x036f
-SWIDTH1   0x0300 - 0x034f  0x0360 - 0x036f
+GRAPH 0x0300 - 0x034e  0x0350 - 0x036f
+PRINT 0x0300 - 0x034e  0x0350 - 0x036f
+SWIDTH0   0x0300 - 0x034e  0x0350 - 0x036f
 
 MAPUPPER  < 0x0345 0x0399 >
 
@@ -583,7 +583,7 @@ LOWER 0x04b9  0x04bb  0x04bd  0x04bf
 LOWER 0x04c8  0x04ca  0x04cc  0x04ce  0x04d1  0x04d3  0x04d5
 LOWER 0x04d7  0x04d9  0x04db  0x04dd  0x04df  0x04e1  0x04e3
 LOWER 0x04e5  0x04e7  0x04e9  0x04eb  0x04ed  0x04ef  0x04f1
-LOWER 0x04f3  0x04f5  0x04f9
+LOWER 0x04f3  0x04f5  0x04f7  0x04f9
 PUNCT 0x0482
 UPPER 0x0400 - 0x042f  0x0460  0x0462  0x0464  0x0466  0x0468
 UPPER 0x046a  0x046c  0x046e  0x0470  0x0472  0x0474  0x0476
@@ -595,9 +595,10 @@ UPPER 0x04b8  0x04ba  0x04bc  0x04be
 UPPER 0x04c5  0x04c7  0x04c9  0x04cb  0x04cd  0x04d0  0x04d2
 UPPER 0x04d4  0x04d6  0x04d8  0x04da  0x04dc  0x04de  0x04e0
 UPPER 0x04e2  0x04e4  0x04e6  0x04e8  0x04ea  0x04ec  0x04ee
-UPPER 0x04f0  0x04f2  0x04f4  0x04f8
-PRINT 0x0400 - 0x0486  0x0488 - 0x04ce  0x04d0 - 0x04f5  0x04f8  0x04f9
-SWIDTH1   0x0400 - 0x0486  0x0488 - 0x04ce  0x04d0 - 0x04f5  0x04f8  0x04f9
+UPPER 0x04f0  0x04f2  0x04f4  0x04f6  0x04f8
+PRINT 0x0400 - 0x0486  0x0488 - 0x04ce  0x04d0 - 0x04f9
+SWIDTH0   0x0483 - 0x0486  0x0488 - 0x0489
+SWIDTH1   0x0400 - 0x0482  0x048a - 0x04ce  0x04d0 - 0x04f9
 
 MAPUPPER  < 0x0430 - 0x044f : 0x0410 >
 MAPUPPER  < 0x0450 - 0x045f : 0x0400 >
@@ -671,6 +672,7 @@ MAPUPPER  < 0x04ef 0x04ee >
 MAPUPPER  < 0x04f1 0x04f0 >
 MAPUPPER  < 0x04f3 0x04f2 >
 MAPUPPER  < 0x04f5 0x04f4 >
+MAPUPPER  < 0x04f7 0x04f6 >
 MAPUPPER  < 0x04f9 0x04f8 >
 MAPLOWER  < 0x0400 - 0x040f : 0x0450 >
 MAPLOWER  < 0x0410 - 0x042f : 0x0430 >
@@ -744,6 +746,7 @@ MAPLOWER  < 0x04ee 0x04ef >
 MAPLOWER  < 0x04f0 0x04f1 >
 MAPLOWER  < 0x04f2 0x04f3 >
 MAPLOWER  < 0x04f4 0x04f5 >
+MAPLOWER  < 0x04f6 0x04f7 >
 MAPLOWER  < 0x04f8 0x04f9 >
 
 
@@ -1052,7 +1055,8 @@ DIGIT 0x0e50 - 0x0e59
 GRAPH 0x0e01 - 0x0e3a  0x0e3f - 0x0e5b
 PUNCT 0x0e3f  0x0e4f  0x0e5a  0x0e5b
 PRINT 0x0e01 - 0x0e3a  0x0e3f - 0x0e5b
-SWIDTH1   0x0e01 - 0x0e3a  0x0e3f - 0x0e5b
+SWIDTH0   0x0e31   0x0e34 - 0x0e3a  0x0e47 - 0x0e4e
+SWIDTH1   0x0e01 - 0x0e30  0x0e32 - 0x0e33  0x0e3f - 0x0e46  0x0e4f - 0x0e5b
 
 TODIGIT   < 0x0e50 - 0x0

Re: Update UTF-8 locale ctype data (was: Re: ls(1) multibyte support)

2011-01-14 Thread Stefan Sperling
On Fri, Jan 14, 2011 at 05:21:46PM +0100, Stefan Sperling wrote:
> On Thu, Jan 06, 2011 at 07:52:19PM +0300, Alexander Polakov wrote:
> > * Alexander Polakov  [110105 17:20]:
> > > Hi,
> > > 
> > > here's an updated version.
> > > 
> > > 1) en_US.UTF-8.src updates from FreeBSD
> 
> Let's start with those.
> 
> These changes are all fine, I checked them against Unicode 5.2.
> http://www.unicode.org/Public/5.2.0/charts/CodeCharts-noHan.pdf
> 
> The diff below (from Alexander) brings us up to par with FreeBSD.
> Many updates could be made to this file to support additional
> characters listed in Unicode 5.2.0 (or even 6.0.0).
> But that can be done later.
> 
> Can someone ok this? Thanks in advance.

Before the ctype changes can go in, we'll need to commit this part from
Alexander's diff to fix mklocale (caught by nicm@, thanks!)

These symbols are internal to libc, with exception of mklocale.
Can this go in during ABI lock?

Index: lib/libc/locale/runetype.h
===
RCS file: /OpenBSD/src/lib/libc/locale/runetype.h,v
retrieving revision 1.5
diff -u -r1.5 runetype.h
--- lib/libc/locale/runetype.h  8 Oct 2007 08:17:15 -   1.5
+++ lib/libc/locale/runetype.h  6 Jan 2011 16:24:20 -
@@ -69,9 +69,9 @@
 #define	_RUNETYPE_I	0x00080000U	/* Ideogram */
 #define	_RUNETYPE_T	0x00100000U	/* Special */
 #define	_RUNETYPE_Q	0x00200000U	/* Phonogram */
-#define	_RUNETYPE_SWM	0xc0000000U	/* Mask to get screen width data */
+#define	_RUNETYPE_SWM	0xe0000000U	/* Mask to get screen width data */
 #define	_RUNETYPE_SWS	30		/* Bits to shift to get width */
-#define	_RUNETYPE_SW0	0x00000000U	/* 0 width character */
+#define	_RUNETYPE_SW0	0x20000000U	/* 0 width character */
 #define	_RUNETYPE_SW1	0x40000000U	/* 1 width character */
 #define	_RUNETYPE_SW2	0x80000000U	/* 2 width character */
 #define	_RUNETYPE_SW3	0xc0000000U	/* 3 width character */