Hi Ted,

Ted Unangst wrote on Sat, Nov 14, 2015 at 01:22:33PM -0500:

> One other program which I think will provide some insight is cut.
> It cares, perhaps even more than others, about bytes and chars
> and bytes in the middle of chars.

The following fully implements -c.  While here, reduce the ridiculous
LC_ALL to what's actually needed, LC_CTYPE.

This doesn't even need a utf8.c file.  The standard mblen(3) interface
can easily be used directly.  Conversion to wchar_t and back is not
required.  I'm not worrying about the internal state of mblen(3).
We are not going to support state-dependent encodings, anyway.

The implementation of -n will not be more difficult and will be very similar.

Fixing -d won't be difficult either.  It requires changing dchar
from char to char *, using mblen(3) when parsing the -d argument,
and using strstr(3) in f_cut() instead of the hand-rolled version
of strchr(3) currently in there.

OK?
  Ingo

P.S.
Of course, I intend to modernize the other functions by using
getline() there, too.


Index: cut.c
===================================================================
RCS file: /cvs/src/usr.bin/cut/cut.c,v
retrieving revision 1.22
diff -u -p -r1.22 cut.c
--- cut.c       3 Nov 2015 04:57:20 -0000       1.22
+++ cut.c       15 Nov 2015 17:35:19 -0000
@@ -49,6 +49,7 @@ int   dflag;
 int    fflag;
 int    sflag;
 
+void   b_cut(FILE *, char *);
 void   c_cut(FILE *, char *);
 void   f_cut(FILE *, char *);
 void   get_list(char *);
@@ -61,20 +62,18 @@ main(int argc, char *argv[])
        void (*fcn)(FILE *, char *);
        int ch, rval;
 
-       setlocale (LC_ALL, "");
+       setlocale(LC_CTYPE, "");
 
        if (pledge("stdio rpath", NULL) == -1)
                err(1, "pledge");
 
        dchar = '\t';                   /* default delimiter is \t */
 
-       /* Since we don't support multi-byte characters, the -c and -b 
-          options are equivalent, and the -n option is meaningless. */
        while ((ch = getopt(argc, argv, "b:c:d:f:sn")) != -1)
                switch(ch) {
                case 'b':
                case 'c':
-                       fcn = c_cut;
+                       fcn = ch == 'c' && MB_CUR_MAX > 1 ? c_cut : b_cut;
                        get_list(optarg);
                        cflag = 1;
                        break;
@@ -91,6 +90,7 @@ main(int argc, char *argv[])
                        sflag = 1;
                        break;
                case 'n':
+                       /* XXX  Not yet implemented. */
                        break;
                case '?':
                default:
@@ -192,7 +192,7 @@ get_list(char *list)
 
 /* ARGSUSED */
 void
-c_cut(FILE *fp, char *fname)
+b_cut(FILE *fp, char *fname)
 {
        int ch, col;
        char *pos;
@@ -216,6 +216,42 @@ c_cut(FILE *fp, char *fname)
                                        ;
                }
                (void)putchar('\n');
+       }
+}
+
+void
+c_cut(FILE *fp, char *fname)
+{
+       static char     *line = NULL;
+       static size_t    linesz = 0;
+       ssize_t          linelen;
+       char            *cp, *pos, *maxpos;
+       int              len;
+
+       mblen(NULL, MB_CUR_MAX);
+       while ((linelen = getline(&line, &linesz, fp)) != -1) {
+               if (line[linelen - 1] == '\n')
+                       line[linelen - 1] = '\0';
+
+               cp = line;
+               pos = positions + 1;
+               maxpos = pos + maxval;
+               while(pos < maxpos && *cp != '\0') {
+                       len = mblen(cp, MB_CUR_MAX);
+                       if (len == -1) {
+                               mblen(NULL, MB_CUR_MAX);
+                               len = 1;
+                       }
+                       if (*pos++ == '\0')
+                               cp += len;
+                       else
+                               while (len--)
+                                       putchar(*cp++);
+               }
+               if (autostop)
+                       puts(cp);
+               else
+                       putchar('\n');
        }
 }
 
Index: cut.1
===================================================================
RCS file: /cvs/src/usr.bin/cut/cut.1,v
retrieving revision 1.24
diff -u -p -r1.24 cut.1
--- cut.1       10 Jul 2014 14:11:56 -0000      1.24
+++ cut.1       15 Nov 2015 17:35:19 -0000
@@ -145,11 +145,28 @@ utility is compliant with the
 .St -p1003.1-2008
 specification.
 .Sh CAVEATS
-The current implementation does not support multi-byte characters.
-Consequently
+The definition of a character depends on the current character set
+.Xr locale 1 .
+If
+.Ev LC_CTYPE
+is set to
+.Qq C
+or
+.Qq POSIX ,
 .Fl c
 does the same as
 .Fl b ,
-and
 .Fl n
-has no effect.
+has no effect, and
+.Fl d
+uses the first byte of
+.Ar delim .
+.Sh BUGS
+The
+.Fl n
+option is not yet implemented and has no effect.
+The
+.Fl d
+option currently always uses the first byte of
+.Ar delim
+rather than the first character.

Reply via email to