Hi Ted,

Ingo Schwarze wrote on Sun, Nov 15, 2015 at 06:56:37PM +0100:
> Ted Unangst wrote on Sat, Nov 14, 2015 at 01:22:33PM -0500:

>> One other program which I think will provide some insight is cut.
>> It cares, perhaps even more than others, about bytes and chars
>> and bytes in the middle of chars.

> The following fully implements -c.  While here, reduce the ridiculous
> LC_ALL to what's actually needed, LC_CTYPE.

And here is a complete implementation of -c, -n, and of -d with a
multibyte delimiter character.

> This doesn't even need a utf8.c file.  The standard mblen(3) interface
> can easily be used directly.  Conversion to wchar_t and back is not
> required.  I'm not worrying about the internal state of mblen(3).
> We are not going to support state-dependent encodings, anyway.

All of that is still true.

None of the algorithms currently in use change.
The churn results from the fact that supporting multibyte delimiters
requires switching from hand-crafted parsing to the <string.h>
functions, in particular strstr(3).  That said, the code in f_cut()
becomes shorter and simpler even though functionality is added.

The algorithm for -nb is so similar to the one for -c that both
can easily use the same function, c_cut(), which needs to inspect
nflag in just one place.


The insight that Ted asked for is that

 * in contrast to the other programs (ls, rs, ul), cut(1) isn't
   merely doing the same things with multibyte characters that it
   used to do with single byte characters; it grows substantial
   new, user-visible functionality (-c, -nb, -d multibyte);

 * and all the same, in contrast to the others, it doesn't need
   any custom mb*() function; it gets away with the standard
   mblen() interface.

I can commit this in three steps (implement -c, implement -n,
implement -d multibyte) - if anybody wants that, please say so;
otherwise I'm going to do only one commit.

OK?
  Ingo


Index: cut.1
===================================================================
RCS file: /cvs/src/usr.bin/cut/cut.1,v
retrieving revision 1.24
diff -u -p -r1.24 cut.1
--- cut.1       10 Jul 2014 14:11:56 -0000      1.24
+++ cut.1       23 Nov 2015 17:41:06 -0000
@@ -114,6 +114,8 @@ The selected fields are output,
 separated by the field delimiter character.
 .It Fl n
 Do not split multi-byte characters.
+A character is written to standard output if and only if the byte
+position holding its last byte is selected.
 .It Fl s
 Suppresses lines with no field delimiter characters.
 Unless specified, lines with no delimiters are passed through unmodified.
@@ -145,11 +147,19 @@ utility is compliant with the
 .St -p1003.1-2008
 specification.
 .Sh CAVEATS
-The current implementation does not support multi-byte characters.
-Consequently
+The definition of a character depends on the current character set
+.Xr locale 1 .
+If
+.Ev LC_CTYPE
+is set to
+.Qq C
+or
+.Qq POSIX ,
 .Fl c
 does the same as
 .Fl b ,
-and
 .Fl n
-has no effect.
+has no effect, and
+.Fl d
+uses the first byte of
+.Ar delim .
Index: cut.c
===================================================================
RCS file: /cvs/src/usr.bin/cut/cut.c,v
retrieving revision 1.22
diff -u -p -r1.22 cut.c
--- cut.c       3 Nov 2015 04:57:20 -0000       1.22
+++ cut.c       23 Nov 2015 17:41:06 -0000
@@ -33,6 +33,7 @@
  * SUCH DAMAGE.
  */
 
+#include <assert.h>
 #include <ctype.h>
 #include <err.h>
 #include <errno.h>
@@ -43,12 +44,17 @@
 #include <string.h>
 #include <unistd.h>
 
+char   dchar[5];
+int    dlen;
+
+int    bflag;
 int    cflag;
-char   dchar;
 int    dflag;
 int    fflag;
+int    nflag;
 int    sflag;
 
+void   b_cut(FILE *, char *);
 void   c_cut(FILE *, char *);
 void   f_cut(FILE *, char *);
 void   get_list(char *);
@@ -61,37 +67,43 @@ main(int argc, char *argv[])
        void (*fcn)(FILE *, char *);
        int ch, rval;
 
-       setlocale (LC_ALL, "");
+       setlocale(LC_CTYPE, "");
 
        if (pledge("stdio rpath", NULL) == -1)
                err(1, "pledge");
 
-       dchar = '\t';                   /* default delimiter is \t */
+       dchar[0] = '\t';                /* default delimiter */
+       dchar[1] = '\0';
+       dlen = 1;
 
-       /* Since we don't support multi-byte characters, the -c and -b 
-          options are equivalent, and the -n option is meaningless. */
        while ((ch = getopt(argc, argv, "b:c:d:f:sn")) != -1)
                switch(ch) {
                case 'b':
+                       get_list(optarg);
+                       bflag = 1;
+                       break;
                case 'c':
-                       fcn = c_cut;
                        get_list(optarg);
                        cflag = 1;
                        break;
                case 'd':
-                       dchar = *optarg;
+                       if ((dlen = mblen(optarg, MB_CUR_MAX)) == -1)
+                               usage();
+                       assert(dlen < sizeof(dchar));
+                       (void)memcpy(dchar, optarg, dlen);
+                       dchar[dlen] = '\0';
                        dflag = 1;
                        break;
                case 'f':
                        get_list(optarg);
-                       fcn = f_cut;
                        fflag = 1;
                        break;
+               case 'n':
+                       nflag = 1;
+                       break;
                case 's':
                        sflag = 1;
                        break;
-               case 'n':
-                       break;
                case '?':
                default:
                        usage();
@@ -99,12 +111,21 @@ main(int argc, char *argv[])
        argc -= optind;
        argv += optind;
 
-       if (fflag) {
-               if (cflag)
-                       usage();
-       } else if (!cflag || dflag || sflag)
+       if (bflag + cflag + fflag != 1 ||
+           (nflag && !bflag) ||
+           ((dflag || sflag) && !fflag))
                usage();
 
+       if (MB_CUR_MAX == 1) {
+               nflag = 0;
+               if (cflag) {
+                       bflag = 1;
+                       cflag = 0;
+               }
+       }
+
+       fcn = fflag ? f_cut : (cflag || nflag) ? c_cut : b_cut;
+
        rval = 0;
        if (*argv)
                for (; *argv; ++argv) {
@@ -192,7 +213,7 @@ get_list(char *list)
 
 /* ARGSUSED */
 void
-c_cut(FILE *fp, char *fname)
+b_cut(FILE *fp, char *fname)
 {
        int ch, col;
        char *pos;
@@ -220,65 +241,82 @@ c_cut(FILE *fp, char *fname)
 }
 
 void
-f_cut(FILE *fp, char *fname)
+c_cut(FILE *fp, char *fname)
 {
-       int ch, field, isdelim;
-       char *pos, *p, sep;
-       int output;
-       size_t len;
-       char *lbuf, *tbuf;
+       static char     *line = NULL;
+       static size_t    linesz = 0;
+       ssize_t          linelen;
+       char            *cp, *pos, *maxpos;
+       int              len;
+
+       while ((linelen = getline(&line, &linesz, fp)) != -1) {
+               if (line[linelen - 1] == '\n')
+                       line[linelen - 1] = '\0';
 
-       for (sep = dchar, tbuf = NULL; (lbuf = fgetln(fp, &len));) {
-               output = 0;
-               if (lbuf[len - 1] != '\n') {
-                       /* no newline at the end of the last line so add one */
-                       if ((tbuf = malloc(len + 1)) == NULL)
-                               err(1, NULL);
-                       memcpy(tbuf, lbuf, len);
-                       tbuf[len] = '\n';
-                       lbuf = tbuf;
-               }
-               for (isdelim = 0, p = lbuf;; ++p) {
-                       ch = *p;
-                       /* this should work if newline is delimiter */
-                       if (ch == sep)
-                               isdelim = 1;
-                       if (ch == '\n') {
-                               if (!isdelim && !sflag)
-                                       (void)fwrite(lbuf, len, 1, stdout);
-                               break;
-                       }
+               cp = line;
+               pos = positions + 1;
+               maxpos = pos + maxval;
+               while(pos < maxpos && *cp != '\0') {
+                       len = mblen(cp, MB_CUR_MAX);
+                       if (len == -1)
+                               len = 1;
+                       pos += nflag ? len : 1;
+                       if (pos[-1] == '\0')
+                               cp += len;
+                       else
+                               while (len--)
+                                       putchar(*cp++);
                }
-               if (!isdelim)
+               if (autostop)
+                       puts(cp);
+               else
+                       putchar('\n');
+       }
+}
+
+void
+f_cut(FILE *fp, char *fname)
+{
+       static char     *line = NULL;
+       static size_t    linesz = 0;
+       ssize_t          linelen;
+       char            *sp, *ep, *pos, *maxpos;
+       int              output;
+
+       while ((linelen = getline(&line, &linesz, fp)) != -1) {
+               if (line[linelen - 1] == '\n')
+                       line[linelen - 1] = '\0';
+
+               if ((ep = strstr(line, dchar)) == NULL) {
+                       if (!sflag)
+                               puts(line);
                        continue;
+               }
 
                pos = positions + 1;
-               for (field = maxval, p = lbuf; field; --field, ++pos) {
-                       if (*pos) {
-                               if (output++)
-                                       (void)putchar(sep);
-                               while ((ch = *p++) != '\n' && ch != sep)
-                                       (void)putchar(ch);
-                       } else
-                               while ((ch = *p++) != '\n' && ch != sep)
-                                       ;
-                       if (ch == '\n')
-                               break;
-               }
-               if (ch != '\n') {
-                       if (autostop) {
+               maxpos = pos + maxval;
+               output = 0;
+               sp = line;
+               for (;;) {
+                       if (*pos++) {
                                if (output)
-                                       (void)putchar(sep);
-                               for (; (ch = *p) != '\n'; ++p)
-                                       (void)putchar(ch);
+                                       fputs(dchar, stdout);
+                               while (sp < ep)
+                                       putchar(*sp++);
+                               output = 1;
                        } else
-                               for (; (ch = *p) != '\n'; ++p)
-                                       ;
-               }
-               (void)putchar('\n');
+                               sp = ep;
+                       if (*sp == '\0' || pos == maxpos)
+                               break;
+                       sp += dlen;
+                       if ((ep = strstr(sp, dchar)) == NULL)
+                               ep = strchr(sp, '\0');
+               }
+               if (autostop)
+                       puts(sp);
+               else
+                       putchar('\n');
        }
-       if (tbuf)
-               free(tbuf);
 }
 
 void

Reply via email to