Hi Ted,

Ingo Schwarze wrote on Sun, Nov 15, 2015 at 06:56:37PM +0100:
> Ted Unangst wrote on Sat, Nov 14, 2015 at 01:22:33PM -0500:
>> One other program which I think will provide some insight is cut.
>> It cares, perhaps even more than others, about bytes and chars
>> and bytes in the middle of chars.

> The following fully implements -c.  While here, reduce the ridiculous
> LC_ALL to what's actually needed, LC_CTYPE.

And here is a complete implementation of -c, -n, and of -d with
a multibyte delimiter character.

> This doesn't even need a utf8.c file.  The standard mblen(3) interface
> can easily be used directly.  Conversion to wchar_t and back is not
> required.  I'm not worrying about the internal state of mblen(3).
> We are not going to support state-dependent encodings, anyway.

All of that is still true.

None of the algorithms that are currently used change.  The churn
results from the fact that using multibyte delimiters requires
switching from hand-crafted parsing to use of <string.h>, in
particular strstr(3).  That said, the code in f_cut() becomes
shorter and simpler even though functionality is added.

The algorithm for -nb is so similar to the one for -c that both
can easily use the same function c_cut(), inspecting nflag in
just one place.

The insight that Ted asked for is that

 * in contrast to the other programs (ls, rs, ul), cut(1) isn't
   merely doing the same things with multibyte characters that
   it used to do with single-byte characters; it grows substantial
   new, user-visible functionality (-c, -nb, -d multibyte);

 * and all the same, in contrast to the others, it doesn't need
   any custom mb*() function, it gets away with the standard
   mblen() interface.

I can commit this in three steps (implement -c, implement -n,
implement -d multibyte) - if anybody wants that, please say so;
otherwise I'm going to do only one commit.

OK?
Ingo Index: cut.1 =================================================================== RCS file: /cvs/src/usr.bin/cut/cut.1,v retrieving revision 1.24 diff -u -p -r1.24 cut.1 --- cut.1 10 Jul 2014 14:11:56 -0000 1.24 +++ cut.1 23 Nov 2015 17:41:06 -0000 @@ -114,6 +114,8 @@ The selected fields are output, separated by the field delimiter character. .It Fl n Do not split multi-byte characters. +A character is written to standard output if and only if the byte +position holding its last byte is selected. .It Fl s Suppresses lines with no field delimiter characters. Unless specified, lines with no delimiters are passed through unmodified. @@ -145,11 +147,19 @@ utility is compliant with the .St -p1003.1-2008 specification. .Sh CAVEATS -The current implementation does not support multi-byte characters. -Consequently +The definition of a character depends on the current character set +.Xr locale 1 . +If +.Ev LC_CTYPE +is set to +.Qq C +or +.Qq POSIX , .Fl c does the same as .Fl b , -and .Fl n -has no effect. +has no effect, and +.Fl d +uses the first byte of +.Ar delim . Index: cut.c =================================================================== RCS file: /cvs/src/usr.bin/cut/cut.c,v retrieving revision 1.22 diff -u -p -r1.22 cut.c --- cut.c 3 Nov 2015 04:57:20 -0000 1.22 +++ cut.c 23 Nov 2015 17:41:06 -0000 @@ -33,6 +33,7 @@ * SUCH DAMAGE. 
*/ +#include <assert.h> #include <ctype.h> #include <err.h> #include <errno.h> @@ -43,12 +44,17 @@ #include <string.h> #include <unistd.h> +char dchar[5]; +int dlen; + +int bflag; int cflag; -char dchar; int dflag; int fflag; +int nflag; int sflag; +void b_cut(FILE *, char *); void c_cut(FILE *, char *); void f_cut(FILE *, char *); void get_list(char *); @@ -61,37 +67,43 @@ main(int argc, char *argv[]) void (*fcn)(FILE *, char *); int ch, rval; - setlocale (LC_ALL, ""); + setlocale(LC_CTYPE, ""); if (pledge("stdio rpath", NULL) == -1) err(1, "pledge"); - dchar = '\t'; /* default delimiter is \t */ + dchar[0] = '\t'; /* default delimiter */ + dchar[1] = '\0'; + dlen = 1; - /* Since we don't support multi-byte characters, the -c and -b - options are equivalent, and the -n option is meaningless. */ while ((ch = getopt(argc, argv, "b:c:d:f:sn")) != -1) switch(ch) { case 'b': + get_list(optarg); + bflag = 1; + break; case 'c': - fcn = c_cut; get_list(optarg); cflag = 1; break; case 'd': - dchar = *optarg; + if ((dlen = mblen(optarg, MB_CUR_MAX)) == -1) + usage(); + assert(dlen < sizeof(dchar)); + (void)memcpy(dchar, optarg, dlen); + dchar[dlen] = '\0'; dflag = 1; break; case 'f': get_list(optarg); - fcn = f_cut; fflag = 1; break; + case 'n': + nflag = 1; + break; case 's': sflag = 1; break; - case 'n': - break; case '?': default: usage(); @@ -99,12 +111,21 @@ main(int argc, char *argv[]) argc -= optind; argv += optind; - if (fflag) { - if (cflag) - usage(); - } else if (!cflag || dflag || sflag) + if (bflag + cflag + fflag != 1 || + (nflag && !bflag) || + ((dflag || sflag) && !fflag)) usage(); + if (MB_CUR_MAX == 1) { + nflag = 0; + if (cflag) { + bflag = 1; + cflag = 0; + } + } + + fcn = fflag ? f_cut : (cflag || nflag) ? 
c_cut : b_cut; + rval = 0; if (*argv) for (; *argv; ++argv) { @@ -192,7 +213,7 @@ get_list(char *list) /* ARGSUSED */ void -c_cut(FILE *fp, char *fname) +b_cut(FILE *fp, char *fname) { int ch, col; char *pos; @@ -220,65 +241,82 @@ c_cut(FILE *fp, char *fname) } void -f_cut(FILE *fp, char *fname) +c_cut(FILE *fp, char *fname) { - int ch, field, isdelim; - char *pos, *p, sep; - int output; - size_t len; - char *lbuf, *tbuf; + static char *line = NULL; + static size_t linesz = 0; + ssize_t linelen; + char *cp, *pos, *maxpos; + int len; + + while ((linelen = getline(&line, &linesz, fp)) != -1) { + if (line[linelen - 1] == '\n') + line[linelen - 1] = '\0'; - for (sep = dchar, tbuf = NULL; (lbuf = fgetln(fp, &len));) { - output = 0; - if (lbuf[len - 1] != '\n') { - /* no newline at the end of the last line so add one */ - if ((tbuf = malloc(len + 1)) == NULL) - err(1, NULL); - memcpy(tbuf, lbuf, len); - tbuf[len] = '\n'; - lbuf = tbuf; - } - for (isdelim = 0, p = lbuf;; ++p) { - ch = *p; - /* this should work if newline is delimiter */ - if (ch == sep) - isdelim = 1; - if (ch == '\n') { - if (!isdelim && !sflag) - (void)fwrite(lbuf, len, 1, stdout); - break; - } + cp = line; + pos = positions + 1; + maxpos = pos + maxval; + while(pos < maxpos && *cp != '\0') { + len = mblen(cp, MB_CUR_MAX); + if (len == -1) + len = 1; + pos += nflag ? 
len : 1; + if (pos[-1] == '\0') + cp += len; + else + while (len--) + putchar(*cp++); } - if (!isdelim) + if (autostop) + puts(cp); + else + putchar('\n'); + } +} + +void +f_cut(FILE *fp, char *fname) +{ + static char *line = NULL; + static size_t linesz = 0; + ssize_t linelen; + char *sp, *ep, *pos, *maxpos; + int output; + + while ((linelen = getline(&line, &linesz, fp)) != -1) { + if (line[linelen - 1] == '\n') + line[linelen - 1] = '\0'; + + if ((ep = strstr(line, dchar)) == NULL) { + if (!sflag) + puts(line); continue; + } pos = positions + 1; - for (field = maxval, p = lbuf; field; --field, ++pos) { - if (*pos) { - if (output++) - (void)putchar(sep); - while ((ch = *p++) != '\n' && ch != sep) - (void)putchar(ch); - } else - while ((ch = *p++) != '\n' && ch != sep) - ; - if (ch == '\n') - break; - } - if (ch != '\n') { - if (autostop) { + maxpos = pos + maxval; + output = 0; + sp = line; + for (;;) { + if (*pos++) { if (output) - (void)putchar(sep); - for (; (ch = *p) != '\n'; ++p) - (void)putchar(ch); + fputs(dchar, stdout); + while (sp < ep) + putchar(*sp++); + output = 1; } else - for (; (ch = *p) != '\n'; ++p) - ; - } - (void)putchar('\n'); + sp = ep; + if (*sp == '\0' || pos == maxpos) + break; + sp += dlen; + if ((ep = strstr(sp, dchar)) == NULL) + ep = strchr(sp, '\0'); + } + if (autostop) + puts(sp); + else + putchar('\n'); } - if (tbuf) - free(tbuf); } void