Hi Ted,

Ted Unangst wrote on Sat, Nov 14, 2015 at 01:22:33PM -0500:
> One other program which I think will provide some insight is cut.
> It cares, perhaps even more than others, about bytes and chars
> and bytes in the middle of chars.

The following fully implements -c.

While here, reduce the ridiculous LC_ALL to what's actually needed,
LC_CTYPE.

This doesn't even need a utf8.c file.  The standard mblen(3) interface
can easily be used directly.  Conversion to wchar_t and back is not
required.

I'm not worrying about the internal state of mblen(3).  We are not
going to support state-dependent encodings, anyway.

The implementation of -n will be very similar and no more difficult.

Fixing -d won't be difficult either.  It requires changing dchar from
char to char *, using mblen(3) when parsing the -d argument, and
using strstr(3) in f_cut() instead of the hand-rolled version of
strchr(3) currently in there.

OK?
  Ingo

P.S.
Of course, I intend to modernize the other functions by using
getline() there, too.

Index: cut.c =================================================================== RCS file: /cvs/src/usr.bin/cut/cut.c,v retrieving revision 1.22 diff -u -p -r1.22 cut.c --- cut.c 3 Nov 2015 04:57:20 -0000 1.22 +++ cut.c 15 Nov 2015 17:35:19 -0000 @@ -49,6 +49,7 @@ int dflag; int fflag; int sflag; +void b_cut(FILE *, char *); void c_cut(FILE *, char *); void f_cut(FILE *, char *); void get_list(char *); @@ -61,20 +62,18 @@ main(int argc, char *argv[]) void (*fcn)(FILE *, char *); int ch, rval; - setlocale (LC_ALL, ""); + setlocale(LC_CTYPE, ""); if (pledge("stdio rpath", NULL) == -1) err(1, "pledge"); dchar = '\t'; /* default delimiter is \t */ - /* Since we don't support multi-byte characters, the -c and -b - options are equivalent, and the -n option is meaningless. */ while ((ch = getopt(argc, argv, "b:c:d:f:sn")) != -1) switch(ch) { case 'b': case 'c': - fcn = c_cut; + fcn = ch == 'c' && MB_CUR_MAX > 1 ? 
c_cut : b_cut; get_list(optarg); cflag = 1; break; @@ -91,6 +90,7 @@ main(int argc, char *argv[]) sflag = 1; break; case 'n': + /* XXX Not yet implemented. */ break; case '?': default: @@ -192,7 +192,7 @@ get_list(char *list) /* ARGSUSED */ void -c_cut(FILE *fp, char *fname) +b_cut(FILE *fp, char *fname) { int ch, col; char *pos; @@ -216,6 +216,42 @@ c_cut(FILE *fp, char *fname) ; } (void)putchar('\n'); + } +} + +void +c_cut(FILE *fp, char *fname) +{ + static char *line = NULL; + static size_t linesz = 0; + ssize_t linelen; + char *cp, *pos, *maxpos; + int len; + + mblen(NULL, MB_CUR_MAX); + while ((linelen = getline(&line, &linesz, fp)) != -1) { + if (line[linelen - 1] == '\n') + line[linelen - 1] = '\0'; + + cp = line; + pos = positions + 1; + maxpos = pos + maxval; + while(pos < maxpos && *cp != '\0') { + len = mblen(cp, MB_CUR_MAX); + if (len == -1) { + mblen(NULL, MB_CUR_MAX); + len = 1; + } + if (*pos++ == '\0') + cp += len; + else + while (len--) + putchar(*cp++); + } + if (autostop) + puts(cp); + else + putchar('\n'); } } Index: cut.1 =================================================================== RCS file: /cvs/src/usr.bin/cut/cut.1,v retrieving revision 1.24 diff -u -p -r1.24 cut.1 --- cut.1 10 Jul 2014 14:11:56 -0000 1.24 +++ cut.1 15 Nov 2015 17:35:19 -0000 @@ -145,11 +145,28 @@ utility is compliant with the .St -p1003.1-2008 specification. .Sh CAVEATS -The current implementation does not support multi-byte characters. -Consequently +The definition of a character depends on the current character set +.Xr locale 1 . +If +.Ev LC_CTYPE +is set to +.Qq C +or +.Qq POSIX , .Fl c does the same as .Fl b , -and .Fl n -has no effect. +has no effect, and +.Fl d +uses the first byte of +.Ar delim . +.Sh BUGS +The +.Fl n +option is not yet implemented and has no effect. +The +.Fl d +option currently always uses the first byte of +.Ar delim +rather than the first character.