Re: start cleaning up UTF-8 processing in less(1)

Ingo Schwarze Mon, 25 Feb 2019 03:40:34 -0800

Hi,

Nicholas Marriott wrote on Mon, Feb 25, 2019 at 10:14:03AM +0000:
> Ingo Schwarze wrote:


>> During the upcoming cleanup steps, let use retain full support for
>> the first (ESC-[) syntax and lets us completely delete support for
>> the second and third CSI syntaxes (single-byte CSI and UTF-8
>> single-character two-byte CSI).
>> 
>> If you are OK with that plan, i'll send diffs implementing that.

> Very little, if anything at all, uses 8-bit or UTF-8 CSI,
> getting rid of it is a good idea.

Thank you for all your feedback.

So here is a patch doing that.

The central part is dropping the definitions related to the 8-bit
and the UTF-8 CSI from less.h, keeping only the definitions needed
for ESC-[.  In the future, these forms of the CSI will be treated
exactly like other control characters: e.g., encoded for display.

The rest of the diff mechanically resolves the IS_CSI_START() macro,
replacing it by a simple "== ESC" test.  Casting is needed at none
of the places: the types involved are simply:

        '['     int     (value 0x5b < 0x7f)
        037     int     (value 0x1f < 0x7f)  -- apply & -->
        ESC     int     (value 0x1b < 0x7f)

So there is no danger in comparing ESC using == or != with any of
the types char, unsigned char (which are covered by the int range
during promotion) or LWCHAR == unsigned long, in which case the
(positve) constant ESC gets sign extended, which is value-preserving
for a positive integer.

The effects of the various chunks are, in detail:

 * cvt.c, cvt_text():
   8-bit and UTF-8 CSI are no longer recognized as ANSI escape
   sequences but copied just like any other control characters.
 * filename.c, bin_file():
   8-bit and UTF-8 CSI are no longer skipped but counted as
   binary characters just like any other binary characters.
   (change in behaviour confirmed by testing)
 * line.c, pshift():
   No longer treat 8-bit and UTF-8 CSI sequences specially
   when shifting a line left.
   (shifting left and right was tested)
 * line.c, in_ansi_esc_seq():
   No longer recognize 8-bit and UTF-8 CSI sequences.
   This allows simplifying the loop to iterate over bytes
   rather than over characters, getting rif of one LWCHAR
   variable and one call to the pesky function step_char(-1}.
   (it was tested that ESC-[ still works)
 * line.c, store_char():
   No longer recognize 8-bit and UTF-8 CSI sequences.
   Again, minus one LWCHAR and minus one step_char(-1).
 * line.c, do_append():
   No longer pass 8-bit and UTF-8 CSI unencoded, encode them
   like any other control character.

This survived light testing.
All code touched will be touched again in the upcoming cleanup
because all these functions still contain step_char() or get_wchar(),
but this step simplifies many of the upcoming cleanup steps.
Already minus six lines of code and minus seven macro calls
in this patch alone.

OK?
  Ingo


Index: cvt.c
===================================================================
RCS file: /cvs/src/usr.bin/less/cvt.c,v
retrieving revision 1.8
diff -u -p -r1.8 cvt.c
--- cvt.c       17 Sep 2016 15:06:41 -0000      1.8
+++ cvt.c       25 Feb 2019 11:25:38 -0000
@@ -77,7 +77,7 @@ cvt_text(char *odst, char *osrc, int *ch
                                dst--;
                        } while (dst > odst &&
                            !IS_ASCII_OCTET(*dst) && !IS_UTF8_LEAD(*dst));
-               } else if ((ops & CVT_ANSI) && IS_CSI_START(ch)) {
+               } else if ((ops & CVT_ANSI) && ch == ESC) {
                        /* Skip to end of ANSI escape sequence. */
                        src++;  /* skip the CSI start char */
                        while (src < src_end)
Index: filename.c
===================================================================
RCS file: /cvs/src/usr.bin/less/filename.c,v
retrieving revision 1.26
diff -u -p -r1.26 filename.c
--- filename.c  29 Oct 2017 17:10:55 -0000      1.26
+++ filename.c  25 Feb 2019 11:25:38 -0000
@@ -348,7 +348,7 @@ bin_file(int f)
        pend = &data[n];
        for (p = data; p < pend; ) {
                LWCHAR c = step_char(&p, +1, pend);
-               if (ctldisp == OPT_ONPLUS && IS_CSI_START(c)) {
+               if (ctldisp == OPT_ONPLUS && c == ESC) {
                        do {
                                c = step_char(&p, +1, pend);
                        } while (p < pend && is_ansi_middle(c));
Index: less.h
===================================================================
RCS file: /cvs/src/usr.bin/less/less.h,v
retrieving revision 1.28
diff -u -p -r1.28 less.h
--- less.h      30 Dec 2018 23:09:58 -0000      1.28
+++ less.h      25 Feb 2019 11:25:38 -0000
@@ -38,8 +38,6 @@
 #define        IS_SPACE(c)     isspace((unsigned char)(c))
 #define        IS_DIGIT(c)     isdigit((unsigned char)(c))
 
-#define        IS_CSI_START(c) (((LWCHAR)(c)) == ESC || (((LWCHAR)(c)) == CSI))
-
 #ifndef TRUE
 #define        TRUE            1
 #endif
@@ -151,9 +149,7 @@ struct textlist {
 #define        AT_INDET        (1 << 7)  /* Indeterminate: either bold or 
underline */
 
 #define        CONTROL(c)      ((c)&037)
-
 #define        ESC             CONTROL('[')
-#define        CSI             ((unsigned char)'\233')
 
 #define        S_INTERRUPT     01
 #define        S_STOP          02
Index: line.c
===================================================================
RCS file: /cvs/src/usr.bin/less/line.c,v
retrieving revision 1.23
diff -u -p -r1.23 line.c
--- line.c      24 Feb 2019 04:54:36 -0000      1.23
+++ line.c      25 Feb 2019 11:25:39 -0000
@@ -230,7 +230,7 @@ pshift(int shift)
         */
        while (shifted <= shift && from < curr) {
                c = linebuf[from];
-               if (ctldisp == OPT_ONPLUS && IS_CSI_START(c)) {
+               if (ctldisp == OPT_ONPLUS && c == ESC) {
                        /* Keep cumulative effect.  */
                        linebuf[to] = c;
                        attr[to++] = attr[from++];
@@ -469,11 +469,10 @@ in_ansi_esc_seq(void)
         * Search backwards for either an ESC (which means we ARE in a seq);
         * or an end char (which means we're NOT in a seq).
         */
-       for (p = &linebuf[curr]; p > linebuf; ) {
-               LWCHAR ch = step_char(&p, -1, linebuf);
-               if (IS_CSI_START(ch))
+       for (p = linebuf + curr - 1; p >= linebuf; p--) {
+               if (*p == ESC)
                        return (1);
-               if (!is_ansi_middle(ch))
+               if (!is_ansi_middle(*p))
                        return (0);
        }
        return (0);
@@ -533,17 +532,14 @@ store_char(LWCHAR ch, char a, char *rep,
        if (ctldisp == OPT_ONPLUS && in_ansi_esc_seq()) {
                if (!is_ansi_end(ch) && !is_ansi_middle(ch)) {
                        /* Remove whole unrecognized sequence.  */
-                       char *p = &linebuf[curr];
-                       LWCHAR bch;
                        do {
-                               bch = step_char(&p, -1, linebuf);
-                       } while (p > linebuf && !IS_CSI_START(bch));
-                       curr = p - linebuf;
+                               curr--;
+                       } while (curr > 0 && linebuf[curr] != ESC);
                        return (0);
                }
                a = AT_ANSI;    /* Will force re-AT_'ing around it.  */
                w = 0;
-       } else if (ctldisp == OPT_ONPLUS && IS_CSI_START(ch)) {
+       } else if (ctldisp == OPT_ONPLUS && ch == ESC) {
                a = AT_ANSI;    /* Will force re-AT_'ing around it.  */
                w = 0;
        } else {
@@ -851,7 +847,7 @@ do_append(LWCHAR ch, char *rep, off_t po
        } else if ((!utf_mode || is_ascii_char(ch)) && control_char((char)ch)) {
 do_control_char:
                if (ctldisp == OPT_ON ||
-                   (ctldisp == OPT_ONPLUS && IS_CSI_START(ch))) {
+                   (ctldisp == OPT_ONPLUS && ch == ESC)) {
                        /*
                         * Output as a normal character.
                         */

Re: start cleaning up UTF-8 processing in less(1)

Reply via email to