Hi, On 19/05/17 03:42, Anton Lindqvist wrote: > Hi, > I did submit this problem[1] earlier but with an incomplete analysis and > fix. Here's a second attempt. > > This only occurs when running ksh with emacs mode under tmux. How to > reproduce: > > 1. Run ksh under tmux. > > 2. Input the following characters, without spaces: > > a (any character) ^B (backward-char) ö (any UTF-8 character) > > 3. At this point, the prompt gets overwritten. > > Since ksh reads a single byte of input, it will display a partial UTF-8 > character before the whole character has been read. This is especially > troublesome when the cursor is not placed at the end of the line. In the > scenario above, after reading the first byte of 'ö' the following > sequence will be displayed: > > 0xc3 0x61 0x08 > > That is the first byte of 'ö' (0xc3), 'a' (0x61), '\b' (0x08). tmux > does the right thing here: since 0xc3 is a valid UTF-8 start byte it > expects it to be followed by a UTF-8 continuation byte which is not the > case. The first two bytes (0xc3, 0x61) are discarded and the parser is > reset to its initial state causing the backspace to be accepted and the > first character in the prompt to be overwritten. > > After the second byte of 'ö' (0xb6) is read by ksh, the following > sequence will be displayed: > > 0x08 0xc3 0xb6 0x61 0x08 > > That is '\b' (0x08), 'ö' (0xc3, 0xb6), 'a' (0x61), '\b' (0x08). Since > ksh assumes the cursor is correctly positioned it displays a leading > backspace in order to move past the first character. This is however > not true, causing another character in the prompt to be overwritten. > > Below is a diff that makes sure to read a whole UTF-8 character in > x_emacs() prior to doing another iteration of the main-loop which solves > the problem. It does not validate UTF-8 input but instead assumes every > such character is valid. > > Comments and feedback are much appreciated. 
> > [1] http://marc.info/?l=openbsd-misc&m=148509346310901&w=2 > > Index: emacs.c > =================================================================== > RCS file: /cvs/src/bin/ksh/emacs.c,v > retrieving revision 1.67 > diff -u -p -r1.67 emacs.c > --- emacs.c 12 May 2017 14:37:52 -0000 1.67 > +++ emacs.c 14 May 2017 08:21:26 -0000 > @@ -98,6 +98,7 @@ static int x_col; > static int x_displen; > static int x_arg; /* general purpose arg */ > static int x_arg_defaulted;/* x_arg not explicitly set; defaulted to 1 */ > +static int x_getc_again; > > static int xlp_valid; > /* end from 4.9 edit.h } */ > @@ -142,6 +143,7 @@ static int x_fold_case(int); > static char *x_lastcp(void); > static void do_complete(int, Comp_type); > static int isu8cont(unsigned char); > +static int u8len(unsigned char); > > /* proto's for keybindings */ > static int x_abort(int); > @@ -272,6 +274,21 @@ isu8cont(unsigned char c) > return (c & (0x80 | 0x40)) == 0x80; > } > > +static int > +u8len(unsigned char c) > +{ > + switch (c & 0xF0) { > + case 0xF0: > + return 4; > + case 0xE0: > + return 3; > + case 0xC0: > + return 2; > + default: > + return 1; > + } > +} > +
This is wrong: most codepoints in the range U+0080-U+07FF (the ones at or above U+0400, whose lead byte is 0xD0-0xDF) would be interpreted as being 1 byte long instead of 2, because for those lead bytes `c & 0xF0` yields 0xD0, which matches no case and falls to the default. > int > x_emacs(char *buf, size_t len) > { > @@ -318,10 +335,12 @@ x_emacs(char *buf, size_t len) > x_last_command = NULL; > while (1) { > x_flush(); > - if ((c = x_e_getc()) < 0) > - return 0; > + do { > + if ((c = x_e_getc()) < 0) > + return 0; > > - line[at++] = c; > + line[at++] = c; > + } while (x_getc_again > 0); > line[at] = '\0'; > > if (x_arg == -1) { > @@ -364,7 +383,10 @@ x_emacs(char *buf, size_t len) > } else { > if (submatch) > continue; > - if (at == 1) > + if (at > 1) { > + x_ins(line); > + ret = KSTD; > + } else if (at == 1) > ret = x_insert(c); > else > ret = x_error(c); /* not matched meta sequence > */ > @@ -1887,8 +1909,12 @@ x_e_getc(void) > macro_args = NULL; > c = x_getc(); > } > - } else > + } else { > c = x_getc(); > + if (x_getc_again == 0) > + x_getc_again = u8len(c); > + x_getc_again--; > + } > > return c; > } >