Bug#1039503: wrongly splits up extended grapheme clusters (like certain emoji)

2023-06-27 Thread Steinar H. Gunderson
On Tue, Jun 27, 2023 at 12:49:53AM +0200, Steinar H. Gunderson wrote:
> The attached patch seems to get us halfway there; screen now combines all of
> them correctly into one cluster. However, it's still split for whatever
> reason; only if I redraw (C-a l) the flag shows up, and the text breaking is
> wrong, so I assume there's at least something wrong with the width
> calculation, and possibly lots of other things I've not thought of. :-)

Version 2 of the patch; this seems to actually work. Only lightly tested,
though, and there are enough subtleties in the code that it probably needs
review from someone who knows screen from before.

/* Steinar */
-- 
Homepage: https://www.sesse.net/
diff -ur /home/sesse/nmu/orig/screen-4.9.0/ansi.c ../ansi.c
--- /home/sesse/nmu/orig/screen-4.9.0/ansi.c	2023-06-27 22:53:44.0 +0200
+++ ../ansi.c	2023-06-27 23:00:26.667216364 +0200
@@ -694,9 +694,9 @@
 		}
 		  curr->w_rend.font = 0;
 		}
-	  if (curr->w_encoding == UTF8 && c >= 0x0300 && utf8_iscomb(c))
+	  if (curr->w_encoding == UTF8 && c >= 0x0300)
 		{
-		  int ox, oy;
+		  int ox, oy, c_prev;
 		  struct mchar omc;
 
 		  ox = curr->w_x - 1;
@@ -718,15 +718,35 @@
 			  omc.mbcs = 0xff;
 			}
 		}
-		  if (ox >= 0)
-		{
-		  utf8_handle_comb(c, &omc);
-		  MFixLine(curr, oy, &omc);
-		  copy_mchar2mline(&omc, &curr->w_mlines[oy], ox);
-		  LPutChar(&curr->w_layer, &omc, ox, oy);
-		  LGotoPos(&curr->w_layer, curr->w_x, curr->w_y);
-		}
-		  break;
+  c_prev = omc.image | (omc.font << 8) | omc.fontx << 16;
+  if (!grapheme_cluster_break(c_prev, c))
+{
+		  if (ox >= 0)
+		{
+		  utf8_handle_comb(c, &omc);
+		  MFixLine(curr, oy, &omc);
+		  copy_mchar2mline(&omc, &curr->w_mlines[oy], ox);
+  if (!utf8_isdouble(c_prev) &&
+  utf8_isdouble(omc.image | (omc.font << 8) | omc.fontx << 16))
+{
+  /* A combining character switched us from single-width to double-width.
+ Do what the w_mbcs = 0xff path would have done below if the character
+ was double-width to begin with. */
+  omc.mbcs = 0xff;
+  if (ox < cols - 1)
+{
+		  curr->w_mlines[oy].image[ox + 1] = 0xff;
+		  curr->w_mlines[oy].font[ox + 1] = 0xff;
+		  curr->w_mlines[oy].fontx[ox + 1] = 0;
+		  curr->w_x++;
+}
+   }
+		  LPutChar(&curr->w_layer, &omc, ox, oy);
+		  LGotoPos(&curr->w_layer, curr->w_x, curr->w_y);
+		}
+		  break;
+}
+  /* fall through */
 		}
 #  ifdef DW_CHARS
 		if (curr->w_encoding == UTF8 && utf8_isdouble(c))
diff -ur /home/sesse/nmu/orig/screen-4.9.0/encoding.c ../encoding.c
--- /home/sesse/nmu/orig/screen-4.9.0/encoding.c	2023-06-27 22:53:44.0 +0200
+++ ../encoding.c	2023-06-27 22:57:29.386767268 +0200
@@ -1215,6 +1233,398 @@
   return bisearch(c, combining, sizeof(combining) / sizeof(struct interval) - 1);
 }
 
+static bool
+is_grapheme_extend(c)
+int c;
+{
+  /* https://unicode.org/Public/15.0.0/ucd/DerivedCoreProperties.txt */
+  static const struct interval grapheme_extend[] = {
+{ 0x0300, 0x036F }, { 0x0483, 0x0487 }, { 0x0488, 0x0489 }, 
+{ 0x0591, 0x05BD }, { 0x05BF, 0x05BF }, { 0x05C1, 0x05C2 }, 
+{ 0x05C4, 0x05C5 }, { 0x05C7, 0x05C7 }, { 0x0610, 0x061A }, 
+{ 0x064B, 0x065F }, { 0x0670, 0x0670 }, { 0x06D6, 0x06DC }, 
+{ 0x06DF, 0x06E4 }, { 0x06E7, 0x06E8 }, { 0x06EA, 0x06ED }, 
+{ 0x0711, 0x0711 }, { 0x0730, 0x074A }, { 0x07A6, 0x07B0 }, 
+{ 0x07EB, 0x07F3 }, { 0x07FD, 0x07FD }, { 0x0816, 0x0819 }, 
+{ 0x081B, 0x0823 }, { 0x0825, 0x0827 }, { 0x0829, 0x082D }, 
+{ 0x0859, 0x085B }, { 0x0898, 0x089F }, { 0x08CA, 0x08E1 }, 
+{ 0x08E3, 0x0902 }, { 0x093A, 0x093A }, { 0x093C, 0x093C }, 
+{ 0x0941, 0x0948 }, { 0x094D, 0x094D }, { 0x0951, 0x0957 }, 
+{ 0x0962, 0x0963 }, { 0x0981, 0x0981 }, { 0x09BC, 0x09BC }, 
+{ 0x09BE, 0x09BE }, { 0x09C1, 0x09C4 }, { 0x09CD, 0x09CD }, 
+{ 0x09D7, 0x09D7 }, { 0x09E2, 0x09E3 }, { 0x09FE, 0x09FE }, 
+{ 0x0A01, 0x0A02 }, { 0x0A3C, 0x0A3C }, { 0x0A41, 0x0A42 }, 
+{ 0x0A47, 0x0A48 }, { 0x0A4B, 0x0A4D }, { 0x0A51, 0x0A51 }, 
+{ 0x0A70, 0x0A71 }, { 0x0A75, 0x0A75 }, { 0x0A81, 0x0A82 }, 
+{ 0x0ABC, 0x0ABC }, { 0x0AC1, 0x0AC5 }, { 0x0AC7, 0x0AC8 }, 
+{ 0x0ACD, 0x0ACD }, { 0x0AE2, 0x0AE3 }, { 0x0AFA, 0x0AFF }, 
+{ 0x0B01, 0x0B01 }, { 0x0B3C, 0x0B3C }, { 0x0B3E, 0x0B3E }, 
+{ 0x0B3F, 0x0B3F }, { 0x0B41, 0x0B44 }, { 0x0B4D, 0x0B4D }, 
+{ 0x0B55, 0x0B56 }, { 0x0B57, 0x0B57 }, { 0x0B62, 0x0B63 }, 
+{ 0x0B82, 0x0B82 }, { 0x0BBE, 0x0BBE }, { 0x0BC0, 

Bug#1039503: wrongly splits up extended grapheme clusters (like certain emoji)

2023-06-26 Thread Steinar H. Gunderson
On Mon, Jun 26, 2023 at 08:04:28PM +0200, Steinar H. Gunderson wrote:
> Is it possible to retrofit these rules? This specific rule would seem to
> hit a lot of modern emoji sequences (the Unicode Consortium seems to prefer
> using such sequences instead of defining new code points where possible);
> I would assume including the full table of grapheme clusters would help
> e.g. Korean text, too, although I cannot read Korean and have no idea
> whether it's actually a problem.

The attached patch seems to get us halfway there; screen now combines all of
them correctly into one cluster. However, it's still split for whatever
reason; only if I redraw (C-a l) the flag shows up, and the text breaking is
wrong, so I assume there's at least something wrong with the width
calculation, and possibly lots of other things I've not thought of. :-)

/* Steinar */
-- 
Homepage: https://www.sesse.net/
Description: Better detection of grapheme clusters
Author: Steinar H. Gunderson 
Bug-Debian: https://bugs.debian.org/1039503
Last-Update: 2023-06-25

---

--- screen-4.9.0.orig/ansi.c
+++ screen-4.9.0/ansi.c
@@ -694,9 +702,9 @@ register int len;
 		}
 		  curr->w_rend.font = 0;
 		}
-	  if (curr->w_encoding == UTF8 && c >= 0x0300 && utf8_iscomb(c))
+	  if (curr->w_encoding == UTF8 && c >= 0x0300)
 		{
-		  int ox, oy;
+		  int ox, oy, c_prev;
 		  struct mchar omc;
 
 		  ox = curr->w_x - 1;
@@ -718,15 +726,20 @@ register int len;
 			  omc.mbcs = 0xff;
 			}
 		}
-		  if (ox >= 0)
-		{
-		  utf8_handle_comb(c, &omc);
-		  MFixLine(curr, oy, &omc);
-		  copy_mchar2mline(&omc, &curr->w_mlines[oy], ox);
-		  LPutChar(&curr->w_layer, &omc, ox, oy);
-		  LGotoPos(&curr->w_layer, curr->w_x, curr->w_y);
-		}
-		  break;
+  c_prev = omc.image | (omc.font << 8) | omc.fontx << 16;
+  if (!grapheme_cluster_break(c_prev, c))
+{
+		  if (ox >= 0)
+		{
+		  utf8_handle_comb(c, &omc);
+		  MFixLine(curr, oy, &omc);
+		  copy_mchar2mline(&omc, &curr->w_mlines[oy], ox);
+		  LPutChar(&curr->w_layer, &omc, ox, oy);
+		  LGotoPos(&curr->w_layer, curr->w_x, curr->w_y);
+		}
+		  break;
+}
+  /* fall through */
 		}
 #  ifdef DW_CHARS
 		if (curr->w_encoding == UTF8 && utf8_isdouble(c))
--- screen-4.9.0.orig/encoding.c
+++ screen-4.9.0/encoding.c
@@ -1215,6 +1221,398 @@ int c;
   return bisearch(c, combining, sizeof(combining) / sizeof(struct interval) - 1);
 }
 
+static bool
+is_grapheme_extend(c)
+int c;
+{
+  /* https://unicode.org/Public/15.0.0/ucd/DerivedCoreProperties.txt */
+  static const struct interval grapheme_extend[] = {
+{ 0x0300, 0x036F }, { 0x0483, 0x0487 }, { 0x0488, 0x0489 }, 
+{ 0x0591, 0x05BD }, { 0x05BF, 0x05BF }, { 0x05C1, 0x05C2 }, 
+{ 0x05C4, 0x05C5 }, { 0x05C7, 0x05C7 }, { 0x0610, 0x061A }, 
+{ 0x064B, 0x065F }, { 0x0670, 0x0670 }, { 0x06D6, 0x06DC }, 
+{ 0x06DF, 0x06E4 }, { 0x06E7, 0x06E8 }, { 0x06EA, 0x06ED }, 
+{ 0x0711, 0x0711 }, { 0x0730, 0x074A }, { 0x07A6, 0x07B0 }, 
+{ 0x07EB, 0x07F3 }, { 0x07FD, 0x07FD }, { 0x0816, 0x0819 }, 
+{ 0x081B, 0x0823 }, { 0x0825, 0x0827 }, { 0x0829, 0x082D }, 
+{ 0x0859, 0x085B }, { 0x0898, 0x089F }, { 0x08CA, 0x08E1 }, 
+{ 0x08E3, 0x0902 }, { 0x093A, 0x093A }, { 0x093C, 0x093C }, 
+{ 0x0941, 0x0948 }, { 0x094D, 0x094D }, { 0x0951, 0x0957 }, 
+{ 0x0962, 0x0963 }, { 0x0981, 0x0981 }, { 0x09BC, 0x09BC }, 
+{ 0x09BE, 0x09BE }, { 0x09C1, 0x09C4 }, { 0x09CD, 0x09CD }, 
+{ 0x09D7, 0x09D7 }, { 0x09E2, 0x09E3 }, { 0x09FE, 0x09FE }, 
+{ 0x0A01, 0x0A02 }, { 0x0A3C, 0x0A3C }, { 0x0A41, 0x0A42 }, 
+{ 0x0A47, 0x0A48 }, { 0x0A4B, 0x0A4D }, { 0x0A51, 0x0A51 }, 
+{ 0x0A70, 0x0A71 }, { 0x0A75, 0x0A75 }, { 0x0A81, 0x0A82 }, 
+{ 0x0ABC, 0x0ABC }, { 0x0AC1, 0x0AC5 }, { 0x0AC7, 0x0AC8 }, 
+{ 0x0ACD, 0x0ACD }, { 0x0AE2, 0x0AE3 }, { 0x0AFA, 0x0AFF }, 
+{ 0x0B01, 0x0B01 }, { 0x0B3C, 0x0B3C }, { 0x0B3E, 0x0B3E }, 
+{ 0x0B3F, 0x0B3F }, { 0x0B41, 0x0B44 }, { 0x0B4D, 0x0B4D }, 
+{ 0x0B55, 0x0B56 }, { 0x0B57, 0x0B57 }, { 0x0B62, 0x0B63 }, 
+{ 0x0B82, 0x0B82 }, { 0x0BBE, 0x0BBE }, { 0x0BC0, 0x0BC0 }, 
+{ 0x0BCD, 0x0BCD }, { 0x0BD7, 0x0BD7 }, { 0x0C00, 0x0C00 }, 
+{ 0x0C04, 0x0C04 }, { 0x0C3C, 0x0C3C }, { 0x0C3E, 0x0C40 }, 
+{ 0x0C46, 0x0C48 }, { 0x0C4A, 0x0C4D }, { 0x0C55, 0x0C56 }, 
+{ 0x0C62, 0x0C63 }, { 0x0C81, 0x0C81 }, { 0x0CBC, 0x0CBC }, 
+{ 0x0CBF, 0x0CBF }, { 0x0CC2, 0x0CC2 }, { 0x0CC6, 0x0CC6 }, 
+{ 0x0CCC, 0x0CCD }, { 0x0CD5, 0x0CD6 }, { 0x0CE2, 0x0CE3 }, 
+{ 0x0D00, 0x0D01 }, { 0x0D3B, 0x0D3C }, { 0x0D3E, 0x0D3E }, 
+{ 0x0D41, 0x0D44 }, { 0x0D4D, 0x0D4D }, { 0x0D57, 0x0D57 }, 
+{ 0x0D62, 0x0D63 }, { 0x0D81, 0x0D81 }, { 0x0DCA, 0x0DCA }, 
+{ 0x0DCF, 0x0DCF }, { 0x0DD2, 0x0DD4 }, { 0x0DD6, 0x0DD6 }, 
+{ 0x0DDF, 0x0DDF }, { 0x0E31, 0x0E31 }, { 0x0E34, 0x0E3A }, 
+{ 0x0E47, 0x0E

Bug#1039503: wrongly splits up extended grapheme clusters (like certain emoji)

2023-06-26 Thread Steinar H. Gunderson
Package: screen
Version: 4.9.0-4
Severity: normal
Tags: upstream

Hi,

I was trying to figure out why irssi sometimes garbles the display when certain
emoji are involved in the channel topic; after some debugging, it seems the 
issue
is with screen, not irssi. To reproduce, start up screen and do (in a shell)

  echo '\0360\0237\0217\0263\0357\0270\0217\0342\0200\0215\0360\0237\0214\0210'

(That is UTF-8 for U+1F3F3 U+FE0F U+200D U+1F308.)

Outside of screen, I correctly get 🏳️‍🌈, , RAINBOW FLAG. However, inside
screen, I get 🏳️, space, 🌈 (WHITE FLAG, space, RAINBOW). Curiously enough,
if I switch windows and get an irssi redraw, it actually gets drawn correctly,
but that's probably something more complex.

I've tried following the code, and this is my understanding of what's going on:
screen really wants one cell = one codepoint, yet it needs to support combining
characters. So internally, it seems to keep a sort of cache of combining 
sequences,
using the otherwise-reserved-for-surrogates range U+D800..U+DFFF.

This seems to happen in two steps; utf8_handle_comb() (in encoding.c) adds more
code points to a given sequence, allocating points and keeping a linked list.
(There's some confusion in that the “font” member of struct mchar is used to
hold the upper 8 bits of the resulting surrogate, but I think that's just some
sort of hack because the “image” member is 8-bit only? And there's something
about double-wide characters that I don't fully understand.) Then, at display
time, ToUtf8_comb() follows this linked list back to output the entire sequence.

The screen debug log appears to go through this (I've kept only what I think
are relevant messages):

  read UNICODE 1f3f3
  read UNICODE fe0f
  combinig char 1f3f3 fe0f -> d800
  bring to front: 0
  GotoPos (1,1) -> (0,1)
  ---LGotoPos 1 1
  read UNICODE 200d
  combinig char d800 200d -> d801
  bring to front: 1
  bring to front: 0
  GotoPos (1,1) -> (0,1)
  ---LGotoPos 1 1
  read UNICODE 1f308
  ---LGotoPos 0 1

Seemingly, it understands that the two first codepoints are to be combined,
allocates U+D800 for that, and then continues reading. Then it reads
the third one, combines it with U+D800 to create U+D801, but then mistakenly
does _not_ combine U+1F308 with it. This is why we end up with two different
things on screen.

It really seems to me that screen simply doesn't understand Unicode extended
grapheme clusters, only the legacy “legacy grapheme clusters”; from UAX#29
(https://unicode.org/reports/tr29/):

  A legacy grapheme cluster is defined as a base (such as A or カ) followed by
  zero or more continuing characters. One way to think of this is as a sequence
  of characters that form a “stack”.

This seems to come from ansi.c line 705 (WriteString()):

  if (curr->w_encoding == UTF8 && c >= 0x0300 && utf8_iscomb(c))
   {
 […]
 utf8_handle_comb(c, &omc);

In other words, it thinks certain code points are inherently combining
and thus are to be attached to the previous character. However, that's
changed since Unicode 5.1.0, circa 2008; now all of these four together
should form an extended grapheme cluster.

The rules for extended grapheme clusters are locale-dependent, but I'd
guess screen would do just fine with the default grapheme clusters
(certainly much better than today). The rules are actually pretty simple,
if a tad verbose:

https://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules

In particular, I would believe what we need here is rule GB11,
“Do not break within emoji modifier sequences or emoji zwj sequences.”:

  \p{Extended_Pictographic} Extend* ZWJ × \p{Extended_Pictographic}

where “×” means “don't break here”, i.e., continue to run
utf8_handle_comb(). Specifically, the rule matches because:

  U+1F3F3 WAVING WHITE FLAG is indeed Extended_Pictographic
  (according to 
https://unicode.org/Public/15.0.0/ucd/emoji/emoji-data.txt)
  U+FE0F  VARIATION SELECTOR-16 is Extend, because it is Grapheme_Extend
  (according to 
https://unicode.org/Public/15.0.0/ucd/DerivedCoreProperties.txt)
  U+200D  ZERO-WIDTH JOINER is, well, ZWJ
  U+1F308 RAINBOW is also Extended_Pictographic (same file)

Is it possible to retrofit these rules? This specific rule would seem to
hit a lot of modern emoji sequences (the Unicode Consortium seems to prefer
using such sequences instead of defining new code points where possible);
I would assume including the full table of grapheme clusters would help
e.g. Korean text, too, although I cannot read Korean and have no idea
whether it's actually a problem.

As a hack, it seems that anything that follows a ZWJ would be very likely
to keep being part of the same grapheme cluster, but I haven't tested this
out in practice.

FWIW, tmux seems to have no problems showing the flag, although I haven't
checked its implementation.

-- Package-specific info:
File Existence and Permissions
--

drwxr-xr-x 34 root root   1140 Jun 26 17:15 /run
lrwxrwxrwx  1 root root  4