In src/backend/utils/mb/wchar.c, function ucs_wcwidth(), there is a list of Unicode combining characters, so that those can be ignored for computing the display length of a Unicode string. It seems to me that that list is either outdated or plain incorrect.
For example, the list starts with {0x0300, 0x034E}, {0x0360, 0x0362}, {0x0483, 0x0486}, Let's look at the characters around the first "gap": (https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt) 034C;COMBINING ALMOST EQUAL TO ABOVE;Mn;230;NSM;;;;;N;;;;; 034D;COMBINING LEFT RIGHT ARROW BELOW;Mn;220;NSM;;;;;N;;;;; 034E;COMBINING UPWARDS ARROW BELOW;Mn;220;NSM;;;;;N;;;;; 034F;COMBINING GRAPHEME JOINER;Mn;0;NSM;;;;;N;;;;; 0350;COMBINING RIGHT ARROWHEAD ABOVE;Mn;230;NSM;;;;;N;;;;; 0351;COMBINING LEFT HALF RING ABOVE;Mn;230;NSM;;;;;N;;;;; So these are all in the "Mn" category, so they should all be treated the same here. Indeed, psql doesn't compute the width of some of them correctly: postgres=> select u&'|oo\034Coo|'; +----------+ | ?column? | +----------+ | |oXoo| | +----------+ postgres=> select u&'|oo\0350oo|'; +----------+ | ?column? | +----------+ | |oXoo| | +----------+ (I have replaced the combined character with X above so that the mail client rendering doesn't add another layer of uncertainty to this issue. The point is that the box is off in the second example.) AFAICT, these Unicode definitions haven't changed since that list was put in originally around 2006, so I wonder what's going on there. I have written a script that recomputes that list from the current Unicode data. Patch and script are attached. This makes those above cases all render correctly. (This should eventually get better build system integration.) Thoughts? -- Peter Eisentraut http://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
# Generate the "combining" interval table used by ucs_wcwidth() in
# src/backend/utils/mb/wchar.c from a UnicodeData.txt file.
#
# Usage: perl <this script> UnicodeData.txt
#
# A code point is treated as combining when its general category (third
# semicolon-separated field) is Mn or Me.  Runs of consecutive combining
# code points are collapsed into intervals; BOTH bounds of each printed
# interval are inclusive.

use strict;
use warnings;

# Code points above this value are ignored by the generator.
use constant MAX_CODEPOINT => 0xFFFF;

# combining_ranges(@lines)
#
# Takes lines in UnicodeData.txt format ("code;name;category;...") in
# ascending code point order and returns a list of [first, last] array
# refs, one per maximal run of consecutive combining code points.  Both
# bounds are inclusive.
#
# Lines that are blank, malformed, or describe a code point above
# MAX_CODEPOINT are skipped.
sub combining_ranges {
    my @lines = @_;

    my @ranges;
    my $range_start;    # first code point of the run currently open, if any
    my $range_end;      # last combining code point seen in that run

    foreach my $line (@lines) {
        chomp $line;
        my @fields = split /;/, $line;

        # Skip anything that does not look like a data record, so that
        # hex() and the category test never see undefined or junk input.
        next if @fields < 3 || $fields[0] !~ /\A[0-9A-Fa-f]+\z/;

        my $codepoint = hex $fields[0];
        next if $codepoint > MAX_CODEPOINT;

        my $is_combining = $fields[2] eq 'Me' || $fields[2] eq 'Mn';

        # Close the open run unless this code point extends it directly.
        # Requiring adjacency keeps code points that are absent from the
        # data file (unassigned) out of the table instead of bridging them.
        if (defined $range_start
            && !($is_combining && $codepoint == $range_end + 1))
        {
            push @ranges, [ $range_start, $range_end ];
            $range_start = undef;
        }

        if ($is_combining) {
            $range_start = $codepoint if !defined $range_start;
            $range_end   = $codepoint;
        }
    }

    # A run that is still open when the input ends must not be lost.
    push @ranges, [ $range_start, $range_end ] if defined $range_start;

    return @ranges;
}

# Read UnicodeData.txt from the files named on the command line (or from
# standard input) and print the C table, three intervals per line.
sub main {
    my @ranges = combining_ranges(<ARGV>);
    my $count  = 0;

    print "\tstatic const struct mbinterval combining[] = {";
    foreach my $range (@ranges) {
        my $separator = ($count++ % 3 == 0) ? "\n\t\t" : " ";
        print $separator;
        printf "{0x%04X, 0x%04X},", $range->[0], $range->[1];
    }
    print "\n\t};\n";

    return;
}

# Run only when executed as a script, so the helper can be loaded and
# tested without consuming input.
main() unless caller;
From a83a7e1bcc3cfee5efa24b4800720d1dc0c13e8b Mon Sep 17 00:00:00 2001 From: Peter Eisentraut <pe...@eisentraut.org> Date: Tue, 4 Jun 2019 22:43:05 +0200 Subject: [PATCH] Update list of combining characters --- src/backend/utils/mb/wchar.c | 98 +++++++++++++++++++++++------------- 1 file changed, 64 insertions(+), 34 deletions(-) diff --git a/src/backend/utils/mb/wchar.c b/src/backend/utils/mb/wchar.c index 8e5116dfc1..6a95c330ae 100644 --- a/src/backend/utils/mb/wchar.c +++ b/src/backend/utils/mb/wchar.c @@ -645,40 +645,70 @@ ucs_wcwidth(pg_wchar ucs) { /* sorted list of non-overlapping intervals of non-spacing characters */ static const struct mbinterval combining[] = { - {0x0300, 0x034E}, {0x0360, 0x0362}, {0x0483, 0x0486}, - {0x0488, 0x0489}, {0x0591, 0x05A1}, {0x05A3, 0x05B9}, - {0x05BB, 0x05BD}, {0x05BF, 0x05BF}, {0x05C1, 0x05C2}, - {0x05C4, 0x05C4}, {0x064B, 0x0655}, {0x0670, 0x0670}, - {0x06D6, 0x06E4}, {0x06E7, 0x06E8}, {0x06EA, 0x06ED}, - {0x070F, 0x070F}, {0x0711, 0x0711}, {0x0730, 0x074A}, - {0x07A6, 0x07B0}, {0x0901, 0x0902}, {0x093C, 0x093C}, - {0x0941, 0x0948}, {0x094D, 0x094D}, {0x0951, 0x0954}, - {0x0962, 0x0963}, {0x0981, 0x0981}, {0x09BC, 0x09BC}, - {0x09C1, 0x09C4}, {0x09CD, 0x09CD}, {0x09E2, 0x09E3}, - {0x0A02, 0x0A02}, {0x0A3C, 0x0A3C}, {0x0A41, 0x0A42}, - {0x0A47, 0x0A48}, {0x0A4B, 0x0A4D}, {0x0A70, 0x0A71}, - {0x0A81, 0x0A82}, {0x0ABC, 0x0ABC}, {0x0AC1, 0x0AC5}, - {0x0AC7, 0x0AC8}, {0x0ACD, 0x0ACD}, {0x0B01, 0x0B01}, - {0x0B3C, 0x0B3C}, {0x0B3F, 0x0B3F}, {0x0B41, 0x0B43}, - {0x0B4D, 0x0B4D}, {0x0B56, 0x0B56}, {0x0B82, 0x0B82}, - {0x0BC0, 0x0BC0}, {0x0BCD, 0x0BCD}, {0x0C3E, 0x0C40}, - {0x0C46, 0x0C48}, {0x0C4A, 0x0C4D}, {0x0C55, 0x0C56}, - {0x0CBF, 0x0CBF}, {0x0CC6, 0x0CC6}, {0x0CCC, 0x0CCD}, - {0x0D41, 0x0D43}, {0x0D4D, 0x0D4D}, {0x0DCA, 0x0DCA}, - {0x0DD2, 0x0DD4}, {0x0DD6, 0x0DD6}, {0x0E31, 0x0E31}, - {0x0E34, 0x0E3A}, {0x0E47, 0x0E4E}, {0x0EB1, 0x0EB1}, - {0x0EB4, 0x0EB9}, {0x0EBB, 0x0EBC}, {0x0EC8, 0x0ECD}, - {0x0F18, 0x0F19}, {0x0F35, 
0x0F35}, {0x0F37, 0x0F37}, - {0x0F39, 0x0F39}, {0x0F71, 0x0F7E}, {0x0F80, 0x0F84}, - {0x0F86, 0x0F87}, {0x0F90, 0x0F97}, {0x0F99, 0x0FBC}, - {0x0FC6, 0x0FC6}, {0x102D, 0x1030}, {0x1032, 0x1032}, - {0x1036, 0x1037}, {0x1039, 0x1039}, {0x1058, 0x1059}, - {0x1160, 0x11FF}, {0x17B7, 0x17BD}, {0x17C6, 0x17C6}, - {0x17C9, 0x17D3}, {0x180B, 0x180E}, {0x18A9, 0x18A9}, - {0x200B, 0x200F}, {0x202A, 0x202E}, {0x206A, 0x206F}, - {0x20D0, 0x20E3}, {0x302A, 0x302F}, {0x3099, 0x309A}, - {0xFB1E, 0xFB1E}, {0xFE20, 0xFE23}, {0xFEFF, 0xFEFF}, - {0xFFF9, 0xFFFB} + {0x0300, 0x0370}, {0x0483, 0x048A}, {0x0591, 0x05BE}, + {0x05BF, 0x05C0}, {0x05C1, 0x05C3}, {0x05C4, 0x05C6}, + {0x05C7, 0x05D0}, {0x0610, 0x061B}, {0x064B, 0x0660}, + {0x0670, 0x0671}, {0x06D6, 0x06DD}, {0x06DF, 0x06E5}, + {0x06E7, 0x06E9}, {0x06EA, 0x06EE}, {0x0711, 0x0712}, + {0x0730, 0x074D}, {0x07A6, 0x07B1}, {0x07EB, 0x07F4}, + {0x07FD, 0x07FE}, {0x0816, 0x081A}, {0x081B, 0x0824}, + {0x0825, 0x0828}, {0x0829, 0x0830}, {0x0859, 0x085E}, + {0x08D3, 0x08E2}, {0x08E3, 0x0903}, {0x093A, 0x093B}, + {0x093C, 0x093D}, {0x0941, 0x0949}, {0x094D, 0x094E}, + {0x0951, 0x0958}, {0x0962, 0x0964}, {0x0981, 0x0982}, + {0x09BC, 0x09BD}, {0x09C1, 0x09C7}, {0x09CD, 0x09CE}, + {0x09E2, 0x09E6}, {0x09FE, 0x0A03}, {0x0A3C, 0x0A3E}, + {0x0A41, 0x0A59}, {0x0A70, 0x0A72}, {0x0A75, 0x0A76}, + {0x0A81, 0x0A83}, {0x0ABC, 0x0ABD}, {0x0AC1, 0x0AC9}, + {0x0ACD, 0x0AD0}, {0x0AE2, 0x0AE6}, {0x0AFA, 0x0B02}, + {0x0B3C, 0x0B3D}, {0x0B3F, 0x0B40}, {0x0B41, 0x0B47}, + {0x0B4D, 0x0B57}, {0x0B62, 0x0B66}, {0x0B82, 0x0B83}, + {0x0BC0, 0x0BC1}, {0x0BCD, 0x0BD0}, {0x0C00, 0x0C01}, + {0x0C04, 0x0C05}, {0x0C3E, 0x0C41}, {0x0C46, 0x0C58}, + {0x0C62, 0x0C66}, {0x0C81, 0x0C82}, {0x0CBC, 0x0CBD}, + {0x0CBF, 0x0CC0}, {0x0CC6, 0x0CC7}, {0x0CCC, 0x0CD5}, + {0x0CE2, 0x0CE6}, {0x0D00, 0x0D02}, {0x0D3B, 0x0D3D}, + {0x0D41, 0x0D46}, {0x0D4D, 0x0D4E}, {0x0D62, 0x0D66}, + {0x0DCA, 0x0DCF}, {0x0DD2, 0x0DD8}, {0x0E31, 0x0E32}, + {0x0E34, 0x0E3F}, {0x0E47, 0x0E4F}, {0x0EB1, 
0x0EB2}, + {0x0EB4, 0x0EBD}, {0x0EC8, 0x0ED0}, {0x0F18, 0x0F1A}, + {0x0F35, 0x0F36}, {0x0F37, 0x0F38}, {0x0F39, 0x0F3A}, + {0x0F71, 0x0F7F}, {0x0F80, 0x0F85}, {0x0F86, 0x0F88}, + {0x0F8D, 0x0FBE}, {0x0FC6, 0x0FC7}, {0x102D, 0x1031}, + {0x1032, 0x1038}, {0x1039, 0x103B}, {0x103D, 0x103F}, + {0x1058, 0x105A}, {0x105E, 0x1061}, {0x1071, 0x1075}, + {0x1082, 0x1083}, {0x1085, 0x1087}, {0x108D, 0x108E}, + {0x109D, 0x109E}, {0x135D, 0x1360}, {0x1712, 0x1720}, + {0x1732, 0x1735}, {0x1752, 0x1760}, {0x1772, 0x1780}, + {0x17B4, 0x17B6}, {0x17B7, 0x17BE}, {0x17C6, 0x17C7}, + {0x17C9, 0x17D4}, {0x17DD, 0x17E0}, {0x180B, 0x180E}, + {0x1885, 0x1887}, {0x18A9, 0x18AA}, {0x1920, 0x1923}, + {0x1927, 0x1929}, {0x1932, 0x1933}, {0x1939, 0x1940}, + {0x1A17, 0x1A19}, {0x1A1B, 0x1A1E}, {0x1A56, 0x1A57}, + {0x1A58, 0x1A61}, {0x1A62, 0x1A63}, {0x1A65, 0x1A6D}, + {0x1A73, 0x1A80}, {0x1AB0, 0x1B04}, {0x1B34, 0x1B35}, + {0x1B36, 0x1B3B}, {0x1B3C, 0x1B3D}, {0x1B42, 0x1B43}, + {0x1B6B, 0x1B74}, {0x1B80, 0x1B82}, {0x1BA2, 0x1BA6}, + {0x1BA8, 0x1BAA}, {0x1BAB, 0x1BAE}, {0x1BE6, 0x1BE7}, + {0x1BE8, 0x1BEA}, {0x1BED, 0x1BEE}, {0x1BEF, 0x1BF2}, + {0x1C2C, 0x1C34}, {0x1C36, 0x1C3B}, {0x1CD0, 0x1CD3}, + {0x1CD4, 0x1CE1}, {0x1CE2, 0x1CE9}, {0x1CED, 0x1CEE}, + {0x1CF4, 0x1CF5}, {0x1CF8, 0x1CFA}, {0x1DC0, 0x1E00}, + {0x20D0, 0x2100}, {0x2CEF, 0x2CF2}, {0x2D7F, 0x2D80}, + {0x2DE0, 0x2E00}, {0x302A, 0x302E}, {0x3099, 0x309B}, + {0xA66F, 0xA673}, {0xA674, 0xA67E}, {0xA69E, 0xA6A0}, + {0xA6F0, 0xA6F2}, {0xA802, 0xA803}, {0xA806, 0xA807}, + {0xA80B, 0xA80C}, {0xA825, 0xA827}, {0xA8C4, 0xA8CE}, + {0xA8E0, 0xA8F2}, {0xA8FF, 0xA900}, {0xA926, 0xA92E}, + {0xA947, 0xA952}, {0xA980, 0xA983}, {0xA9B3, 0xA9B4}, + {0xA9B6, 0xA9BA}, {0xA9BC, 0xA9BE}, {0xA9E5, 0xA9E6}, + {0xAA29, 0xAA2F}, {0xAA31, 0xAA33}, {0xAA35, 0xAA40}, + {0xAA43, 0xAA44}, {0xAA4C, 0xAA4D}, {0xAA7C, 0xAA7D}, + {0xAAB0, 0xAAB1}, {0xAAB2, 0xAAB5}, {0xAAB7, 0xAAB9}, + {0xAABE, 0xAAC0}, {0xAAC1, 0xAAC2}, {0xAAEC, 0xAAEE}, + {0xAAF6, 0xAB01}, {0xABE5, 
0xABE6}, {0xABE8, 0xABE9}, + {0xABED, 0xABF0}, {0xFB1E, 0xFB1F}, {0xFE00, 0xFE10}, + {0xFE20, 0xFE30} }; /* test for 8-bit control characters */ -- 2.21.0