[systemd-devel] [PATCH 1/2] util, utf8: make ellipsize and ellipsize_mem take into account multi-byte characters
rename old versions to ascii_* Do not take into account zerowidth characters, but do consider double-wide characters. Import needed utf8 helper code from glib. --- TODO | 4 -- src/shared/utf8.c | 120 ++ src/shared/utf8.h | 100 + src/shared/util.c | 70 ++- src/shared/util.h | 3 ++ 5 files changed, 292 insertions(+), 5 deletions(-) diff --git a/TODO b/TODO index 08d4914..509b31f 100644 --- a/TODO +++ b/TODO @@ -19,10 +19,6 @@ Bugfixes: * properly handle .mount unit state tracking when two mount points are stacked one on top of another on the exact same mount point. -* ellipsize_mem must take into account multi-byte unicode characters, and - - make the resulting line the requested number of *characters*, not *bytes*, - - avoid truncuating multi-byte sequences in the middle. - * When we detect invalid UTF-8, we cant't use it in an error message: log...("Path is not UTF-8 clean, ignoring assignment: %s", rvalue); diff --git a/src/shared/utf8.c b/src/shared/utf8.c index 655cc77..a9308b5 100644 --- a/src/shared/utf8.c +++ b/src/shared/utf8.c @@ -22,6 +22,11 @@ /* This file is based on the GLIB utf8 validation functions. The * original license text follows. */ +/* gunicode.h - Unicode manipulation functions + * + * Copyright (C) 1999, 2000 Tom Tromey + * Copyright 2000, 2005 Red Hat, Inc. + */ /* gutf8.c - Operations on UTF-8 strings. * * Copyright (C) 1999 Tom Tromey @@ -317,3 +322,118 @@ char *utf16_to_utf8(const void *s, size_t length) { return r; } + +/** + * g_utf8_prev_char: + * @p: a pointer to a position within a UTF-8 encoded string + * + * Finds the previous UTF-8 character in the string before @p. + * + * @p does not have to be at the beginning of a UTF-8 character. No check + * is made to see if the character found is actually valid other than + * it starts with an appropriate byte. If @p might be the first + * character of the string, you must use g_utf8_find_prev_char() instead. + * + * Return value: a pointer to the found character. 
+ **/ +char * +utf8_prev_char (const char *p) +{ + while (1) +{ + p--; + if ((*p & 0xc0) != 0x80) + return (char *)p; +} +} + +/** + * g_utf8_get_char: + * @p: a pointer to Unicode character encoded as UTF-8 + * + * Converts a sequence of bytes encoded as UTF-8 to a Unicode character. + * If @p does not point to a valid UTF-8 encoded character, results are + * undefined. If you are not sure that the bytes are complete + * valid Unicode characters, you should use g_utf8_get_char_validated() + * instead. + * + * Return value: the resulting character + **/ +unichar +utf8_get_char (const char *p) +{ + int i, mask = 0, len; + unichar result; + unsigned char c = (unsigned char) *p; + + UTF8_COMPUTE (c, mask, len); + if (len == -1) +return (unichar)-1; + UTF8_GET (result, p, i, mask, len); + + return result; +} + +struct Interval +{ + unichar start, end; +}; + +static int +interval_compare (const void *key, const void *elt) +{ + unichar c = (unichar) (long) (key); + struct Interval *interval = (struct Interval *)elt; + + if (c < interval->start) +return -1; + if (c > interval->end) +return +1; + + return 0; +} + +/* + * NOTE: + * + * The tables for g_unichar_iswide() and g_unichar_iswide_cjk() are + * generated from the Unicode Character Database's file + * extracted/DerivedEastAsianWidth.txt using the gen-iswide-table.py + * in this way: + * + * ./gen-iswide-table.py < path/to/ucd/extracted/DerivedEastAsianWidth.txt | fmt + * + * Last update for Unicode 6.0. + */ + +/** + * g_unichar_iswide: + * @c: a Unicode character + * + * Determines if a character is typically rendered in a double-width + * cell. + * + * Return value: %TRUE if the character is wide + **/ +bool +unichar_iswide (unichar c) +{ + /* See NOTE earlier for how to update this table. 
*/ + static const struct Interval wide[] = { +{0x1100, 0x115F}, {0x2329, 0x232A}, {0x2E80, 0x2E99}, {0x2E9B, 0x2EF3}, +{0x2F00, 0x2FD5}, {0x2FF0, 0x2FFB}, {0x3000, 0x303E}, {0x3041, 0x3096}, +{0x3099, 0x30FF}, {0x3105, 0x312D}, {0x3131, 0x318E}, {0x3190, 0x31BA}, +{0x31C0, 0x31E3}, {0x31F0, 0x321E}, {0x3220, 0x3247}, {0x3250, 0x32FE}, +{0x3300, 0x4DBF}, {0x4E00, 0xA48C}, {0xA490, 0xA4C6}, {0xA960, 0xA97C}, +{0xAC00, 0xD7A3}, {0xF900, 0xFAFF}, {0xFE10, 0xFE19}, {0xFE30, 0xFE52}, +{0xFE54, 0xFE66}, {0xFE68, 0xFE6B}, {0xFF01, 0xFF60}, {0xFFE0, 0xFFE6}, +{0x1B000, 0x1B001}, {0x1F200, 0x1F202}, {0x1F210, 0x1F23A}, {0x1F240, +0x1F248}, {0x1F250, 0x1F251}, {0x20000, 0x2FFFD}, {0x30000, 0x3FFFD} + }; + + if (bsearch ((long *)c, wide, (sizeof (wide) / sizeof ((wide)[0])), sizeof wide[0], + interval_compare)) +return true; + + return false; +} diff --git a/src/shared/utf8.h b/src/shared/utf8.h index f805ea6..f1be180 100644 --- a/src/shared/utf8.h ++
[systemd-devel] [PATCH 1/2] util, utf8: make ellipsize and ellipsize_mem take into account multi-byte characters
rename old versions to ascii_* Do not take into account zerowidth characters, but do consider double-wide characters. Import needed utf8 helper code from glib. --- TODO | 4 -- src/shared/utf8.c | 120 ++ src/shared/utf8.h | 100 + src/shared/util.c | 77 +-- src/shared/util.h | 3 ++ 5 files changed, 296 insertions(+), 8 deletions(-) diff --git a/TODO b/TODO index 08d4914..509b31f 100644 --- a/TODO +++ b/TODO @@ -19,10 +19,6 @@ Bugfixes: * properly handle .mount unit state tracking when two mount points are stacked one on top of another on the exact same mount point. -* ellipsize_mem must take into account multi-byte unicode characters, and - - make the resulting line the requested number of *characters*, not *bytes*, - - avoid truncuating multi-byte sequences in the middle. - * When we detect invalid UTF-8, we cant't use it in an error message: log...("Path is not UTF-8 clean, ignoring assignment: %s", rvalue); diff --git a/src/shared/utf8.c b/src/shared/utf8.c index 655cc77..a9308b5 100644 --- a/src/shared/utf8.c +++ b/src/shared/utf8.c @@ -22,6 +22,11 @@ /* This file is based on the GLIB utf8 validation functions. The * original license text follows. */ +/* gunicode.h - Unicode manipulation functions + * + * Copyright (C) 1999, 2000 Tom Tromey + * Copyright 2000, 2005 Red Hat, Inc. + */ /* gutf8.c - Operations on UTF-8 strings. * * Copyright (C) 1999 Tom Tromey @@ -317,3 +322,118 @@ char *utf16_to_utf8(const void *s, size_t length) { return r; } + +/** + * g_utf8_prev_char: + * @p: a pointer to a position within a UTF-8 encoded string + * + * Finds the previous UTF-8 character in the string before @p. + * + * @p does not have to be at the beginning of a UTF-8 character. No check + * is made to see if the character found is actually valid other than + * it starts with an appropriate byte. If @p might be the first + * character of the string, you must use g_utf8_find_prev_char() instead. + * + * Return value: a pointer to the found character. 
+ **/ +char * +utf8_prev_char (const char *p) +{ + while (1) +{ + p--; + if ((*p & 0xc0) != 0x80) + return (char *)p; +} +} + +/** + * g_utf8_get_char: + * @p: a pointer to Unicode character encoded as UTF-8 + * + * Converts a sequence of bytes encoded as UTF-8 to a Unicode character. + * If @p does not point to a valid UTF-8 encoded character, results are + * undefined. If you are not sure that the bytes are complete + * valid Unicode characters, you should use g_utf8_get_char_validated() + * instead. + * + * Return value: the resulting character + **/ +unichar +utf8_get_char (const char *p) +{ + int i, mask = 0, len; + unichar result; + unsigned char c = (unsigned char) *p; + + UTF8_COMPUTE (c, mask, len); + if (len == -1) +return (unichar)-1; + UTF8_GET (result, p, i, mask, len); + + return result; +} + +struct Interval +{ + unichar start, end; +}; + +static int +interval_compare (const void *key, const void *elt) +{ + unichar c = (unichar) (long) (key); + struct Interval *interval = (struct Interval *)elt; + + if (c < interval->start) +return -1; + if (c > interval->end) +return +1; + + return 0; +} + +/* + * NOTE: + * + * The tables for g_unichar_iswide() and g_unichar_iswide_cjk() are + * generated from the Unicode Character Database's file + * extracted/DerivedEastAsianWidth.txt using the gen-iswide-table.py + * in this way: + * + * ./gen-iswide-table.py < path/to/ucd/extracted/DerivedEastAsianWidth.txt | fmt + * + * Last update for Unicode 6.0. + */ + +/** + * g_unichar_iswide: + * @c: a Unicode character + * + * Determines if a character is typically rendered in a double-width + * cell. + * + * Return value: %TRUE if the character is wide + **/ +bool +unichar_iswide (unichar c) +{ + /* See NOTE earlier for how to update this table. 
*/ + static const struct Interval wide[] = { +{0x1100, 0x115F}, {0x2329, 0x232A}, {0x2E80, 0x2E99}, {0x2E9B, 0x2EF3}, +{0x2F00, 0x2FD5}, {0x2FF0, 0x2FFB}, {0x3000, 0x303E}, {0x3041, 0x3096}, +{0x3099, 0x30FF}, {0x3105, 0x312D}, {0x3131, 0x318E}, {0x3190, 0x31BA}, +{0x31C0, 0x31E3}, {0x31F0, 0x321E}, {0x3220, 0x3247}, {0x3250, 0x32FE}, +{0x3300, 0x4DBF}, {0x4E00, 0xA48C}, {0xA490, 0xA4C6}, {0xA960, 0xA97C}, +{0xAC00, 0xD7A3}, {0xF900, 0xFAFF}, {0xFE10, 0xFE19}, {0xFE30, 0xFE52}, +{0xFE54, 0xFE66}, {0xFE68, 0xFE6B}, {0xFF01, 0xFF60}, {0xFFE0, 0xFFE6}, +{0x1B000, 0x1B001}, {0x1F200, 0x1F202}, {0x1F210, 0x1F23A}, {0x1F240, +0x1F248}, {0x1F250, 0x1F251}, {0x20000, 0x2FFFD}, {0x30000, 0x3FFFD} + }; + + if (bsearch ((long *)c, wide, (sizeof (wide) / sizeof ((wide)[0])), sizeof wide[0], + interval_compare)) +return true; + + return false; +} diff --git a/src/shared/utf8.h b/src/shared/utf8.h index f805ea6..f1be180 100644 --- a/src/shared/utf8.
[systemd-devel] [PATCH 1/2] util, utf8: make ellipsize and ellipsize_mem take into account multi-byte characters
rename old versions to ascii_* Do not take into account zerowidth characters, but do consider double-wide characters. Import needed utf8 helper code from glib. v3: rebase on top of utf8 restructuring work --- TODO | 4 -- src/shared/utf8.c | 120 ++ src/shared/utf8.h | 100 + src/shared/util.c | 70 ++- src/shared/util.h | 3 ++ 5 files changed, 292 insertions(+), 5 deletions(-) diff --git a/TODO b/TODO index 01bc993..c54e986 100644 --- a/TODO +++ b/TODO @@ -19,10 +19,6 @@ Bugfixes: * properly handle .mount unit state tracking when two mount points are stacked one on top of another on the exact same mount point. -* ellipsize_mem must take into account multi-byte unicode characters, and - - make the resulting line the requested number of *characters*, not *bytes*, - - avoid truncuating multi-byte sequences in the middle. - * When we detect invalid UTF-8, we cant't use it in an error message: log...("Path is not UTF-8 clean, ignoring assignment: %s", rvalue); diff --git a/src/shared/utf8.c b/src/shared/utf8.c index c3d97cc..2b07265 100644 --- a/src/shared/utf8.c +++ b/src/shared/utf8.c @@ -22,6 +22,11 @@ /* This file is based on the GLIB utf8 validation functions. The * original license text follows. */ +/* gunicode.h - Unicode manipulation functions + * + * Copyright (C) 1999, 2000 Tom Tromey + * Copyright 2000, 2005 Red Hat, Inc. + */ /* gutf8.c - Operations on UTF-8 strings. * * Copyright (C) 1999 Tom Tromey @@ -285,3 +290,118 @@ int utf8_encoded_valid_unichar(const char *str) { return len; } + +/** + * g_utf8_prev_char: + * @p: a pointer to a position within a UTF-8 encoded string + * + * Finds the previous UTF-8 character in the string before @p. + * + * @p does not have to be at the beginning of a UTF-8 character. No check + * is made to see if the character found is actually valid other than + * it starts with an appropriate byte. If @p might be the first + * character of the string, you must use g_utf8_find_prev_char() instead. 
+ * + * Return value: a pointer to the found character. + **/ +char * +utf8_prev_char (const char *p) +{ + while (1) +{ + p--; + if ((*p & 0xc0) != 0x80) +return (char *)p; +} +} + +/** + * g_utf8_get_char: + * @p: a pointer to Unicode character encoded as UTF-8 + * + * Converts a sequence of bytes encoded as UTF-8 to a Unicode character. + * If @p does not point to a valid UTF-8 encoded character, results are + * undefined. If you are not sure that the bytes are complete + * valid Unicode characters, you should use g_utf8_get_char_validated() + * instead. + * + * Return value: the resulting character + **/ +unichar +utf8_get_char (const char *p) +{ + int i, mask = 0, len; + unichar result; + unsigned char c = (unsigned char) *p; + + UTF8_COMPUTE (c, mask, len); + if (len == -1) +return (unichar)-1; + UTF8_GET (result, p, i, mask, len); + + return result; +} + +struct Interval +{ + unichar start, end; +}; + +static int +interval_compare (const void *key, const void *elt) +{ + unichar c = (unichar) (long) (key); + struct Interval *interval = (struct Interval *)elt; + + if (c < interval->start) +return -1; + if (c > interval->end) +return +1; + + return 0; +} + +/* + * NOTE: + * + * The tables for g_unichar_iswide() and g_unichar_iswide_cjk() are + * generated from the Unicode Character Database's file + * extracted/DerivedEastAsianWidth.txt using the gen-iswide-table.py + * in this way: + * + * ./gen-iswide-table.py < path/to/ucd/extracted/DerivedEastAsianWidth.txt | fmt + * + * Last update for Unicode 6.0. + */ + +/** + * g_unichar_iswide: + * @c: a Unicode character + * + * Determines if a character is typically rendered in a double-width + * cell. + * + * Return value: %TRUE if the character is wide + **/ +bool +unichar_iswide (unichar c) +{ + /* See NOTE earlier for how to update this table. 
*/ + static const struct Interval wide[] = { +{0x1100, 0x115F}, {0x2329, 0x232A}, {0x2E80, 0x2E99}, {0x2E9B, 0x2EF3}, +{0x2F00, 0x2FD5}, {0x2FF0, 0x2FFB}, {0x3000, 0x303E}, {0x3041, 0x3096}, +{0x3099, 0x30FF}, {0x3105, 0x312D}, {0x3131, 0x318E}, {0x3190, 0x31BA}, +{0x31C0, 0x31E3}, {0x31F0, 0x321E}, {0x3220, 0x3247}, {0x3250, 0x32FE}, +{0x3300, 0x4DBF}, {0x4E00, 0xA48C}, {0xA490, 0xA4C6}, {0xA960, 0xA97C}, +{0xAC00, 0xD7A3}, {0xF900, 0xFAFF}, {0xFE10, 0xFE19}, {0xFE30, 0xFE52}, +{0xFE54, 0xFE66}, {0xFE68, 0xFE6B}, {0xFF01, 0xFF60}, {0xFFE0, 0xFFE6}, +{0x1B000, 0x1B001}, {0x1F200, 0x1F202}, {0x1F210, 0x1F23A}, {0x1F240, +0x1F248}, {0x1F250, 0x1F251}, {0x20000, 0x2FFFD}, {0x30000, 0x3FFFD} + }; + + if (bsearch ((long *)c, wide, (sizeof (wide) / sizeof ((wide)[0])), sizeof wide[0], + interval_compare)) +return true; + + return false; +} diff --git a/src/shared/utf8.h b/src/shared/utf8.h index 96a