--- Makefile.am | 7 +++ TODO | 4 -- src/shared/utf8.c | 120 +++++++++++++++++++++++++++++++++++++++++++++ src/shared/utf8.h | 100 +++++++++++++++++++++++++++++++++++++ src/shared/util.c | 83 +++++++++++++++++++++++++++++-- src/shared/util.h | 3 ++ src/test/test-wellipsize.c | 42 ++++++++++++++++ 7 files changed, 351 insertions(+), 8 deletions(-) create mode 100644 src/test/test-wellipsize.c
diff --git a/Makefile.am b/Makefile.am index cdbfdea..8813299 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1124,6 +1124,7 @@ tests += \ test-unit-file \ test-util \ test-date \ + test-wellipsize \ test-sleep \ test-replace-var \ test-sched-prio \ @@ -1302,6 +1303,12 @@ test_date_SOURCES = \ test_date_LDADD = \ libsystemd-core.la +test_wellipsize_SOURCES = \ + src/test/test-wellipsize.c + +test_wellipsize_LDADD = \ + libsystemd-core.la + test_sleep_SOURCES = \ src/test/test-sleep.c diff --git a/TODO b/TODO index fe305ec..a77ebe5 100644 --- a/TODO +++ b/TODO @@ -19,10 +19,6 @@ Bugfixes: * properly handle .mount unit state tracking when two mount points are stacked one on top of another on the exact same mount point. -* ellipsize_mem must take into account multi-byte unicode characters, and - - make the resulting line the requested number of *characters*, not *bytes*, - - avoid truncuating multi-byte sequences in the middle. - * When we detect invalid UTF-8, we cant't use it in an error message: log...("Path is not UTF-8 clean, ignoring assignment: %s", rvalue); diff --git a/src/shared/utf8.c b/src/shared/utf8.c index 655cc77..a9308b5 100644 --- a/src/shared/utf8.c +++ b/src/shared/utf8.c @@ -22,6 +22,11 @@ /* This file is based on the GLIB utf8 validation functions. The * original license text follows. */ +/* gunicode.h - Unicode manipulation functions + * + * Copyright (C) 1999, 2000 Tom Tromey + * Copyright 2000, 2005 Red Hat, Inc. + */ /* gutf8.c - Operations on UTF-8 strings. * * Copyright (C) 1999 Tom Tromey @@ -317,3 +322,118 @@ char *utf16_to_utf8(const void *s, size_t length) { return r; } + +/** + * g_utf8_prev_char: + * @p: a pointer to a position within a UTF-8 encoded string + * + * Finds the previous UTF-8 character in the string before @p. + * + * @p does not have to be at the beginning of a UTF-8 character. No check + * is made to see if the character found is actually valid other than + * it starts with an appropriate byte. 
If @p might be the first + * character of the string, you must use g_utf8_find_prev_char() instead. + * + * Return value: a pointer to the found character. + **/ +char * +utf8_prev_char (const char *p) +{ + while (1) + { + p--; + if ((*p & 0xc0) != 0x80) + return (char *)p; + } +} + +/** + * g_utf8_get_char: + * @p: a pointer to Unicode character encoded as UTF-8 + * + * Converts a sequence of bytes encoded as UTF-8 to a Unicode character. + * If @p does not point to a valid UTF-8 encoded character, results are + * undefined. If you are not sure that the bytes are complete + * valid Unicode characters, you should use g_utf8_get_char_validated() + * instead. + * + * Return value: the resulting character + **/ +unichar +utf8_get_char (const char *p) +{ + int i, mask = 0, len; + unichar result; + unsigned char c = (unsigned char) *p; + + UTF8_COMPUTE (c, mask, len); + if (len == -1) + return (unichar)-1; + UTF8_GET (result, p, i, mask, len); + + return result; +} + +struct Interval +{ + unichar start, end; +}; + +static int +interval_compare (const void *key, const void *elt) +{ + unichar c = *(const unichar *) key; /* key is the address of the code point being looked up */ + const struct Interval *interval = elt; + + if (c < interval->start) + return -1; + if (c > interval->end) + return +1; + + return 0; +} + +/* + * NOTE: + * + * The tables for g_unichar_iswide() and g_unichar_iswide_cjk() are + * generated from the Unicode Character Database's file + * extracted/DerivedEastAsianWidth.txt using the gen-iswide-table.py + * in this way: + * + * ./gen-iswide-table.py < path/to/ucd/extracted/DerivedEastAsianWidth.txt | fmt + * + * Last update for Unicode 6.0. + */ + +/** + * g_unichar_iswide: + * @c: a Unicode character + * + * Determines if a character is typically rendered in a double-width + * cell. + * + * Return value: %TRUE if the character is wide + **/ +bool +unichar_iswide (unichar c) +{ + /* See NOTE earlier for how to update this table. 
*/ + static const struct Interval wide[] = { + {0x1100, 0x115F}, {0x2329, 0x232A}, {0x2E80, 0x2E99}, {0x2E9B, 0x2EF3}, + {0x2F00, 0x2FD5}, {0x2FF0, 0x2FFB}, {0x3000, 0x303E}, {0x3041, 0x3096}, + {0x3099, 0x30FF}, {0x3105, 0x312D}, {0x3131, 0x318E}, {0x3190, 0x31BA}, + {0x31C0, 0x31E3}, {0x31F0, 0x321E}, {0x3220, 0x3247}, {0x3250, 0x32FE}, + {0x3300, 0x4DBF}, {0x4E00, 0xA48C}, {0xA490, 0xA4C6}, {0xA960, 0xA97C}, + {0xAC00, 0xD7A3}, {0xF900, 0xFAFF}, {0xFE10, 0xFE19}, {0xFE30, 0xFE52}, + {0xFE54, 0xFE66}, {0xFE68, 0xFE6B}, {0xFF01, 0xFF60}, {0xFFE0, 0xFFE6}, + {0x1B000, 0x1B001}, {0x1F200, 0x1F202}, {0x1F210, 0x1F23A}, {0x1F240, + 0x1F248}, {0x1F250, 0x1F251}, {0x20000, 0x2FFFD}, {0x30000, 0x3FFFD} + }; + + if (bsearch (&c, wide, (sizeof (wide) / sizeof ((wide)[0])), sizeof wide[0], /* pass the key by address, not as a forged pointer */ + interval_compare)) + return true; + + return false; +} diff --git a/src/shared/utf8.h b/src/shared/utf8.h index f805ea6..f1be180 100644 --- a/src/shared/utf8.h +++ b/src/shared/utf8.h @@ -34,3 +34,103 @@ char *utf8_filter(const char *s); char *ascii_filter(const char *s); char *utf16_to_utf8(const void *s, size_t length); + +#define unichar uint32_t + +char *utf8_prev_char (const char *p); +unichar utf8_get_char (const char *p); + +#define UTF8_COMPUTE(Char, Mask, Len) \ + if (Char < 128) \ + { \ + Len = 1; \ + Mask = 0x7f; \ + } \ + else if ((Char & 0xe0) == 0xc0) \ + { \ + Len = 2; \ + Mask = 0x1f; \ + } \ + else if ((Char & 0xf0) == 0xe0) \ + { \ + Len = 3; \ + Mask = 0x0f; \ + } \ + else if ((Char & 0xf8) == 0xf0) \ + { \ + Len = 4; \ + Mask = 0x07; \ + } \ + else if ((Char & 0xfc) == 0xf8) \ + { \ + Len = 5; \ + Mask = 0x03; \ + } \ + else if ((Char & 0xfe) == 0xfc) \ + { \ + Len = 6; \ + Mask = 0x01; \ + } \ + else \ + Len = -1; + +#define UTF8_LENGTH(Char) \ + ((Char) < 0x80 ? 1 : \ + ((Char) < 0x800 ? 2 : \ + ((Char) < 0x10000 ? 3 : \ + ((Char) < 0x200000 ? 4 : \ + ((Char) < 0x4000000 ? 
5 : 6))))) + + +#define UTF8_GET(Result, Chars, Count, Mask, Len) \ + (Result) = (Chars)[0] & (Mask); \ + for ((Count) = 1; (Count) < (Len); ++(Count)) \ + { \ + if (((Chars)[(Count)] & 0xc0) != 0x80) \ + { \ + (Result) = -1; \ + break; \ + } \ + (Result) <<= 6; \ + (Result) |= ((Chars)[(Count)] & 0x3f); \ + } + +/* + * Check whether a Unicode (5.2) char is in a valid range. + * + * The first check comes from the Unicode guarantee to never encode + * a point above 0x0010ffff, since UTF-16 couldn't represent it. + * + * The second check covers surrogate pairs (category Cs). + * + * @param Char the character + */ +#define UNICODE_VALID(Char) \ + ((Char) < 0x110000 && \ + (((Char) & 0xFFFFF800) != 0xD800)) + +static const char utf8_skip_data[256] = { + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1 +}; + +/** + * g_utf8_next_char: + * @p: Pointer to the start of a valid UTF-8 character + * + * Skips to the next character in a UTF-8 string. The string must be + * valid; this macro is as fast as possible, and has no error-checking. + * You would use this macro to iterate over a string character by + * character. The macro returns the start of the next UTF-8 character. + * Before using this macro, use g_utf8_validate() to validate strings + * that may contain invalid UTF-8. 
+ */ +#define utf8_next_char(p) (char *)((p) + utf8_skip_data[*(const unsigned char *)(p)]) /* index as unsigned char: lead bytes are >= 0x80 and must not become negative indices */ + +bool unichar_iswide (unichar c); diff --git a/src/shared/util.c b/src/shared/util.c index 1dde8af..b791433 100644 --- a/src/shared/util.c +++ b/src/shared/util.c @@ -73,6 +73,7 @@ #include "hashmap.h" #include "env-util.h" #include "fileio.h" +#include "utf8.h" int saved_argc = 0; char **saved_argv = NULL; @@ -3285,8 +3286,8 @@ int running_in_chroot(void) { a.st_ino != b.st_ino; } -char *ellipsize_mem(const char *s, size_t old_length, size_t new_length, unsigned percent) { - size_t x; +char *ascii_ellipsize_mem(const char *s, size_t old_length, size_t new_length, unsigned percent) { + size_t x, j; char *r; assert(s); @@ -3305,17 +3306,91 @@ char *ellipsize_mem(const char *s, size_t old_length, size_t new_length, unsigne if (x > new_length - 3) x = new_length - 3; + for (; x > 0 && ((unsigned char) s[x-1] & 0x80); x--) /* x may be 0 (percent == 0): never read s[-1] */ + continue; + memcpy(r, s, x); r[x] = '.'; r[x+1] = '.'; r[x+2] = '.'; + + for (j = x + 3; j < new_length && ((unsigned char) s[j] & 0x80); j++) /* keep j <= new_length so new_length - j cannot wrap */ + continue; + memcpy(r + x + 3, - s + old_length - (new_length - x - 3), - new_length - x - 3); + s + old_length - (new_length - j), + new_length - j); return r; } +char *ascii_ellipsize(const char *s, size_t length, unsigned percent) { + return ascii_ellipsize_mem(s, strlen(s), length, percent); +} + 
+/* Shorten the UTF-8 string s (old_length bytes) to at most new_length columns by replacing its middle + * with U+2026; percent is the share of the budget kept from the head, wide characters count as two + * columns. Returns a newly allocated string, or NULL on allocation failure or invalid UTF-8. */ +char *ellipsize_mem(const char *s, size_t old_length, size_t new_length, unsigned percent) { + size_t x, k = 0, quota, head, tail; + const char *end, *i; + char *e; + + assert(s); + assert(percent <= 100); + assert(new_length >= 3); + + /* if no multibyte characters use ascii_ellipsize_mem for speed */ + if (ascii_is_valid(s)) + return ascii_ellipsize_mem(s, old_length, new_length, percent); + + if (old_length <= 3 || old_length <= new_length) + return strndup(s, old_length); + + if (!utf8_is_valid(s)) + return NULL; + + end = s + old_length; + x = (new_length * percent) / 100; + if (x > new_length - 3) + x = new_length - 3; + + /* head: walk forward up to x columns, never past the end of the input */ + for (i = s; k < x && i < end; i = utf8_next_char(i)) + k += unichar_iswide(utf8_get_char(i)) ? 2 : 1; + + if (i >= end) /* the whole input fits into the head budget, nothing to cut */ + return strndup(s, old_length); + if (k > x) /* last character was wide and went over quota */ + x++; + + head = i - s; + quota = new_length - x - 1; /* one column is reserved for the ellipsis */ + + /* tail: walk backwards, never into the bytes already taken by the head */ + for (i = end, k = 0; k < quota && i > s + head;) { + i = utf8_prev_char(i); + k += unichar_iswide(utf8_get_char(i)) ? 
2 : 1; + } + + if (i <= s + head) /* head and tail meet: the input already fits, nothing to cut */ + return strndup(s, old_length); + if (k > quota) /* last (reverse) character was wide and went over quota */ + i = utf8_next_char(i); + + tail = end - i; + e = new0(char, head + 3 + tail + 1); /* head + 3-byte U+2026 + tail + NUL */ + if (!e) + return NULL; + + memcpy(e, s, head); + memcpy(e + head, "\xe2\x80\xa6", 3); + memcpy(e + head + 3, i, tail); + e[head + 3 + tail] = 0; + + return e; +} + char *ellipsize(const char *s, size_t length, unsigned percent) { return ellipsize_mem(s, strlen(s), length, percent); } 
diff --git a/src/shared/util.h b/src/shared/util.h index 63f4e3d..eb21855 100644 --- a/src/shared/util.h +++ b/src/shared/util.h @@ -402,7 +402,10 @@ static inline const char *ansi_highlight_off(void) { int running_in_chroot(void); +char *ascii_ellipsize(const char *s, size_t length, unsigned percent); +char *ascii_ellipsize_mem(const char *s, size_t old_length, size_t new_length, unsigned percent); char *ellipsize(const char *s, size_t length, unsigned percent); + /* bytes columns */ char *ellipsize_mem(const char *s, size_t old_length, size_t new_length, unsigned percent); int touch(const char *path); diff --git a/src/test/test-wellipsize.c b/src/test/test-wellipsize.c new file mode 100644 index 0000000..f6db82c --- /dev/null +++ b/src/test/test-wellipsize.c @@ -0,0 +1,42 @@ +/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/ + +/*** + This file is part of systemd. + + Copyright 2013 Shawn Landden + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. 
+ + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see <http://www.gnu.org/licenses/>. +***/ + +#include <stdio.h> + +#include "util.h" +#include "utf8.h" + +static void test_one(const char *p) { + _cleanup_free_ char *t = NULL; + t = ellipsize(p, 80, 70); + puts(t); +} + +int main(int argc, char *argv[]) { + test_one("s??????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????"); + test_one("?????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????"); + test_one("??????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????"); + test_one("?????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????"); + 
test_one("????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????"); + test_one("asdfnjaskdfnklasdgnjaskdghnkasdgfklasdfjkasdfjaksdfaskldfnaskldfnaskldfnaklsdfnaklsdfnklnaskjgdknl"); + + return 0; +} -- 1.8.4.rc3
_______________________________________________ systemd-devel mailing list systemd-devel@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/systemd-devel