subject:"\[systemd\-devel\] \[PATCH 1\/2\] util, utf8\: make ellipsize and ellipsize_mem take into account multi\-byte characters"

[systemd-devel] [PATCH 1/2] util, utf8: make ellipsize and ellipsize_mem take into account multi-byte characters

2013-09-13 Thread Shawn Landden

rename old versions to ascii_*

Do not take into account zerowidth characters, but do consider double-wide 
characters.
Import needed utf8 helper code from glib.
---
 TODO  |   4 --
 src/shared/utf8.c | 120 ++
 src/shared/utf8.h | 100 +
 src/shared/util.c |  70 ++-
 src/shared/util.h |   3 ++
 5 files changed, 292 insertions(+), 5 deletions(-)

diff --git a/TODO b/TODO
index 08d4914..509b31f 100644
--- a/TODO
+++ b/TODO
@@ -19,10 +19,6 @@ Bugfixes:
 
 * properly handle .mount unit state tracking when two mount points are stacked 
one on top of another on the exact same mount point.
 
-* ellipsize_mem must take into account multi-byte unicode characters, and
-  - make the resulting line the requested number of *characters*, not *bytes*,
-  - avoid truncuating multi-byte sequences in the middle.
-
 * When we detect invalid UTF-8, we cant't use it in an error message:
   log...("Path is not UTF-8 clean, ignoring assignment: %s", rvalue);
 
diff --git a/src/shared/utf8.c b/src/shared/utf8.c
index 655cc77..a9308b5 100644
--- a/src/shared/utf8.c
+++ b/src/shared/utf8.c
@@ -22,6 +22,11 @@
 /* This file is based on the GLIB utf8 validation functions. The
  * original license text follows. */
 
+/* gunicode.h - Unicode manipulation functions
+ *
+ *  Copyright (C) 1999, 2000 Tom Tromey
+ *  Copyright 2000, 2005 Red Hat, Inc.
+ */
 /* gutf8.c - Operations on UTF-8 strings.
  *
  * Copyright (C) 1999 Tom Tromey
@@ -317,3 +322,118 @@ char *utf16_to_utf8(const void *s, size_t length) {
 
 return r;
 }
+
+/**
+ * g_utf8_prev_char:
+ * @p: a pointer to a position within a UTF-8 encoded string
+ *
+ * Finds the previous UTF-8 character in the string before @p.
+ *
+ * @p does not have to be at the beginning of a UTF-8 character. No check
+ * is made to see if the character found is actually valid other than
+ * it starts with an appropriate byte. If @p might be the first
+ * character of the string, you must use g_utf8_find_prev_char() instead.
+ *
+ * Return value: a pointer to the found character.
+ **/
+char *
+utf8_prev_char (const char *p)
+{
+  while (1)
+{
+  p--;
+  if ((*p & 0xc0) != 0x80)
+   return (char *)p;
+}
+}
+
+/**
+ * g_utf8_get_char:
+ * @p: a pointer to Unicode character encoded as UTF-8
+ *
+ * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
+ * If @p does not point to a valid UTF-8 encoded character, results are
+ * undefined. If you are not sure that the bytes are complete
+ * valid Unicode characters, you should use g_utf8_get_char_validated()
+ * instead.
+ *
+ * Return value: the resulting character
+ **/
+unichar
+utf8_get_char (const char *p)
+{
+  int i, mask = 0, len;
+  unichar result;
+  unsigned char c = (unsigned char) *p;
+
+  UTF8_COMPUTE (c, mask, len);
+  if (len == -1)
+return (unichar)-1;
+  UTF8_GET (result, p, i, mask, len);
+
+  return result;
+}
+
+struct Interval
+{
+  unichar start, end;
+};
+
+static int
+interval_compare (const void *key, const void *elt)
+{
+  unichar c = (unichar) (long) (key);
+  struct Interval *interval = (struct Interval *)elt;
+
+  if (c < interval->start)
+return -1;
+  if (c > interval->end)
+return +1;
+
+  return 0;
+}
+
+/*
+ * NOTE:
+ *
+ * The tables for g_unichar_iswide() and g_unichar_iswide_cjk() are
+ * generated from the Unicode Character Database's file
+ * extracted/DerivedEastAsianWidth.txt using the gen-iswide-table.py
+ * in this way:
+ *
+ *   ./gen-iswide-table.py < path/to/ucd/extracted/DerivedEastAsianWidth.txt | 
fmt
+ *
+ * Last update for Unicode 6.0.
+ */
+
+/**
+ * g_unichar_iswide:
+ * @c: a Unicode character
+ *
+ * Determines if a character is typically rendered in a double-width
+ * cell.
+ *
+ * Return value: %TRUE if the character is wide
+ **/
+bool
+unichar_iswide (unichar c)
+{
+  /* See NOTE earlier for how to update this table. */
+  static const struct Interval wide[] = {
+{0x1100, 0x115F}, {0x2329, 0x232A}, {0x2E80, 0x2E99}, {0x2E9B, 0x2EF3},
+{0x2F00, 0x2FD5}, {0x2FF0, 0x2FFB}, {0x3000, 0x303E}, {0x3041, 0x3096},
+{0x3099, 0x30FF}, {0x3105, 0x312D}, {0x3131, 0x318E}, {0x3190, 0x31BA},
+{0x31C0, 0x31E3}, {0x31F0, 0x321E}, {0x3220, 0x3247}, {0x3250, 0x32FE},
+{0x3300, 0x4DBF}, {0x4E00, 0xA48C}, {0xA490, 0xA4C6}, {0xA960, 0xA97C},
+{0xAC00, 0xD7A3}, {0xF900, 0xFAFF}, {0xFE10, 0xFE19}, {0xFE30, 0xFE52},
+{0xFE54, 0xFE66}, {0xFE68, 0xFE6B}, {0xFF01, 0xFF60}, {0xFFE0, 0xFFE6},
+{0x1B000, 0x1B001}, {0x1F200, 0x1F202}, {0x1F210, 0x1F23A}, {0x1F240,
+0x1F248}, {0x1F250, 0x1F251}, {0x2, 0x2FFFD}, {0x3, 0x3FFFD}
+  };
+
+  if (bsearch ((long *)c, wide, (sizeof (wide) / sizeof ((wide)[0])), sizeof 
wide[0],
+  interval_compare))
+return true;
+
+  return false;
+}
diff --git a/src/shared/utf8.h b/src/shared/utf8.h
index f805ea6..f1be180 100644
--- a/src/shared/utf8.h
++

[systemd-devel] [PATCH 1/2] util, utf8: make ellipsize and ellipsize_mem take into account multi-byte characters

2013-09-13 Thread Shawn Landden

rename old versions to ascii_*

Do not take into account zerowidth characters, but do consider double-wide 
characters.
Import needed utf8 helper code from glib.
---
 TODO  |   4 --
 src/shared/utf8.c | 120 ++
 src/shared/utf8.h | 100 +
 src/shared/util.c |  77 +--
 src/shared/util.h |   3 ++
 5 files changed, 296 insertions(+), 8 deletions(-)

diff --git a/TODO b/TODO
index 08d4914..509b31f 100644
--- a/TODO
+++ b/TODO
@@ -19,10 +19,6 @@ Bugfixes:
 
 * properly handle .mount unit state tracking when two mount points are stacked 
one on top of another on the exact same mount point.
 
-* ellipsize_mem must take into account multi-byte unicode characters, and
-  - make the resulting line the requested number of *characters*, not *bytes*,
-  - avoid truncuating multi-byte sequences in the middle.
-
 * When we detect invalid UTF-8, we cant't use it in an error message:
   log...("Path is not UTF-8 clean, ignoring assignment: %s", rvalue);
 
diff --git a/src/shared/utf8.c b/src/shared/utf8.c
index 655cc77..a9308b5 100644
--- a/src/shared/utf8.c
+++ b/src/shared/utf8.c
@@ -22,6 +22,11 @@
 /* This file is based on the GLIB utf8 validation functions. The
  * original license text follows. */
 
+/* gunicode.h - Unicode manipulation functions
+ *
+ *  Copyright (C) 1999, 2000 Tom Tromey
+ *  Copyright 2000, 2005 Red Hat, Inc.
+ */
 /* gutf8.c - Operations on UTF-8 strings.
  *
  * Copyright (C) 1999 Tom Tromey
@@ -317,3 +322,118 @@ char *utf16_to_utf8(const void *s, size_t length) {
 
 return r;
 }
+
+/**
+ * g_utf8_prev_char:
+ * @p: a pointer to a position within a UTF-8 encoded string
+ *
+ * Finds the previous UTF-8 character in the string before @p.
+ *
+ * @p does not have to be at the beginning of a UTF-8 character. No check
+ * is made to see if the character found is actually valid other than
+ * it starts with an appropriate byte. If @p might be the first
+ * character of the string, you must use g_utf8_find_prev_char() instead.
+ *
+ * Return value: a pointer to the found character.
+ **/
+char *
+utf8_prev_char (const char *p)
+{
+  while (1)
+{
+  p--;
+  if ((*p & 0xc0) != 0x80)
+   return (char *)p;
+}
+}
+
+/**
+ * g_utf8_get_char:
+ * @p: a pointer to Unicode character encoded as UTF-8
+ *
+ * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
+ * If @p does not point to a valid UTF-8 encoded character, results are
+ * undefined. If you are not sure that the bytes are complete
+ * valid Unicode characters, you should use g_utf8_get_char_validated()
+ * instead.
+ *
+ * Return value: the resulting character
+ **/
+unichar
+utf8_get_char (const char *p)
+{
+  int i, mask = 0, len;
+  unichar result;
+  unsigned char c = (unsigned char) *p;
+
+  UTF8_COMPUTE (c, mask, len);
+  if (len == -1)
+return (unichar)-1;
+  UTF8_GET (result, p, i, mask, len);
+
+  return result;
+}
+
+struct Interval
+{
+  unichar start, end;
+};
+
+static int
+interval_compare (const void *key, const void *elt)
+{
+  unichar c = (unichar) (long) (key);
+  struct Interval *interval = (struct Interval *)elt;
+
+  if (c < interval->start)
+return -1;
+  if (c > interval->end)
+return +1;
+
+  return 0;
+}
+
+/*
+ * NOTE:
+ *
+ * The tables for g_unichar_iswide() and g_unichar_iswide_cjk() are
+ * generated from the Unicode Character Database's file
+ * extracted/DerivedEastAsianWidth.txt using the gen-iswide-table.py
+ * in this way:
+ *
+ *   ./gen-iswide-table.py < path/to/ucd/extracted/DerivedEastAsianWidth.txt | 
fmt
+ *
+ * Last update for Unicode 6.0.
+ */
+
+/**
+ * g_unichar_iswide:
+ * @c: a Unicode character
+ *
+ * Determines if a character is typically rendered in a double-width
+ * cell.
+ *
+ * Return value: %TRUE if the character is wide
+ **/
+bool
+unichar_iswide (unichar c)
+{
+  /* See NOTE earlier for how to update this table. */
+  static const struct Interval wide[] = {
+{0x1100, 0x115F}, {0x2329, 0x232A}, {0x2E80, 0x2E99}, {0x2E9B, 0x2EF3},
+{0x2F00, 0x2FD5}, {0x2FF0, 0x2FFB}, {0x3000, 0x303E}, {0x3041, 0x3096},
+{0x3099, 0x30FF}, {0x3105, 0x312D}, {0x3131, 0x318E}, {0x3190, 0x31BA},
+{0x31C0, 0x31E3}, {0x31F0, 0x321E}, {0x3220, 0x3247}, {0x3250, 0x32FE},
+{0x3300, 0x4DBF}, {0x4E00, 0xA48C}, {0xA490, 0xA4C6}, {0xA960, 0xA97C},
+{0xAC00, 0xD7A3}, {0xF900, 0xFAFF}, {0xFE10, 0xFE19}, {0xFE30, 0xFE52},
+{0xFE54, 0xFE66}, {0xFE68, 0xFE6B}, {0xFF01, 0xFF60}, {0xFFE0, 0xFFE6},
+{0x1B000, 0x1B001}, {0x1F200, 0x1F202}, {0x1F210, 0x1F23A}, {0x1F240,
+0x1F248}, {0x1F250, 0x1F251}, {0x2, 0x2FFFD}, {0x3, 0x3FFFD}
+  };
+
+  if (bsearch ((long *)c, wide, (sizeof (wide) / sizeof ((wide)[0])), sizeof 
wide[0],
+  interval_compare))
+return true;
+
+  return false;
+}
diff --git a/src/shared/utf8.h b/src/shared/utf8.h
index f805ea6..f1be180 100644
--- a/src/shared/utf8.

[systemd-devel] [PATCH 1/2] util, utf8: make ellipsize and ellipsize_mem take into account multi-byte characters

2013-09-20 Thread Shawn Landden

rename old versions to ascii_*

Do not take into account zerowidth characters, but do consider double-wide 
characters.
Import needed utf8 helper code from glib.

v3: rebase ontop of utf8 restructuring work
---
 TODO  |   4 --
 src/shared/utf8.c | 120 ++
 src/shared/utf8.h | 100 +
 src/shared/util.c |  70 ++-
 src/shared/util.h |   3 ++
 5 files changed, 292 insertions(+), 5 deletions(-)

diff --git a/TODO b/TODO
index 01bc993..c54e986 100644
--- a/TODO
+++ b/TODO
@@ -19,10 +19,6 @@ Bugfixes:
 
 * properly handle .mount unit state tracking when two mount points are stacked 
one on top of another on the exact same mount point.
 
-* ellipsize_mem must take into account multi-byte unicode characters, and
-  - make the resulting line the requested number of *characters*, not *bytes*,
-  - avoid truncuating multi-byte sequences in the middle.
-
 * When we detect invalid UTF-8, we cant't use it in an error message:
   log...("Path is not UTF-8 clean, ignoring assignment: %s", rvalue);
 
diff --git a/src/shared/utf8.c b/src/shared/utf8.c
index c3d97cc..2b07265 100644
--- a/src/shared/utf8.c
+++ b/src/shared/utf8.c
@@ -22,6 +22,11 @@
 /* This file is based on the GLIB utf8 validation functions. The
  * original license text follows. */
 
+/* gunicode.h - Unicode manipulation functions
+ *
+ *  Copyright (C) 1999, 2000 Tom Tromey
+ *  Copyright 2000, 2005 Red Hat, Inc.
+ */
 /* gutf8.c - Operations on UTF-8 strings.
  *
  * Copyright (C) 1999 Tom Tromey
@@ -285,3 +290,118 @@ int utf8_encoded_valid_unichar(const char *str) {
 
 return len;
 }
+
+/**
+ * g_utf8_prev_char:
+ * @p: a pointer to a position within a UTF-8 encoded string
+ *
+ * Finds the previous UTF-8 character in the string before @p.
+ *
+ * @p does not have to be at the beginning of a UTF-8 character. No check
+ * is made to see if the character found is actually valid other than
+ * it starts with an appropriate byte. If @p might be the first
+ * character of the string, you must use g_utf8_find_prev_char() instead.
+ *
+ * Return value: a pointer to the found character.
+ **/
+char *
+utf8_prev_char (const char *p)
+{
+  while (1)
+{
+  p--;
+  if ((*p & 0xc0) != 0x80)
+return (char *)p;
+}
+}
+
+/**
+ * g_utf8_get_char:
+ * @p: a pointer to Unicode character encoded as UTF-8
+ *
+ * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
+ * If @p does not point to a valid UTF-8 encoded character, results are
+ * undefined. If you are not sure that the bytes are complete
+ * valid Unicode characters, you should use g_utf8_get_char_validated()
+ * instead.
+ *
+ * Return value: the resulting character
+ **/
+unichar
+utf8_get_char (const char *p)
+{
+  int i, mask = 0, len;
+  unichar result;
+  unsigned char c = (unsigned char) *p;
+
+  UTF8_COMPUTE (c, mask, len);
+  if (len == -1)
+return (unichar)-1;
+  UTF8_GET (result, p, i, mask, len);
+
+  return result;
+}
+
+struct Interval
+{
+  unichar start, end;
+};
+
+static int
+interval_compare (const void *key, const void *elt)
+{
+  unichar c = (unichar) (long) (key);
+  struct Interval *interval = (struct Interval *)elt;
+
+  if (c < interval->start)
+return -1;
+  if (c > interval->end)
+return +1;
+
+  return 0;
+}
+
+/*
+ * NOTE:
+ *
+ * The tables for g_unichar_iswide() and g_unichar_iswide_cjk() are
+ * generated from the Unicode Character Database's file
+ * extracted/DerivedEastAsianWidth.txt using the gen-iswide-table.py
+ * in this way:
+ *
+ *   ./gen-iswide-table.py < path/to/ucd/extracted/DerivedEastAsianWidth.txt | 
fmt
+ *
+ * Last update for Unicode 6.0.
+ */
+
+/**
+ * g_unichar_iswide:
+ * @c: a Unicode character
+ *
+ * Determines if a character is typically rendered in a double-width
+ * cell.
+ *
+ * Return value: %TRUE if the character is wide
+ **/
+bool
+unichar_iswide (unichar c)
+{
+  /* See NOTE earlier for how to update this table. */
+  static const struct Interval wide[] = {
+{0x1100, 0x115F}, {0x2329, 0x232A}, {0x2E80, 0x2E99}, {0x2E9B, 0x2EF3},
+{0x2F00, 0x2FD5}, {0x2FF0, 0x2FFB}, {0x3000, 0x303E}, {0x3041, 0x3096},
+{0x3099, 0x30FF}, {0x3105, 0x312D}, {0x3131, 0x318E}, {0x3190, 0x31BA},
+{0x31C0, 0x31E3}, {0x31F0, 0x321E}, {0x3220, 0x3247}, {0x3250, 0x32FE},
+{0x3300, 0x4DBF}, {0x4E00, 0xA48C}, {0xA490, 0xA4C6}, {0xA960, 0xA97C},
+{0xAC00, 0xD7A3}, {0xF900, 0xFAFF}, {0xFE10, 0xFE19}, {0xFE30, 0xFE52},
+{0xFE54, 0xFE66}, {0xFE68, 0xFE6B}, {0xFF01, 0xFF60}, {0xFFE0, 0xFFE6},
+{0x1B000, 0x1B001}, {0x1F200, 0x1F202}, {0x1F210, 0x1F23A}, {0x1F240,
+0x1F248}, {0x1F250, 0x1F251}, {0x2, 0x2FFFD}, {0x3, 0x3FFFD}
+  };
+
+  if (bsearch ((long *)c, wide, (sizeof (wide) / sizeof ((wide)[0])), sizeof 
wide[0],
+   interval_compare))
+return true;
+
+  return false;
+}
diff --git a/src/shared/utf8.h b/src/shared/utf8.h
index 96a

[systemd-devel] [PATCH 1/2] util, utf8: make ellipsize and ellipsize_mem take into account multi-byte characters

[systemd-devel] [PATCH 1/2] util, utf8: make ellipsize and ellipsize_mem take into account multi-byte characters

[systemd-devel] [PATCH 1/2] util, utf8: make ellipsize and ellipsize_mem take into account multi-byte characters

3 matches

Site Navigation

Mail list logo

Footer information