Re: [systemd-devel] [PATCH v2 1/2] utf8: intruduce utf8_escape_non_printable

2014-12-03 Thread Lennart Poettering
On Wed, 19.11.14 12:35, David Herrmann (dh.herrm...@gmail.com) wrote:

  +} else {
  +if ((*str  ' ') || (*str = 127)) {
  +*(s++) = '\\';
  +*(s++) = 'x';
  +*(s++) = hexchar((int) *str  4);
  +*(s++) = hexchar((int) *str);
  +} else
  +*(s++) = *str;
  +
  +str += 1;
 
 This part is wrong. You cannot rely on ``*str'' to be the correct
 Unicode value for the character. utf8_is_printable() returns false
 also for multi-byte UTF8 characters. By taking it unmodified, it will
 include the UTF8 management bits, which we really don't want here.
 
 If you really want this, I'd prefer if you decode each UTF8 character,
 and if it is non-printable you print \uABCD or \UABCDWXYZ (like
 C++ does) as a 6-byte or 10-byte sequence. Other characters are just
 printed normally.

I have now committed the proposed patch but then changed the code to
iterate through all bytes of the unichar and escape that
individually. This form of escaping should be safe and be compatible
with C-style escaping (which \u isn't really...). Hope this makes sense.

Lennart

-- 
Lennart Poettering, Red Hat
___
systemd-devel mailing list
systemd-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/systemd-devel


Re: [systemd-devel] [PATCH v2 1/2] utf8: intruduce utf8_escape_non_printable

2014-11-19 Thread David Herrmann
Hi

On Wed, Nov 12, 2014 at 11:49 AM, WaLyong Cho walyong@samsung.com wrote:
 ---
  src/shared/utf8.c| 39 +++
  src/shared/utf8.h|  1 +
  src/test/test-utf8.c | 25 +
  3 files changed, 65 insertions(+)

 diff --git a/src/shared/utf8.c b/src/shared/utf8.c
 index 8702ceb..0b6c38e 100644
 --- a/src/shared/utf8.c
 +++ b/src/shared/utf8.c
 @@ -212,6 +212,45 @@ char *utf8_escape_invalid(const char *str) {
  return p;
  }

 +char *utf8_escape_non_printable(const char *str) {
 +char *p, *s;
 +
 +assert(str);
 +
 +p = s = malloc(strlen(str) * 4 + 1);
 +if (!p)
 +return NULL;
 +
 +while (*str) {
 +int len;
 +
 +len = utf8_encoded_valid_unichar(str);
 +if (len  0) {
 +if (utf8_is_printable(str, len)) {
 +s = mempcpy(s, str, len);
 +str += len;
 +} else {
 +if ((*str  ' ') || (*str = 127)) {
 +*(s++) = '\\';
 +*(s++) = 'x';
 +*(s++) = hexchar((int) *str  4);
 +*(s++) = hexchar((int) *str);
 +} else
 +*(s++) = *str;
 +
 +str += 1;

This part is wrong. You cannot rely on ``*str'' to be the correct
Unicode value for the character. utf8_is_printable() returns false
also for multi-byte UTF8 characters. By taking it unmodified, it will
include the UTF8 management bits, which we really don't want here.

If you really want this, I'd prefer if you decode each UTF8 character,
and if it is non-printable you print \uABCD or \UABCDWXYZ (like
C++ does) as a 6-byte or 10-byte sequence. Other characters are just
printed normally.

Thanks
David

 +}
 +} else {
 +s = mempcpy(s, UTF8_REPLACEMENT_CHARACTER, 
 strlen(UTF8_REPLACEMENT_CHARACTER));
 +str += 1;
 +}
 +}
 +
 +*s = '\0';
 +
 +return p;
 +}
 +
  char *ascii_is_valid(const char *str) {
  const char *p;

 diff --git a/src/shared/utf8.h b/src/shared/utf8.h
 index c087995..1fe1a35 100644
 --- a/src/shared/utf8.h
 +++ b/src/shared/utf8.h
 @@ -30,6 +30,7 @@
  const char *utf8_is_valid(const char *s) _pure_;
  char *ascii_is_valid(const char *s) _pure_;
  char *utf8_escape_invalid(const char *s);
 +char *utf8_escape_non_printable(const char *str);

  bool utf8_is_printable_newline(const char* str, size_t length, bool newline) 
 _pure_;
  _pure_ static inline bool utf8_is_printable(const char* str, size_t length) {
 diff --git a/src/test/test-utf8.c b/src/test/test-utf8.c
 index b7d988f..6dde63c 100644
 --- a/src/test/test-utf8.c
 +++ b/src/test/test-utf8.c
 @@ -66,12 +66,37 @@ static void test_utf8_escaping(void) {
  assert_se(utf8_is_valid(p3));
  }

 +static void test_utf8_escaping_printable(void) {
 +_cleanup_free_ char *p1, *p2, *p3, *p4, *p5;
 +
 +p1 = utf8_escape_non_printable(goo goo goo);
 +puts(p1);
 +assert_se(utf8_is_valid(p1));
 +
 +p2 = utf8_escape_non_printable(\341\204\341\204);
 +puts(p2);
 +assert_se(utf8_is_valid(p2));
 +
 +p3 = utf8_escape_non_printable(\341\204);
 +puts(p3);
 +assert_se(utf8_is_valid(p3));
 +
 +p4 = 
 utf8_escape_non_printable(ąę\n가너도루\n1234\n\341\204\341\204\n\001 \019\20\a);
 +puts(p4);
 +assert_se(utf8_is_valid(p4));
 +
 +p5 = utf8_escape_non_printable(\001 \019\20\a);
 +puts(p5);
 +assert_se(utf8_is_valid(p5));
 +}
 +
  int main(int argc, char *argv[]) {
  test_utf8_is_valid();
  test_utf8_is_printable();
  test_ascii_is_valid();
  test_utf8_encoded_valid_unichar();
  test_utf8_escaping();
 +test_utf8_escaping_printable();

  return 0;
  }
 --
 1.9.3

 ___
 systemd-devel mailing list
 systemd-devel@lists.freedesktop.org
 http://lists.freedesktop.org/mailman/listinfo/systemd-devel
___
systemd-devel mailing list
systemd-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/systemd-devel


[systemd-devel] [PATCH v2 1/2] utf8: intruduce utf8_escape_non_printable

2014-11-12 Thread WaLyong Cho
---
 src/shared/utf8.c| 39 +++
 src/shared/utf8.h|  1 +
 src/test/test-utf8.c | 25 +
 3 files changed, 65 insertions(+)

diff --git a/src/shared/utf8.c b/src/shared/utf8.c
index 8702ceb..0b6c38e 100644
--- a/src/shared/utf8.c
+++ b/src/shared/utf8.c
@@ -212,6 +212,45 @@ char *utf8_escape_invalid(const char *str) {
 return p;
 }
 
+char *utf8_escape_non_printable(const char *str) {
+char *p, *s;
+
+assert(str);
+
+p = s = malloc(strlen(str) * 4 + 1);
+if (!p)
+return NULL;
+
+while (*str) {
+int len;
+
+len = utf8_encoded_valid_unichar(str);
+if (len  0) {
+if (utf8_is_printable(str, len)) {
+s = mempcpy(s, str, len);
+str += len;
+} else {
+if ((*str  ' ') || (*str = 127)) {
+*(s++) = '\\';
+*(s++) = 'x';
+*(s++) = hexchar((int) *str  4);
+*(s++) = hexchar((int) *str);
+} else
+*(s++) = *str;
+
+str += 1;
+}
+} else {
+s = mempcpy(s, UTF8_REPLACEMENT_CHARACTER, 
strlen(UTF8_REPLACEMENT_CHARACTER));
+str += 1;
+}
+}
+
+*s = '\0';
+
+return p;
+}
+
 char *ascii_is_valid(const char *str) {
 const char *p;
 
diff --git a/src/shared/utf8.h b/src/shared/utf8.h
index c087995..1fe1a35 100644
--- a/src/shared/utf8.h
+++ b/src/shared/utf8.h
@@ -30,6 +30,7 @@
 const char *utf8_is_valid(const char *s) _pure_;
 char *ascii_is_valid(const char *s) _pure_;
 char *utf8_escape_invalid(const char *s);
+char *utf8_escape_non_printable(const char *str);
 
 bool utf8_is_printable_newline(const char* str, size_t length, bool newline) 
_pure_;
 _pure_ static inline bool utf8_is_printable(const char* str, size_t length) {
diff --git a/src/test/test-utf8.c b/src/test/test-utf8.c
index b7d988f..6dde63c 100644
--- a/src/test/test-utf8.c
+++ b/src/test/test-utf8.c
@@ -66,12 +66,37 @@ static void test_utf8_escaping(void) {
 assert_se(utf8_is_valid(p3));
 }
 
+static void test_utf8_escaping_printable(void) {
+_cleanup_free_ char *p1, *p2, *p3, *p4, *p5;
+
+p1 = utf8_escape_non_printable(goo goo goo);
+puts(p1);
+assert_se(utf8_is_valid(p1));
+
+p2 = utf8_escape_non_printable(\341\204\341\204);
+puts(p2);
+assert_se(utf8_is_valid(p2));
+
+p3 = utf8_escape_non_printable(\341\204);
+puts(p3);
+assert_se(utf8_is_valid(p3));
+
+p4 = utf8_escape_non_printable(ąę\n가너도루\n1234\n\341\204\341\204\n\001 
\019\20\a);
+puts(p4);
+assert_se(utf8_is_valid(p4));
+
+p5 = utf8_escape_non_printable(\001 \019\20\a);
+puts(p5);
+assert_se(utf8_is_valid(p5));
+}
+
 int main(int argc, char *argv[]) {
 test_utf8_is_valid();
 test_utf8_is_printable();
 test_ascii_is_valid();
 test_utf8_encoded_valid_unichar();
 test_utf8_escaping();
+test_utf8_escaping_printable();
 
 return 0;
 }
-- 
1.9.3

___
systemd-devel mailing list
systemd-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/systemd-devel


Re: [systemd-devel] [PATCH v2 1/2] utf8: intruduce utf8_escape_non_printable

2014-11-06 Thread Lennart Poettering
On Mon, 03.11.14 15:00, WaLyong Cho (walyong@samsung.com) wrote:

 ---
  src/shared/utf8.c| 87 
 
  src/shared/utf8.h|  1 +
  src/test/test-utf8.c | 30 ++
  3 files changed, 118 insertions(+)
 
 diff --git a/src/shared/utf8.c b/src/shared/utf8.c
 index 9353559..5245604 100644
 --- a/src/shared/utf8.c
 +++ b/src/shared/utf8.c
 @@ -210,6 +210,93 @@ char *utf8_escape_invalid(const char *str) {
  return p;
  }
  
 +char *utf8_escape_non_printable(const char *str) {
 +char *p, *s;
 +
 +assert(str);
 +
 +p = s = malloc(strlen(str) * 4 + 1);
 +if (!p)
 +return NULL;
 +
 +while (*str) {
 +int len;
 +
 +len = utf8_encoded_valid_unichar(str);
 +if (len  0) {
 +if (utf8_is_printable(str, len)) {
 +s = mempcpy(s, str, len);
 +str += len;
 +} else {
 +switch (*str) {
 +
 +case '\a':
 +*(s++) = '\\';
 +*(s++) = 'a';
 +break;
 +case '\b':
 +*(s++) = '\\';
 +*(s++) = 'b';
 +break;
 +case '\f':
 +*(s++) = '\\';
 +*(s++) = 'f';
 +break;
 +case '\n':
 +*(s++) = '\\';
 +*(s++) = 'n';
 +break;
 +case '\r':
 +*(s++) = '\\';
 +*(s++) = 'r';
 +break;
 +case '\t':
 +*(s++) = '\\';
 +*(s++) = 't';
 +break;
 +case '\v':
 +*(s++) = '\\';
 +*(s++) = 'v';
 +break;
 +case '\\':
 +*(s++) = '\\';
 +*(s++) = '\\';
 +break;
 +case '':
 +*(s++) = '\\';
 +*(s++) = '';
 +break;
 +case '\'':
 +*(s++) = '\\';
 +*(s++) = '\'';
 +break;
 +
 +default:
 +/* For special chars we prefer octal 
 over
 + * hexadecimal encoding, simply 
 because glib's
 + * g_strescape() does the same */
 +if ((*str  ' ') || (*str = 127)) {
 +*(s++) = '\\';
 +*(s++) = octchar((unsigned 
 char) *str  6);
 +*(s++) = octchar((unsigned 
 char) *str  3);
 +*(s++) = octchar((unsigned 
 char) *str);
 +} else
 +*(s++) = *str;
 +break;
 +}

Hmm, do we really want the C style of escaping here? wouldn't be the
\x style of escaping more appropriate here?

If the C style of escaping is appropriate, then we should find a way
to unify this case block between cescape() and this call, i.e. split
it out in a new call, maybe called:

   char* cescape_one(char c, char *buf);

That call would take the char to escape, plus a pointer to the buf
where to place the escaped version, and return a pointer that points
into the buffer right after where the escaped version was written. 

That way cescape() and your new call could call it like this:

 s = cescape_one(*str, s);

To escape one character. If you follow what I mean?

Lennart

-- 
Lennart Poettering, Red Hat
___
systemd-devel mailing list
systemd-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/systemd-devel


[systemd-devel] [PATCH v2 1/2] utf8: intruduce utf8_escape_non_printable

2014-11-02 Thread WaLyong Cho
---
 src/shared/utf8.c| 87 
 src/shared/utf8.h|  1 +
 src/test/test-utf8.c | 30 ++
 3 files changed, 118 insertions(+)

diff --git a/src/shared/utf8.c b/src/shared/utf8.c
index 9353559..5245604 100644
--- a/src/shared/utf8.c
+++ b/src/shared/utf8.c
@@ -210,6 +210,93 @@ char *utf8_escape_invalid(const char *str) {
 return p;
 }
 
+char *utf8_escape_non_printable(const char *str) {
+char *p, *s;
+
+assert(str);
+
+p = s = malloc(strlen(str) * 4 + 1);
+if (!p)
+return NULL;
+
+while (*str) {
+int len;
+
+len = utf8_encoded_valid_unichar(str);
+if (len  0) {
+if (utf8_is_printable(str, len)) {
+s = mempcpy(s, str, len);
+str += len;
+} else {
+switch (*str) {
+
+case '\a':
+*(s++) = '\\';
+*(s++) = 'a';
+break;
+case '\b':
+*(s++) = '\\';
+*(s++) = 'b';
+break;
+case '\f':
+*(s++) = '\\';
+*(s++) = 'f';
+break;
+case '\n':
+*(s++) = '\\';
+*(s++) = 'n';
+break;
+case '\r':
+*(s++) = '\\';
+*(s++) = 'r';
+break;
+case '\t':
+*(s++) = '\\';
+*(s++) = 't';
+break;
+case '\v':
+*(s++) = '\\';
+*(s++) = 'v';
+break;
+case '\\':
+*(s++) = '\\';
+*(s++) = '\\';
+break;
+case '':
+*(s++) = '\\';
+*(s++) = '';
+break;
+case '\'':
+*(s++) = '\\';
+*(s++) = '\'';
+break;
+
+default:
+/* For special chars we prefer octal 
over
+ * hexadecimal encoding, simply 
because glib's
+ * g_strescape() does the same */
+if ((*str  ' ') || (*str = 127)) {
+*(s++) = '\\';
+*(s++) = octchar((unsigned 
char) *str  6);
+*(s++) = octchar((unsigned 
char) *str  3);
+*(s++) = octchar((unsigned 
char) *str);
+} else
+*(s++) = *str;
+break;
+}
+str += 1;
+}
+} else {
+s = mempcpy(s, UTF8_REPLACEMENT_CHARACTER, 
strlen(UTF8_REPLACEMENT_CHARACTER));
+str += 1;
+}
+}
+
+*s = '\0';
+
+return p;
+}
+
 char *ascii_is_valid(const char *str) {
 const char *p;
 
diff --git a/src/shared/utf8.h b/src/shared/utf8.h
index c087995..1fe1a35 100644
--- a/src/shared/utf8.h
+++ b/src/shared/utf8.h
@@ -30,6 +30,7 @@
 const char *utf8_is_valid(const char *s) _pure_;
 char *ascii_is_valid(const char *s) _pure_;
 char *utf8_escape_invalid(const char *s);
+char *utf8_escape_non_printable(const char *str);
 
 bool utf8_is_printable_newline(const char* str, size_t length, bool newline) 
_pure_;
 _pure_ static inline bool utf8_is_printable(const char* str, size_t length) {
diff --git a/src/test/test-utf8.c b/src/test/test-utf8.c
index b7d988f..fb27fe5 100644
--- a/src/test/test-utf8.c
+++ b/src/test/test-utf8.c
@@ -66,12 +66,42 @@ static void