Re: [systemd-devel] [PATCH v2 1/2] utf8: intruduce utf8_escape_non_printable
On Wed, 19.11.14 12:35, David Herrmann (dh.herrm...@gmail.com) wrote: +} else { +if ((*str ' ') || (*str = 127)) { +*(s++) = '\\'; +*(s++) = 'x'; +*(s++) = hexchar((int) *str 4); +*(s++) = hexchar((int) *str); +} else +*(s++) = *str; + +str += 1; This part is wrong. You cannot rely on ``*str'' to be the correct Unicode value for the character. utf8_is_printable() returns false also for multi-byte UTF8 characters. By taking it unmodified, it will include the UTF8 management bits, which we really don't want here. If you really want this, I'd prefer if you decode each UTF8 character, and if it is non-printable you print \uABCD or \UABCDWXYZ (like C++ does) as a 6-byte or 10-byte sequence. Other characters are just printed normally. I have now committed the proposed patch but then changed the code to iterate through all bytes of the unichar and escape that individually. This form of escaping should be safe and be compatible with C-style escaping (which \u isn't really...). Hope this makes sense. Lennart -- Lennart Poettering, Red Hat ___ systemd-devel mailing list systemd-devel@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/systemd-devel
Re: [systemd-devel] [PATCH v2 1/2] utf8: intruduce utf8_escape_non_printable
Hi On Wed, Nov 12, 2014 at 11:49 AM, WaLyong Cho walyong@samsung.com wrote: --- src/shared/utf8.c| 39 +++ src/shared/utf8.h| 1 + src/test/test-utf8.c | 25 + 3 files changed, 65 insertions(+) diff --git a/src/shared/utf8.c b/src/shared/utf8.c index 8702ceb..0b6c38e 100644 --- a/src/shared/utf8.c +++ b/src/shared/utf8.c @@ -212,6 +212,45 @@ char *utf8_escape_invalid(const char *str) { return p; } +char *utf8_escape_non_printable(const char *str) { +char *p, *s; + +assert(str); + +p = s = malloc(strlen(str) * 4 + 1); +if (!p) +return NULL; + +while (*str) { +int len; + +len = utf8_encoded_valid_unichar(str); +if (len 0) { +if (utf8_is_printable(str, len)) { +s = mempcpy(s, str, len); +str += len; +} else { +if ((*str ' ') || (*str = 127)) { +*(s++) = '\\'; +*(s++) = 'x'; +*(s++) = hexchar((int) *str 4); +*(s++) = hexchar((int) *str); +} else +*(s++) = *str; + +str += 1; This part is wrong. You cannot rely on ``*str'' to be the correct Unicode value for the character. utf8_is_printable() returns false also for multi-byte UTF8 characters. By taking it unmodified, it will include the UTF8 management bits, which we really don't want here. If you really want this, I'd prefer if you decode each UTF8 character, and if it is non-printable you print \uABCD or \UABCDWXYZ (like C++ does) as a 6-byte or 10-byte sequence. Other characters are just printed normally. Thanks David +} +} else { +s = mempcpy(s, UTF8_REPLACEMENT_CHARACTER, strlen(UTF8_REPLACEMENT_CHARACTER)); +str += 1; +} +} + +*s = '\0'; + +return p; +} + char *ascii_is_valid(const char *str) { const char *p; diff --git a/src/shared/utf8.h b/src/shared/utf8.h index c087995..1fe1a35 100644 --- a/src/shared/utf8.h +++ b/src/shared/utf8.h @@ -30,6 +30,7 @@ const char *utf8_is_valid(const char *s) _pure_; char *ascii_is_valid(const char *s) _pure_; char *utf8_escape_invalid(const char *s); +char *utf8_escape_non_printable(const char *str); bool utf8_is_printable_newline(const char* str, size_t length, bool newline) _pure_; _pure_ static inline bool utf8_is_printable(const char* str, size_t length) { diff --git a/src/test/test-utf8.c b/src/test/test-utf8.c index b7d988f..6dde63c 100644 --- a/src/test/test-utf8.c +++ b/src/test/test-utf8.c @@ -66,12 +66,37 @@ static void test_utf8_escaping(void) { assert_se(utf8_is_valid(p3)); } +static void test_utf8_escaping_printable(void) { +_cleanup_free_ char *p1, *p2, *p3, *p4, *p5; + +p1 = utf8_escape_non_printable(goo goo goo); +puts(p1); +assert_se(utf8_is_valid(p1)); + +p2 = utf8_escape_non_printable(\341\204\341\204); +puts(p2); +assert_se(utf8_is_valid(p2)); + +p3 = utf8_escape_non_printable(\341\204); +puts(p3); +assert_se(utf8_is_valid(p3)); + +p4 = utf8_escape_non_printable(ąę\n가너도루\n1234\n\341\204\341\204\n\001 \019\20\a); +puts(p4); +assert_se(utf8_is_valid(p4)); + +p5 = utf8_escape_non_printable(\001 \019\20\a); +puts(p5); +assert_se(utf8_is_valid(p5)); +} + int main(int argc, char *argv[]) { test_utf8_is_valid(); test_utf8_is_printable(); test_ascii_is_valid(); test_utf8_encoded_valid_unichar(); test_utf8_escaping(); +test_utf8_escaping_printable(); return 0; } -- 1.9.3 ___ systemd-devel mailing list systemd-devel@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/systemd-devel ___ systemd-devel mailing list systemd-devel@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/systemd-devel
[systemd-devel] [PATCH v2 1/2] utf8: intruduce utf8_escape_non_printable
--- src/shared/utf8.c| 39 +++ src/shared/utf8.h| 1 + src/test/test-utf8.c | 25 + 3 files changed, 65 insertions(+) diff --git a/src/shared/utf8.c b/src/shared/utf8.c index 8702ceb..0b6c38e 100644 --- a/src/shared/utf8.c +++ b/src/shared/utf8.c @@ -212,6 +212,45 @@ char *utf8_escape_invalid(const char *str) { return p; } +char *utf8_escape_non_printable(const char *str) { +char *p, *s; + +assert(str); + +p = s = malloc(strlen(str) * 4 + 1); +if (!p) +return NULL; + +while (*str) { +int len; + +len = utf8_encoded_valid_unichar(str); +if (len 0) { +if (utf8_is_printable(str, len)) { +s = mempcpy(s, str, len); +str += len; +} else { +if ((*str ' ') || (*str = 127)) { +*(s++) = '\\'; +*(s++) = 'x'; +*(s++) = hexchar((int) *str 4); +*(s++) = hexchar((int) *str); +} else +*(s++) = *str; + +str += 1; +} +} else { +s = mempcpy(s, UTF8_REPLACEMENT_CHARACTER, strlen(UTF8_REPLACEMENT_CHARACTER)); +str += 1; +} +} + +*s = '\0'; + +return p; +} + char *ascii_is_valid(const char *str) { const char *p; diff --git a/src/shared/utf8.h b/src/shared/utf8.h index c087995..1fe1a35 100644 --- a/src/shared/utf8.h +++ b/src/shared/utf8.h @@ -30,6 +30,7 @@ const char *utf8_is_valid(const char *s) _pure_; char *ascii_is_valid(const char *s) _pure_; char *utf8_escape_invalid(const char *s); +char *utf8_escape_non_printable(const char *str); bool utf8_is_printable_newline(const char* str, size_t length, bool newline) _pure_; _pure_ static inline bool utf8_is_printable(const char* str, size_t length) { diff --git a/src/test/test-utf8.c b/src/test/test-utf8.c index b7d988f..6dde63c 100644 --- a/src/test/test-utf8.c +++ b/src/test/test-utf8.c @@ -66,12 +66,37 @@ static void test_utf8_escaping(void) { assert_se(utf8_is_valid(p3)); } +static void test_utf8_escaping_printable(void) { +_cleanup_free_ char *p1, *p2, *p3, *p4, *p5; + +p1 = utf8_escape_non_printable(goo goo goo); +puts(p1); +assert_se(utf8_is_valid(p1)); + +p2 = utf8_escape_non_printable(\341\204\341\204); +puts(p2); +assert_se(utf8_is_valid(p2)); + +p3 = utf8_escape_non_printable(\341\204); +puts(p3); +assert_se(utf8_is_valid(p3)); + +p4 = utf8_escape_non_printable(ąę\n가너도루\n1234\n\341\204\341\204\n\001 \019\20\a); +puts(p4); +assert_se(utf8_is_valid(p4)); + +p5 = utf8_escape_non_printable(\001 \019\20\a); +puts(p5); +assert_se(utf8_is_valid(p5)); +} + int main(int argc, char *argv[]) { test_utf8_is_valid(); test_utf8_is_printable(); test_ascii_is_valid(); test_utf8_encoded_valid_unichar(); test_utf8_escaping(); +test_utf8_escaping_printable(); return 0; } -- 1.9.3 ___ systemd-devel mailing list systemd-devel@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/systemd-devel
Re: [systemd-devel] [PATCH v2 1/2] utf8: intruduce utf8_escape_non_printable
On Mon, 03.11.14 15:00, WaLyong Cho (walyong@samsung.com) wrote: --- src/shared/utf8.c| 87 src/shared/utf8.h| 1 + src/test/test-utf8.c | 30 ++ 3 files changed, 118 insertions(+) diff --git a/src/shared/utf8.c b/src/shared/utf8.c index 9353559..5245604 100644 --- a/src/shared/utf8.c +++ b/src/shared/utf8.c @@ -210,6 +210,93 @@ char *utf8_escape_invalid(const char *str) { return p; } +char *utf8_escape_non_printable(const char *str) { +char *p, *s; + +assert(str); + +p = s = malloc(strlen(str) * 4 + 1); +if (!p) +return NULL; + +while (*str) { +int len; + +len = utf8_encoded_valid_unichar(str); +if (len 0) { +if (utf8_is_printable(str, len)) { +s = mempcpy(s, str, len); +str += len; +} else { +switch (*str) { + +case '\a': +*(s++) = '\\'; +*(s++) = 'a'; +break; +case '\b': +*(s++) = '\\'; +*(s++) = 'b'; +break; +case '\f': +*(s++) = '\\'; +*(s++) = 'f'; +break; +case '\n': +*(s++) = '\\'; +*(s++) = 'n'; +break; +case '\r': +*(s++) = '\\'; +*(s++) = 'r'; +break; +case '\t': +*(s++) = '\\'; +*(s++) = 't'; +break; +case '\v': +*(s++) = '\\'; +*(s++) = 'v'; +break; +case '\\': +*(s++) = '\\'; +*(s++) = '\\'; +break; +case '': +*(s++) = '\\'; +*(s++) = ''; +break; +case '\'': +*(s++) = '\\'; +*(s++) = '\''; +break; + +default: +/* For special chars we prefer octal over + * hexadecimal encoding, simply because glib's + * g_strescape() does the same */ +if ((*str ' ') || (*str = 127)) { +*(s++) = '\\'; +*(s++) = octchar((unsigned char) *str 6); +*(s++) = octchar((unsigned char) *str 3); +*(s++) = octchar((unsigned char) *str); +} else +*(s++) = *str; +break; +} Hmm, do we really want the C style of escaping here? wouldn't be the \x style of escaping more appropriate here? If the C style of escaping is appropriate, then we should find a way to unify this case block between cescape() and this call, i.e. split it out in a new call, maybe called: char* cescape_one(char c, char *buf); That call would take the char to escape, plus a pointer to the buf where to place the escaped version, and return a pointer that points into the buffer right after where the escaped version was written. That way cescape() and your new call could call it like this: s = cescape_one(*str, s); To escape one character. If you follow what I mean? Lennart -- Lennart Poettering, Red Hat ___ systemd-devel mailing list systemd-devel@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/systemd-devel
[systemd-devel] [PATCH v2 1/2] utf8: intruduce utf8_escape_non_printable
--- src/shared/utf8.c| 87 src/shared/utf8.h| 1 + src/test/test-utf8.c | 30 ++ 3 files changed, 118 insertions(+) diff --git a/src/shared/utf8.c b/src/shared/utf8.c index 9353559..5245604 100644 --- a/src/shared/utf8.c +++ b/src/shared/utf8.c @@ -210,6 +210,93 @@ char *utf8_escape_invalid(const char *str) { return p; } +char *utf8_escape_non_printable(const char *str) { +char *p, *s; + +assert(str); + +p = s = malloc(strlen(str) * 4 + 1); +if (!p) +return NULL; + +while (*str) { +int len; + +len = utf8_encoded_valid_unichar(str); +if (len 0) { +if (utf8_is_printable(str, len)) { +s = mempcpy(s, str, len); +str += len; +} else { +switch (*str) { + +case '\a': +*(s++) = '\\'; +*(s++) = 'a'; +break; +case '\b': +*(s++) = '\\'; +*(s++) = 'b'; +break; +case '\f': +*(s++) = '\\'; +*(s++) = 'f'; +break; +case '\n': +*(s++) = '\\'; +*(s++) = 'n'; +break; +case '\r': +*(s++) = '\\'; +*(s++) = 'r'; +break; +case '\t': +*(s++) = '\\'; +*(s++) = 't'; +break; +case '\v': +*(s++) = '\\'; +*(s++) = 'v'; +break; +case '\\': +*(s++) = '\\'; +*(s++) = '\\'; +break; +case '': +*(s++) = '\\'; +*(s++) = ''; +break; +case '\'': +*(s++) = '\\'; +*(s++) = '\''; +break; + +default: +/* For special chars we prefer octal over + * hexadecimal encoding, simply because glib's + * g_strescape() does the same */ +if ((*str ' ') || (*str = 127)) { +*(s++) = '\\'; +*(s++) = octchar((unsigned char) *str 6); +*(s++) = octchar((unsigned char) *str 3); +*(s++) = octchar((unsigned char) *str); +} else +*(s++) = *str; +break; +} +str += 1; +} +} else { +s = mempcpy(s, UTF8_REPLACEMENT_CHARACTER, strlen(UTF8_REPLACEMENT_CHARACTER)); +str += 1; +} +} + +*s = '\0'; + +return p; +} + char *ascii_is_valid(const char *str) { const char *p; diff --git a/src/shared/utf8.h b/src/shared/utf8.h index c087995..1fe1a35 100644 --- a/src/shared/utf8.h +++ b/src/shared/utf8.h @@ -30,6 +30,7 @@ const char *utf8_is_valid(const char *s) _pure_; char *ascii_is_valid(const char *s) _pure_; char *utf8_escape_invalid(const char *s); +char *utf8_escape_non_printable(const char *str); bool utf8_is_printable_newline(const char* str, size_t length, bool newline) _pure_; _pure_ static inline bool utf8_is_printable(const char* str, size_t length) { diff --git a/src/test/test-utf8.c b/src/test/test-utf8.c index b7d988f..fb27fe5 100644 --- a/src/test/test-utf8.c +++ b/src/test/test-utf8.c @@ -66,12 +66,42 @@ static void