Hi, Jeffrey Walton wrote in <https://lists.gnu.org/archive/html/bug-libunistring/2019-03/msg00002.html>: > I used CFLAGS="-DNDEBUG -g2 -O1 -fsanitize=undefined". I built the > library, then ran a 'make test V=1'. The output presented to the user > looks OK. However, it looks like UBsan found some undefined behavior: > > libunistring-0.9.10$ grep -IR 'runtime error' > tests/test-u8-width-linebreaks.log:unilbrk/u8-possible-linebreaks.c:49:3: > runtime error: null pointer passed as argument 1, which is declared to > never be null > tests/test-u32-to-u8.log:unistr/u8-cmp.c:38:10: runtime error: null > pointer passed as argument 2, which is declared to never be null > tests/test-numeric.log:unictype/numeric.c:52:54: runtime error: left > shift of 34927 by 16 places cannot be represented in type 'int' > tests/test-nfkc.log:unistr/u8-cmp.c:38:10: runtime error: null pointer > passed as argument 2, which is declared to never be null > tests/test-u8-tolower.log:unistr/u8-cmp.c:38:10: runtime error: null > pointer passed as argument 2, which is declared to never be null > tests/test-u16-to-u8.log:unistr/u8-cmp.c:38:10: runtime error: null > pointer passed as argument 2, which is declared to never be null > tests/test-nfkd.log:unistr/u8-cmp.c:38:10: runtime error: null pointer > passed as argument 2, which is declared to never be null > tests/test-u8-casefold.log:unistr/u8-cmp.c:38:10: runtime error: null > pointer passed as argument 2, which is declared to never be null > tests/unicase/test-ulc-casecmp2.sh.log:uniconv/u8-conv-from-enc.c:89:7: > runtime error: null pointer passed as argument 2, which is declared to > never be null > tests/unicase/test-ulc-casecoll2.sh.log:uniconv/u8-conv-from-enc.c:89:7: > runtime error: null pointer passed as argument 2, which is declared to > never be null > tests/test-nfc.log:unistr/u8-cmp.c:38:10: runtime error: null pointer > passed as argument 2, which is declared to never be null > tests/test-u8-totitle.log:unistr/u8-cmp.c:38:10: runtime error: null > pointer passed as argument 2, which is declared to never be null > tests/test-nfd.log:unistr/u8-cmp.c:38:10: runtime error: null pointer > passed as argument 2, which is declared to never be null > tests/test-u16-possible-linebreaks.log:unilbrk/u16-possible-linebreaks.c:49:3: > runtime error: null pointer passed as argument 1, which is declared to > never be null > tests/test-u16-width-linebreaks.log:unilbrk/u16-possible-linebreaks.c:49:3: > runtime error: null pointer passed as argument 1, which is declared to > never be null > tests/test-u8-toupper.log:unistr/u8-cmp.c:38:10: runtime error: null > pointer passed as argument 2, which is declared to never be null > tests/test-u8-possible-linebreaks.log:unilbrk/u8-possible-linebreaks.c:49:3: > runtime error: null pointer passed as argument 1, which is declared to > never be null
Thanks for these reports. Indeed, use of CC="gcc -fsanitize=undefined" produces error messages in the 'make check' logs. 1) unictype/numeric.c:52:54: runtime error: left shift of 34927 by 16 places cannot be represented in type 'int' The code was implicitly casting an 'unsigned short' to 'int' and doing a shift of the result. It is better to case from 'unsigned short' to 'unsigned int' explicitly. 2) unistr/u8-cmp.c:38:10: runtime error: null pointer passed as argument 2, which is declared to never be null memcmp(NULL, NULL, 0) is invalid in ISO C and POSIX. ISO C11 7.24.1.(2) states: "Where an argument declared as size_t n specifies the length of the array for a function, n can have the value zero on a call to that function. Unless explicitly stated otherwise in the description of a particular function in this subclause, pointer arguments on such a call shall still have valid values, as described in 7.1.4." References: https://lists.gnu.org/archive/html/bug-gnulib/2009-05/msg00156.html https://stackoverflow.com/questions/16362925/can-i-pass-a-null-pointer-to-memcmp 3) uniconv/u8-conv-from-enc.c:89:7: runtime error: null pointer passed as argument 2, which is declared to never be null memcpy(dest, NULL, 0) is invalid in ISO C and POSIX, similarly. 4) unilbrk/u8-possible-linebreaks.c:49:3: runtime error: null pointer passed as argument 1, which is declared to never be null memset(NULL, c, 0) is invalid in ISO C and POSIX, similarly. I'm pushing the attached patches.
>From 01ec92af9fda3163f3379d5932991b47f011aa33 Mon Sep 17 00:00:00 2001 From: Bruno Haible <br...@clisp.org> Date: Fri, 8 Mar 2019 19:17:37 +0100 Subject: [PATCH 1/4] unictype/numeric: Fix undefined behaviour. Reported by Jeffrey Walton <noloa...@gmail.com>. * lib/unictype/numeric.c (uc_numeric_value): Avoid undefined behaviour on shift overflow, caught by "gcc -fsanitize=undefined". * lib/unictype/bidi_of.c (uc_bidi_class): Add cast, for clarity. * lib/unictype/categ_of.c (lookup_withtable): Likewise. * lib/unictype/joininggroup_of.c (uc_joining_group): Likewise. --- ChangeLog | 10 ++++++++++ lib/unictype/bidi_of.c | 2 +- lib/unictype/categ_of.c | 2 +- lib/unictype/joininggroup_of.c | 2 +- lib/unictype/numeric.c | 4 ++-- 5 files changed, 15 insertions(+), 5 deletions(-) diff --git a/ChangeLog b/ChangeLog index e5ba157..2e61261 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,13 @@ +2019-03-08 Bruno Haible <br...@clisp.org> + + unictype/numeric: Fix undefined behaviour. + Reported by Jeffrey Walton <noloa...@gmail.com>. + * lib/unictype/numeric.c (uc_numeric_value): Avoid undefined behaviour + on shift overflow, caught by "gcc -fsanitize=undefined". + * lib/unictype/bidi_of.c (uc_bidi_class): Add cast, for clarity. + * lib/unictype/categ_of.c (lookup_withtable): Likewise. + * lib/unictype/joininggroup_of.c (uc_joining_group): Likewise. + 2019-03-05 Paul Eggert <egg...@cs.ucla.edu> git-version-gen: fix --version copyright year diff --git a/lib/unictype/bidi_of.c b/lib/unictype/bidi_of.c index 7e52117..3db1f3a 100644 --- a/lib/unictype/bidi_of.c +++ b/lib/unictype/bidi_of.c @@ -39,7 +39,7 @@ uc_bidi_class (ucs4_t uc) unsigned int index3 = ((uc & bidi_category_header_4) + lookup2) * 5; /* level3 contains 5-bit values, packed into 16-bit words. */ unsigned int lookup3 = - ((u_bidi_category.level3[index3>>4] + (((unsigned int) u_bidi_category.level3[index3>>4] | ((unsigned int) u_bidi_category.level3[(index3>>4)+1] << 16)) >> (index3 % 16)) & 0x1f; diff --git a/lib/unictype/categ_of.c b/lib/unictype/categ_of.c index a1e6762..4c3f127 100644 --- a/lib/unictype/categ_of.c +++ b/lib/unictype/categ_of.c @@ -39,7 +39,7 @@ lookup_withtable (ucs4_t uc) unsigned int index3 = ((uc & category_header_4) + lookup2) * 5; /* level3 contains 5-bit values, packed into 16-bit words. */ unsigned int lookup3 = - ((u_category.level3[index3>>4] + (((unsigned int) u_category.level3[index3>>4] | ((unsigned int) u_category.level3[(index3>>4)+1] << 16)) >> (index3 % 16)) & 0x1f; diff --git a/lib/unictype/joininggroup_of.c b/lib/unictype/joininggroup_of.c index db254a9..3b2a75e 100644 --- a/lib/unictype/joininggroup_of.c +++ b/lib/unictype/joininggroup_of.c @@ -39,7 +39,7 @@ uc_joining_group (ucs4_t uc) unsigned int index3 = ((uc & joining_group_header_4) + lookup2) * 7; /* level3 contains 7-bit values, packed into 16-bit words. */ unsigned int lookup3 = - ((u_joining_group.level3[index3>>4] + (((unsigned int) u_joining_group.level3[index3>>4] | ((unsigned int) u_joining_group.level3[(index3>>4)+1] << 16)) >> (index3 % 16)) & 0x7f; diff --git a/lib/unictype/numeric.c b/lib/unictype/numeric.c index fd6eab9..a999183 100644 --- a/lib/unictype/numeric.c +++ b/lib/unictype/numeric.c @@ -39,8 +39,8 @@ uc_numeric_value (ucs4_t uc) unsigned int index3 = ((uc & numeric_header_4) + lookup2) * 8; /* level3 contains 8-bit values, packed into 16-bit words. */ unsigned int lookup3 = - ((u_numeric.level3[index3>>4] - | (u_numeric.level3[(index3>>4)+1] << 16)) + (((unsigned int) u_numeric.level3[index3>>4] + | ((unsigned int) u_numeric.level3[(index3>>4)+1] << 16)) >> (index3 % 16)) & 0xff; -- 2.7.4
>From 7e7501fa24bcd2e1484fee5648d88b8d94f0bf38 Mon Sep 17 00:00:00 2001 From: Bruno Haible <br...@clisp.org> Date: Fri, 8 Mar 2019 20:38:22 +0100 Subject: [PATCH 2/4] unistr/u8-cmp: Fix undefined behaviour. Reported by Jeffrey Walton <noloa...@gmail.com>. * lib/unistr/u8-cmp.c (u8_cmp): Don't invoke memcmp if n is zero. --- ChangeLog | 6 ++++++ lib/unistr/u8-cmp.c | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index 2e61261..074b1ca 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,11 @@ 2019-03-08 Bruno Haible <br...@clisp.org> + unistr/u8-cmp: Fix undefined behaviour. + Reported by Jeffrey Walton <noloa...@gmail.com>. + * lib/unistr/u8-cmp.c (u8_cmp): Don't invoke memcmp if n is zero. + +2019-03-08 Bruno Haible <br...@clisp.org> + unictype/numeric: Fix undefined behaviour. Reported by Jeffrey Walton <noloa...@gmail.com>. * lib/unictype/numeric.c (uc_numeric_value): Avoid undefined behaviour diff --git a/lib/unistr/u8-cmp.c b/lib/unistr/u8-cmp.c index bd2f11b..5349ec6 100644 --- a/lib/unistr/u8-cmp.c +++ b/lib/unistr/u8-cmp.c @@ -26,5 +26,5 @@ int u8_cmp (const uint8_t *s1, const uint8_t *s2, size_t n) { /* Use the fact that the UTF-8 encoding preserves lexicographic order. */ - return memcmp ((const char *) s1, (const char *) s2, n); + return n == 0 ? 0 : memcmp ((const char *) s1, (const char *) s2, n); } -- 2.7.4
>From f6f567a94d1f34b96ae05e2354bfaa994c9de840 Mon Sep 17 00:00:00 2001 From: Bruno Haible <br...@clisp.org> Date: Sat, 9 Mar 2019 00:01:47 +0100 Subject: [PATCH 3/4] unistr/*, uniconv/*: Fix undefined behaviour. Reported by Jeffrey Walton <noloa...@gmail.com>. * lib/unistr/u-cpy.h (FUNC): Don't invoke memcpy with a zero size. * lib/unistr/u-cpy-alloc.h (FUNC): Likewise. * lib/uniconv/u8-conv-from-enc.c (u8_conv_from_encoding): Likewise. * lib/uniconv/u8-conv-to-enc.c (u8_conv_to_encoding): Likewise. --- ChangeLog | 9 +++++++++ lib/uniconv/u8-conv-from-enc.c | 3 ++- lib/uniconv/u8-conv-to-enc.c | 3 ++- lib/unistr/u-cpy-alloc.h | 3 ++- lib/unistr/u-cpy.h | 3 ++- 5 files changed, 17 insertions(+), 4 deletions(-) diff --git a/ChangeLog b/ChangeLog index 074b1ca..687357e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,14 @@ 2019-03-08 Bruno Haible <br...@clisp.org> + unistr/*, uniconv/*: Fix undefined behaviour. + Reported by Jeffrey Walton <noloa...@gmail.com>. + * lib/unistr/u-cpy.h (FUNC): Don't invoke memcpy with a zero size. + * lib/unistr/u-cpy-alloc.h (FUNC): Likewise. + * lib/uniconv/u8-conv-from-enc.c (u8_conv_from_encoding): Likewise. + * lib/uniconv/u8-conv-to-enc.c (u8_conv_to_encoding): Likewise. + +2019-03-08 Bruno Haible <br...@clisp.org> + unistr/u8-cmp: Fix undefined behaviour. Reported by Jeffrey Walton <noloa...@gmail.com>. * lib/unistr/u8-cmp.c (u8_cmp): Don't invoke memcmp if n is zero. diff --git a/lib/uniconv/u8-conv-from-enc.c b/lib/uniconv/u8-conv-from-enc.c index 623dc1f..f8e9bd9 100644 --- a/lib/uniconv/u8-conv-from-enc.c +++ b/lib/uniconv/u8-conv-from-enc.c @@ -77,7 +77,8 @@ u8_conv_from_encoding (const char *fromcode, } } - memcpy ((char *) result, src, srclen); + if (srclen > 0) + memcpy ((char *) result, src, srclen); *lengthp = srclen; return result; } diff --git a/lib/uniconv/u8-conv-to-enc.c b/lib/uniconv/u8-conv-to-enc.c index ff0bc64..56e3121 100644 --- a/lib/uniconv/u8-conv-to-enc.c +++ b/lib/uniconv/u8-conv-to-enc.c @@ -60,7 +60,8 @@ u8_conv_to_encoding (const char *tocode, } } - memcpy (result, (const char *) src, srclen); + if (srclen > 0) + memcpy (result, (const char *) src, srclen); *lengthp = srclen; return result; } diff --git a/lib/unistr/u-cpy-alloc.h b/lib/unistr/u-cpy-alloc.h index 8afa66b..55fef32 100644 --- a/lib/unistr/u-cpy-alloc.h +++ b/lib/unistr/u-cpy-alloc.h @@ -33,7 +33,8 @@ FUNC (const UNIT *s, size_t n) for (; n > 0; n--) *destptr++ = *s++; #else - memcpy ((char *) dest, (const char *) s, n * sizeof (UNIT)); + if (n > 0) + memcpy ((char *) dest, (const char *) s, n * sizeof (UNIT)); #endif } return dest; diff --git a/lib/unistr/u-cpy.h b/lib/unistr/u-cpy.h index 1dc2129..c023a27 100644 --- a/lib/unistr/u-cpy.h +++ b/lib/unistr/u-cpy.h @@ -26,7 +26,8 @@ FUNC (UNIT *dest, const UNIT *src, size_t n) for (; n > 0; n--) *destptr++ = *src++; #else - memcpy ((char *) dest, (const char *) src, n * sizeof (UNIT)); + if (n > 0) + memcpy ((char *) dest, (const char *) src, n * sizeof (UNIT)); #endif return dest; } -- 2.7.4
>From fa418ebbdf307d0efc1e61677045f48de5a93398 Mon Sep 17 00:00:00 2001 From: Bruno Haible <br...@clisp.org> Date: Sat, 9 Mar 2019 00:27:19 +0100 Subject: [PATCH 4/4] unilbrk/u*-possible-linebreaks: Fix undefined behaviour. Reported by Jeffrey Walton <noloa...@gmail.com>. * lib/unilbrk/u8-possible-linebreaks.c (u8_possible_linebreaks): Don't invoke memset with a zero size. * lib/unilbrk/u16-possible-linebreaks.c (u16_possible_linebreaks): Likewise. * lib/unilbrk/u32-possible-linebreaks.c (u32_possible_linebreaks): Adjust accordingly. --- ChangeLog | 11 ++ lib/unilbrk/u16-possible-linebreaks.c | 212 +++++++++++++++++----------------- lib/unilbrk/u32-possible-linebreaks.c | 204 ++++++++++++++++---------------- lib/unilbrk/u8-possible-linebreaks.c | 212 +++++++++++++++++----------------- 4 files changed, 331 insertions(+), 308 deletions(-) diff --git a/ChangeLog b/ChangeLog index 687357e..a67ed92 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,16 @@ 2019-03-08 Bruno Haible <br...@clisp.org> + unilbrk/u*-possible-linebreaks: Fix undefined behaviour. + Reported by Jeffrey Walton <noloa...@gmail.com>. + * lib/unilbrk/u8-possible-linebreaks.c (u8_possible_linebreaks): Don't + invoke memset with a zero size. + * lib/unilbrk/u16-possible-linebreaks.c (u16_possible_linebreaks): + Likewise. + * lib/unilbrk/u32-possible-linebreaks.c (u32_possible_linebreaks): + Adjust accordingly. + +2019-03-08 Bruno Haible <br...@clisp.org> + unistr/*, uniconv/*: Fix undefined behaviour. Reported by Jeffrey Walton <noloa...@gmail.com>. * lib/unistr/u-cpy.h (FUNC): Don't invoke memcpy with a zero size. diff --git a/lib/unilbrk/u16-possible-linebreaks.c b/lib/unilbrk/u16-possible-linebreaks.c index 3ee91ad..0a91368 100644 --- a/lib/unilbrk/u16-possible-linebreaks.c +++ b/lib/unilbrk/u16-possible-linebreaks.c @@ -30,135 +30,139 @@ void u16_possible_linebreaks (const uint16_t *s, size_t n, const char *encoding, char *p) { - int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL); - const uint16_t *s_end = s + n; - int last_prop = LBP_BK; /* line break property of last non-space character */ - char *seen_space = NULL; /* Was a space seen after the last non-space character? */ - char *seen_space2 = NULL; /* At least two spaces after the last non-space? */ - - /* Don't break inside multibyte characters. */ - memset (p, UC_BREAK_PROHIBITED, n); - - while (s < s_end) + if (n > 0) { - ucs4_t uc; - int count = u16_mbtouc_unsafe (&uc, s, s_end - s); - int prop = unilbrkprop_lookup (uc); + int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL); + const uint16_t *s_end = s + n; + int last_prop = LBP_BK; /* line break property of last non-space character */ + char *seen_space = NULL; /* Was a space seen after the last non-space character? */ + char *seen_space2 = NULL; /* At least two spaces after the last non-space? */ - if (prop == LBP_BK) - { - /* Mandatory break. */ - *p = UC_BREAK_MANDATORY; - last_prop = LBP_BK; - seen_space = NULL; - seen_space2 = NULL; - } - else - { - char *q; + /* Don't break inside multibyte characters. */ + memset (p, UC_BREAK_PROHIBITED, n); - /* Resolve property values whose behaviour is not fixed. */ - switch (prop) - { - case LBP_AI: - /* Resolve ambiguous. */ - prop = LBP_AI_REPLACEMENT; - break; - case LBP_CB: - /* This is arbitrary. */ - prop = LBP_ID; - break; - case LBP_SA: - /* We don't handle complex scripts yet. - Treat LBP_SA like LBP_XX. */ - case LBP_XX: - /* This is arbitrary. */ - prop = LBP_AL; - break; - } + do + { + ucs4_t uc; + int count = u16_mbtouc_unsafe (&uc, s, s_end - s); + int prop = unilbrkprop_lookup (uc); - /* Deal with spaces and combining characters. */ - q = p; - if (prop == LBP_SP) - { - /* Don't break just before a space. */ - *p = UC_BREAK_PROHIBITED; - seen_space2 = seen_space; - seen_space = p; - } - else if (prop == LBP_ZW) + if (prop == LBP_BK) { - /* Don't break just before a zero-width space. */ - *p = UC_BREAK_PROHIBITED; - last_prop = LBP_ZW; + /* Mandatory break. */ + *p = UC_BREAK_MANDATORY; + last_prop = LBP_BK; seen_space = NULL; seen_space2 = NULL; } - else if (prop == LBP_CM) + else { - /* Don't break just before a combining character, except immediately after a - zero-width space. */ - if (last_prop == LBP_ZW) + char *q; + + /* Resolve property values whose behaviour is not fixed. */ + switch (prop) { - /* Break after zero-width space. */ - *p = UC_BREAK_POSSIBLE; - /* A combining character turns a preceding space into LBP_ID. */ - last_prop = LBP_ID; + case LBP_AI: + /* Resolve ambiguous. */ + prop = LBP_AI_REPLACEMENT; + break; + case LBP_CB: + /* This is arbitrary. */ + prop = LBP_ID; + break; + case LBP_SA: + /* We don't handle complex scripts yet. + Treat LBP_SA like LBP_XX. */ + case LBP_XX: + /* This is arbitrary. */ + prop = LBP_AL; + break; } - else + + /* Deal with spaces and combining characters. */ + q = p; + if (prop == LBP_SP) { + /* Don't break just before a space. */ *p = UC_BREAK_PROHIBITED; - /* A combining character turns a preceding space into LBP_ID. */ - if (seen_space != NULL) - { - q = seen_space; - seen_space = seen_space2; - prop = LBP_ID; - goto lookup_via_table; - } + seen_space2 = seen_space; + seen_space = p; } - } - else - { - lookup_via_table: - /* prop must be usable as an index for table 7.3 of UTR #14. */ - if (!(prop >= 0 && prop < sizeof (unilbrk_table) / sizeof (unilbrk_table[0]))) - abort (); - - if (last_prop == LBP_BK) + else if (prop == LBP_ZW) { - /* Don't break at the beginning of a line. */ - *q = UC_BREAK_PROHIBITED; + /* Don't break just before a zero-width space. */ + *p = UC_BREAK_PROHIBITED; + last_prop = LBP_ZW; + seen_space = NULL; + seen_space2 = NULL; } - else if (last_prop == LBP_ZW) + else if (prop == LBP_CM) { - /* Break after zero-width space. */ - *q = UC_BREAK_POSSIBLE; + /* Don't break just before a combining character, except immediately + after a zero-width space. */ + if (last_prop == LBP_ZW) + { + /* Break after zero-width space. */ + *p = UC_BREAK_POSSIBLE; + /* A combining character turns a preceding space into LBP_ID. */ + last_prop = LBP_ID; + } + else + { + *p = UC_BREAK_PROHIBITED; + /* A combining character turns a preceding space into LBP_ID. */ + if (seen_space != NULL) + { + q = seen_space; + seen_space = seen_space2; + prop = LBP_ID; + goto lookup_via_table; + } + } } else { - switch (unilbrk_table [last_prop] [prop]) + lookup_via_table: + /* prop must be usable as an index for table 7.3 of UTR #14. */ + if (!(prop >= 0 && prop < sizeof (unilbrk_table) / sizeof (unilbrk_table[0]))) + abort (); + + if (last_prop == LBP_BK) { - case D: - *q = UC_BREAK_POSSIBLE; - break; - case I: - *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED); - break; - case P: + /* Don't break at the beginning of a line. */ *q = UC_BREAK_PROHIBITED; - break; - default: - abort (); } + else if (last_prop == LBP_ZW) + { + /* Break after zero-width space. */ + *q = UC_BREAK_POSSIBLE; + } + else + { + switch (unilbrk_table [last_prop] [prop]) + { + case D: + *q = UC_BREAK_POSSIBLE; + break; + case I: + *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED); + break; + case P: + *q = UC_BREAK_PROHIBITED; + break; + default: + abort (); + } + } + last_prop = prop; + seen_space = NULL; + seen_space2 = NULL; } - last_prop = prop; - seen_space = NULL; - seen_space2 = NULL; } - } - s += count; - p += count; + s += count; + p += count; + } + while (s < s_end); } } diff --git a/lib/unilbrk/u32-possible-linebreaks.c b/lib/unilbrk/u32-possible-linebreaks.c index cdd00c8..9a0bafa 100644 --- a/lib/unilbrk/u32-possible-linebreaks.c +++ b/lib/unilbrk/u32-possible-linebreaks.c @@ -28,131 +28,135 @@ void u32_possible_linebreaks (const uint32_t *s, size_t n, const char *encoding, char *p) { - int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL); - const uint32_t *s_end = s + n; - int last_prop = LBP_BK; /* line break property of last non-space character */ - char *seen_space = NULL; /* Was a space seen after the last non-space character? */ - char *seen_space2 = NULL; /* At least two spaces after the last non-space? */ - - while (s < s_end) + if (n > 0) { - ucs4_t uc = *s; - int prop = unilbrkprop_lookup (uc); + int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL); + const uint32_t *s_end = s + n; + int last_prop = LBP_BK; /* line break property of last non-space character */ + char *seen_space = NULL; /* Was a space seen after the last non-space character? */ + char *seen_space2 = NULL; /* At least two spaces after the last non-space? */ - if (prop == LBP_BK) - { - /* Mandatory break. */ - *p = UC_BREAK_MANDATORY; - last_prop = LBP_BK; - seen_space = NULL; - seen_space2 = NULL; - } - else + do { - char *q; + ucs4_t uc = *s; + int prop = unilbrkprop_lookup (uc); - /* Resolve property values whose behaviour is not fixed. */ - switch (prop) + if (prop == LBP_BK) { - case LBP_AI: - /* Resolve ambiguous. */ - prop = LBP_AI_REPLACEMENT; - break; - case LBP_CB: - /* This is arbitrary. */ - prop = LBP_ID; - break; - case LBP_SA: - /* We don't handle complex scripts yet. - Treat LBP_SA like LBP_XX. */ - case LBP_XX: - /* This is arbitrary. */ - prop = LBP_AL; - break; - } - - /* Deal with spaces and combining characters. */ - q = p; - if (prop == LBP_SP) - { - /* Don't break just before a space. */ - *p = UC_BREAK_PROHIBITED; - seen_space2 = seen_space; - seen_space = p; - } - else if (prop == LBP_ZW) - { - /* Don't break just before a zero-width space. */ - *p = UC_BREAK_PROHIBITED; - last_prop = LBP_ZW; + /* Mandatory break. */ + *p = UC_BREAK_MANDATORY; + last_prop = LBP_BK; seen_space = NULL; seen_space2 = NULL; } - else if (prop == LBP_CM) + else { - /* Don't break just before a combining character, except immediately after a - zero-width space. */ - if (last_prop == LBP_ZW) + char *q; + + /* Resolve property values whose behaviour is not fixed. */ + switch (prop) { - /* Break after zero-width space. */ - *p = UC_BREAK_POSSIBLE; - /* A combining character turns a preceding space into LBP_ID. */ - last_prop = LBP_ID; + case LBP_AI: + /* Resolve ambiguous. */ + prop = LBP_AI_REPLACEMENT; + break; + case LBP_CB: + /* This is arbitrary. */ + prop = LBP_ID; + break; + case LBP_SA: + /* We don't handle complex scripts yet. + Treat LBP_SA like LBP_XX. */ + case LBP_XX: + /* This is arbitrary. */ + prop = LBP_AL; + break; } - else + + /* Deal with spaces and combining characters. */ + q = p; + if (prop == LBP_SP) { + /* Don't break just before a space. */ *p = UC_BREAK_PROHIBITED; - /* A combining character turns a preceding space into LBP_ID. */ - if (seen_space != NULL) - { - q = seen_space; - seen_space = seen_space2; - prop = LBP_ID; - goto lookup_via_table; - } + seen_space2 = seen_space; + seen_space = p; } - } - else - { - lookup_via_table: - /* prop must be usable as an index for table 7.3 of UTR #14. */ - if (!(prop >= 0 && prop < sizeof (unilbrk_table) / sizeof (unilbrk_table[0]))) - abort (); - - if (last_prop == LBP_BK) + else if (prop == LBP_ZW) { - /* Don't break at the beginning of a line. */ - *q = UC_BREAK_PROHIBITED; + /* Don't break just before a zero-width space. */ + *p = UC_BREAK_PROHIBITED; + last_prop = LBP_ZW; + seen_space = NULL; + seen_space2 = NULL; } - else if (last_prop == LBP_ZW) + else if (prop == LBP_CM) { - /* Break after zero-width space. */ - *q = UC_BREAK_POSSIBLE; + /* Don't break just before a combining character, except immediately + after a zero-width space. */ + if (last_prop == LBP_ZW) + { + /* Break after zero-width space. */ + *p = UC_BREAK_POSSIBLE; + /* A combining character turns a preceding space into LBP_ID. */ + last_prop = LBP_ID; + } + else + { + *p = UC_BREAK_PROHIBITED; + /* A combining character turns a preceding space into LBP_ID. */ + if (seen_space != NULL) + { + q = seen_space; + seen_space = seen_space2; + prop = LBP_ID; + goto lookup_via_table; + } + } } else { - switch (unilbrk_table [last_prop] [prop]) + lookup_via_table: + /* prop must be usable as an index for table 7.3 of UTR #14. */ + if (!(prop >= 0 && prop < sizeof (unilbrk_table) / sizeof (unilbrk_table[0]))) + abort (); + + if (last_prop == LBP_BK) { - case D: - *q = UC_BREAK_POSSIBLE; - break; - case I: - *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED); - break; - case P: + /* Don't break at the beginning of a line. */ *q = UC_BREAK_PROHIBITED; - break; - default: - abort (); } + else if (last_prop == LBP_ZW) + { + /* Break after zero-width space. */ + *q = UC_BREAK_POSSIBLE; + } + else + { + switch (unilbrk_table [last_prop] [prop]) + { + case D: + *q = UC_BREAK_POSSIBLE; + break; + case I: + *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED); + break; + case P: + *q = UC_BREAK_PROHIBITED; + break; + default: + abort (); + } + } + last_prop = prop; + seen_space = NULL; + seen_space2 = NULL; } - last_prop = prop; - seen_space = NULL; - seen_space2 = NULL; } - } - s++; - p++; + s++; + p++; + } + while (s < s_end); } } diff --git a/lib/unilbrk/u8-possible-linebreaks.c b/lib/unilbrk/u8-possible-linebreaks.c index 1e2224b..79efcb2 100644 --- a/lib/unilbrk/u8-possible-linebreaks.c +++ b/lib/unilbrk/u8-possible-linebreaks.c @@ -30,136 +30,140 @@ void u8_possible_linebreaks (const uint8_t *s, size_t n, const char *encoding, char *p) { - int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL); - const uint8_t *s_end = s + n; - int last_prop = LBP_BK; /* line break property of last non-space character */ - char *seen_space = NULL; /* Was a space seen after the last non-space character? */ - char *seen_space2 = NULL; /* At least two spaces after the last non-space? */ - - /* Don't break inside multibyte characters. */ - memset (p, UC_BREAK_PROHIBITED, n); - - while (s < s_end) + if (n > 0) { - ucs4_t uc; - int count = u8_mbtouc_unsafe (&uc, s, s_end - s); - int prop = unilbrkprop_lookup (uc); + int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL); + const uint8_t *s_end = s + n; + int last_prop = LBP_BK; /* line break property of last non-space character */ + char *seen_space = NULL; /* Was a space seen after the last non-space character? */ + char *seen_space2 = NULL; /* At least two spaces after the last non-space? */ - if (prop == LBP_BK) - { - /* Mandatory break. */ - *p = UC_BREAK_MANDATORY; - last_prop = LBP_BK; - seen_space = NULL; - seen_space2 = NULL; - } - else - { - char *q; + /* Don't break inside multibyte characters. */ + memset (p, UC_BREAK_PROHIBITED, n); - /* Resolve property values whose behaviour is not fixed. */ - switch (prop) - { - case LBP_AI: - /* Resolve ambiguous. */ - prop = LBP_AI_REPLACEMENT; - break; - case LBP_CB: - /* This is arbitrary. */ - prop = LBP_ID; - break; - case LBP_SA: - /* We don't handle complex scripts yet. - Treat LBP_SA like LBP_XX. */ - case LBP_XX: - /* This is arbitrary. */ - prop = LBP_AL; - break; - } + do + { + ucs4_t uc; + int count = u8_mbtouc_unsafe (&uc, s, s_end - s); + int prop = unilbrkprop_lookup (uc); - /* Deal with spaces and combining characters. */ - q = p; - if (prop == LBP_SP) + if (prop == LBP_BK) { - /* Don't break just before a space. */ - *p = UC_BREAK_PROHIBITED; - seen_space2 = seen_space; - seen_space = p; - } - else if (prop == LBP_ZW) - { - /* Don't break just before a zero-width space. */ - *p = UC_BREAK_PROHIBITED; - last_prop = LBP_ZW; + /* Mandatory break. */ + *p = UC_BREAK_MANDATORY; + last_prop = LBP_BK; seen_space = NULL; seen_space2 = NULL; } - else if (prop == LBP_CM) + else { - /* Don't break just before a combining character, except immediately after a - zero-width space. */ - if (last_prop == LBP_ZW) + char *q; + + /* Resolve property values whose behaviour is not fixed. */ + switch (prop) { - /* Break after zero-width space. */ - *p = UC_BREAK_POSSIBLE; - /* A combining character turns a preceding space into LBP_ID. */ - last_prop = LBP_ID; + case LBP_AI: + /* Resolve ambiguous. */ + prop = LBP_AI_REPLACEMENT; + break; + case LBP_CB: + /* This is arbitrary. */ + prop = LBP_ID; + break; + case LBP_SA: + /* We don't handle complex scripts yet. + Treat LBP_SA like LBP_XX. */ + case LBP_XX: + /* This is arbitrary. */ + prop = LBP_AL; + break; } - else + + /* Deal with spaces and combining characters. */ + q = p; + if (prop == LBP_SP) { + /* Don't break just before a space. */ *p = UC_BREAK_PROHIBITED; - /* A combining character turns a preceding space into LBP_ID. */ - if (seen_space != NULL) - { - q = seen_space; - seen_space = seen_space2; - prop = LBP_ID; - goto lookup_via_table; - } + seen_space2 = seen_space; + seen_space = p; } - } - else - { - lookup_via_table: - /* prop must be usable as an index for table 7.3 of UTR #14. */ - if (!(prop >= 0 && prop < sizeof (unilbrk_table) / sizeof (unilbrk_table[0]))) - abort (); - - if (last_prop == LBP_BK) + else if (prop == LBP_ZW) { - /* Don't break at the beginning of a line. */ - *q = UC_BREAK_PROHIBITED; + /* Don't break just before a zero-width space. */ + *p = UC_BREAK_PROHIBITED; + last_prop = LBP_ZW; + seen_space = NULL; + seen_space2 = NULL; } - else if (last_prop == LBP_ZW) + else if (prop == LBP_CM) { - /* Break after zero-width space. */ - *q = UC_BREAK_POSSIBLE; + /* Don't break just before a combining character, except immediately + after a zero-width space. */ + if (last_prop == LBP_ZW) + { + /* Break after zero-width space. */ + *p = UC_BREAK_POSSIBLE; + /* A combining character turns a preceding space into LBP_ID. */ + last_prop = LBP_ID; + } + else + { + *p = UC_BREAK_PROHIBITED; + /* A combining character turns a preceding space into LBP_ID. */ + if (seen_space != NULL) + { + q = seen_space; + seen_space = seen_space2; + prop = LBP_ID; + goto lookup_via_table; + } + } } else { - switch (unilbrk_table [last_prop] [prop]) + lookup_via_table: + /* prop must be usable as an index for table 7.3 of UTR #14. */ + if (!(prop >= 0 && prop < sizeof (unilbrk_table) / sizeof (unilbrk_table[0]))) + abort (); + + if (last_prop == LBP_BK) { - case D: - *q = UC_BREAK_POSSIBLE; - break; - case I: - *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED); - break; - case P: + /* Don't break at the beginning of a line. */ *q = UC_BREAK_PROHIBITED; - break; - default: - abort (); } + else if (last_prop == LBP_ZW) + { + /* Break after zero-width space. */ + *q = UC_BREAK_POSSIBLE; + } + else + { + switch (unilbrk_table [last_prop] [prop]) + { + case D: + *q = UC_BREAK_POSSIBLE; + break; + case I: + *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED); + break; + case P: + *q = UC_BREAK_PROHIBITED; + break; + default: + abort (); + } + } + last_prop = prop; + seen_space = NULL; + seen_space2 = NULL; } - last_prop = prop; - seen_space = NULL; - seen_space2 = NULL; } - } - s += count; - p += count; + s += count; + p += count; + } + while (s < s_end); } } -- 2.7.4