In perl.git, the branch blead has been updated <https://perl5.git.perl.org/perl.git/commitdiff/03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1?hp=9b91093d3e74d13a6e4f67269a587f4ab397998a>
- Log ----------------------------------------------------------------- commit 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1 Author: Karl Williamson <k...@cpan.org> Date: Wed Nov 22 22:30:16 2017 -0700 Add variant_under_utf8_count() core function This function takes a string that isn't encoded in UTF-8 (hence is assumed to be in Latin1), and counts how many of the bytes therein would change if it were to be translated into UTF-8. Each such byte would occupy two UTF-8 bytes. This function is useful for calculating the expansion factor precisely when converting to UTF-8, so as to know how much to malloc. This function uses a non-obvious method to do the calculations word-at-a-time, as opposed to the byte-at-a-time method used now, and hence should be much faster than the current methods. The performance change in short string lengths is equivocal. Here is the result for a single character and a 64-bit word. bytes words Ratio % -------- -------- ------- Ir 932.0 947.0 98.4 Dr 325.0 325.0 100.0 Dw 104.0 104.0 100.0 COND 136.0 137.0 99.3 IND 28.0 28.0 100.0 COND_m 1.0 0.0 Inf IND_m 6.0 6.0 100.0 There are some extra instructions executed and an extra branch to check for and handle the case where we can go word-by-word vs. not. But the one cache miss is removed. The results are essentially the same until we get to being able to handle a full word. Some of the extra instructions are to ensure that if the input is not aligned on a word boundary, that performance doesn't suffer. Here's the results for 8-bytes on a 64-bit system. bytes words Ratio % -------- -------- ------- Ir 974.0 955.0 102.0 Dr 332.0 325.0 102.2 Dw 104.0 104.0 100.0 COND 143.0 138.0 103.6 IND 28.0 28.0 100.0 COND_m 1.0 0.0 Inf IND_m 6.0 6.0 100.0 Things keep improving as the strings get longer. Here's for 24 bytes. bytes words Ratio % -------- -------- ------- Ir 1070.0 975.0 109.7 Dr 348.0 327.0 106.4 Dw 104.0 104.0 100.0 COND 159.0 140.0 113.6 IND 28.0 28.0 100.0 COND_m 2.0 0.0 Inf IND_m 6.0 6.0 100.0 And 96: bytes words Ratio % -------- -------- ------- Ir 1502.0 1065.0 141.0 Dr 420.0 336.0 125.0 Dw 104.0 104.0 100.0 COND 231.0 149.0 155.0 IND 28.0 28.0 100.0 COND_m 2.0 1.0 200.0 IND_m 6.0 6.0 100.0 And 10,000 bytes words Ratio % -------- -------- ------- Ir 60926.0 13445.0 453.1 Dr 10324.0 1574.0 655.9 Dw 104.0 104.0 100.0 COND 10135.0 1387.0 730.7 IND 28.0 28.0 100.0 COND_m 2.0 1.0 200.0 IND_m 6.0 6.0 100.0 I found this trick on the internet many years ago, but I can't seem to find it again to give them credit. commit 099e59a45fd0c4d6657bf384e0539691eb7b1f24 Author: Karl Williamson <k...@cpan.org> Date: Mon Dec 11 18:17:29 2017 -0700 is_utf8_invariant_string(): small speed optimization This adds a few shifing, masking, and integer arithmetic operations to a conditional which in return makes sure that one branch is taken only when it is going to do some good, avoiding a conditional in it. commit 16ef5c6e5c4a7d414ca7ef46cff9f8015fcd9079 Author: Karl Williamson <k...@cpan.org> Date: Mon Dec 11 18:26:52 2017 -0700 APItest/t/handy_base.pl: Avoid uninitialized warning This .pl in /t is generally called from a test file in that directory, but if run by hand, this commit makes sure things are properly initialized ----------------------------------------------------------------------- Summary of changes: embed.fnc | 4 + embed.h | 3 + ext/XS-APItest/APItest.xs | 14 +++- ext/XS-APItest/t/handy_base.pl | 1 + ext/XS-APItest/t/utf8.t | 57 ++++++++++++++ inline.h | 172 +++++++++++++++++++++++++++++++++++++++-- proto.h | 9 +++ 7 files changed, 252 insertions(+), 8 deletions(-) diff --git a/embed.fnc b/embed.fnc index bfbc63af5a..ad4df86324 100644 --- a/embed.fnc +++ b/embed.fnc @@ -782,6 +782,10 @@ AndmoR |bool |is_utf8_invariant_string|NN const U8* const s \ AnidR |bool |is_utf8_invariant_string_loc|NN const U8* const s \ |STRLEN len \ |NULLOK const U8 ** ep +#if defined(PERL_CORE) || defined(PERL_EXT) +EinR |Size_t |variant_under_utf8_count|NN const U8* const s \ + |NN const U8* const e +#endif AmnpdRP |bool |is_ascii_string|NN const U8* const s|const STRLEN len AmnpdRP |bool |is_invariant_string|NN const U8* const s|STRLEN len #if defined(PERL_CORE) || defined (PERL_EXT) diff --git a/embed.h b/embed.h index 06002a1b9a..fb4832d43e 100644 --- a/embed.h +++ b/embed.h @@ -994,6 +994,9 @@ #define is_utf8_non_invariant_string S_is_utf8_non_invariant_string #define sv_or_pv_pos_u2b(a,b,c,d) S_sv_or_pv_pos_u2b(aTHX_ a,b,c,d) # endif +# if defined(PERL_CORE) || defined(PERL_EXT) +#define variant_under_utf8_count S_variant_under_utf8_count +# endif # if defined(PERL_IN_REGCOMP_C) #define _make_exactf_invlist(a,b) S__make_exactf_invlist(aTHX_ a,b) #define add_above_Latin1_folds(a,b,c) S_add_above_Latin1_folds(aTHX_ a,b,c) diff --git a/ext/XS-APItest/APItest.xs b/ext/XS-APItest/APItest.xs index a2d5d697b6..144d62488d 100644 --- a/ext/XS-APItest/APItest.xs +++ b/ext/XS-APItest/APItest.xs @@ -6045,7 +6045,7 @@ test_is_utf8_invariant_string_loc(unsigned char *s, STRLEN offset, STRLEN len) * is to start at. Allocate space that does start at the word * boundary, and copy 's' to the correct offset past it. Then call the * tested function with that position */ - Newx(copy, (len + WORDSIZE - 1) / WORDSIZE, PERL_UINTMAX_T); + Newx(copy, 1 + ((len + WORDSIZE - 1) / WORDSIZE), PERL_UINTMAX_T); Copy(s, (U8 *) copy + offset, len, U8); av = newAV(); av_push(av, newSViv(is_utf8_invariant_string_loc((U8 *) copy + offset, len, &ep))); @@ -6055,6 +6055,18 @@ test_is_utf8_invariant_string_loc(unsigned char *s, STRLEN offset, STRLEN len) OUTPUT: RETVAL +STRLEN +test_variant_under_utf8_count(unsigned char *s, STRLEN offset, STRLEN len) + PREINIT: + PERL_UINTMAX_T * copy; + CODE: + Newx(copy, 1 + ((len + WORDSIZE - 1) / WORDSIZE), PERL_UINTMAX_T); + Copy(s, (U8 *) copy + offset, len, U8); + RETVAL = variant_under_utf8_count((U8 *) copy + offset, (U8 *) copy + offset + len); + Safefree(copy); + OUTPUT: + RETVAL + STRLEN test_utf8_length(unsigned char *s, STRLEN offset, STRLEN len) CODE: diff --git a/ext/XS-APItest/t/handy_base.pl b/ext/XS-APItest/t/handy_base.pl index 676f7dfe6c..7e8194e643 100644 --- a/ext/XS-APItest/t/handy_base.pl +++ b/ext/XS-APItest/t/handy_base.pl @@ -168,6 +168,7 @@ my %utf8_param_code = ( # This test is split into this number of files. my $num_test_files = $ENV{TEST_JOBS} || 1; +$::TEST_CHUNK = 0 if $num_test_files == 1 && ! defined $::TEST_CHUNK; $num_test_files = 10 if $num_test_files > 10; my $property_count = -1; diff --git a/ext/XS-APItest/t/utf8.t b/ext/XS-APItest/t/utf8.t index 66d36dc005..1edc02d643 100644 --- a/ext/XS-APItest/t/utf8.t +++ b/ext/XS-APItest/t/utf8.t @@ -67,6 +67,63 @@ for my $pos (0.. length($all_invariants) - 1) { } } +# Now work on variant_under_utf8_count(). +pass("The tests below are for variant_under_utf8_count() with string" + . " starting $offset bytes after a word boundary"); +is(test_variant_under_utf8_count($all_invariants, $offset, + length $all_invariants), + 0, + "$display_all_invariants contains 0 variants"); + +# First, put a variant in each possible position in the flanking partial words +for my $pos (0 .. $word_length - $offset, + 2 * $word_length .. length($all_invariants) - 1) +{ + my $test_string = $all_invariants; + my $test_display = $display_all_invariants; + + substr($test_string, $pos, 1) = $variant; + substr($test_display, $pos * 2, 2) = $display_variant; + is(test_variant_under_utf8_count($test_string, $offset, length $test_string), + 1, + "$test_display contains 1 variant"); +} + +# Then try all possible combinations of variant/invariant in the full word in +# the middle (We've already tested the case with 0 variants, so start at 1.) +for my $bit_pattern (1 .. (1 << $word_length) - 1) { + my $bits = $bit_pattern; + my $display_word = ""; + my $test_word = ""; + my $count = 0; + + # Every 1 bit gets the variant for this particular $bit_pattern. + for my $bit (0 .. 7) { + if ($bits & 1) { + $count++; + $test_word .= $variant; + $display_word .= $display_variant; + } + else { + $test_word .= $invariant; + $display_word .= $display_invariant; + } + $bits >>= 1; + } + + my $test_string = $variant x ($word_length - 1) + . $test_word + . $variant x ($word_length - 1); + my $display_string = $display_variant x ($word_length - 1) + . $display_word + . $display_variant x ($word_length - 1); + my $expected_count = $count + 2 * $word_length - 2; + is(test_variant_under_utf8_count($test_string, $offset, + length $test_string), $expected_count, + "$display_string contains $expected_count variants"); +} + + my $pound_sign = chr utf8::unicode_to_native(163); # This test file can't use byte_utf8a_to_utf8n() from t/charset_tools.pl diff --git a/inline.h b/inline.h index 70504f805c..26a1b5937e 100644 --- a/inline.h +++ b/inline.h @@ -396,7 +396,27 @@ S_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep) #define PERL_WORDSIZE sizeof(PERL_COUNT_MULTIPLIER) #define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1) - if ((STRLEN) (send - x) >= PERL_WORDSIZE) { +/* Evaluates to 0 if 'x' is at a word boundary; otherwise evaluates to 1, by + * or'ing together the lowest bits of 'x'. Hopefully the final term gets + * optimized out completely on a 32-bit system, and its mask gets optimized out + * on a 64-bit system */ +#define PERL_IS_SUBWORD_ADDR(x) (1 & ( PTR2nat(x) \ + | (PTR2nat(x) >> 1) \ + | ( (PTR2nat(x) >> 2) \ + & PERL_WORD_BOUNDARY_MASK))) + + /* Do the word-at-a-time iff there is at least one usable full word. That + * means that after advancing to a word boundary, there still is at least a + * full word left. The number of bytes needed to advance is 'wordsize - + * offset' unless offset is 0. */ + if ((STRLEN) (send - x) >= PERL_WORDSIZE + + /* This term is wordsize if subword; 0 if not */ + + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x) + + /* 'offset' */ + - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK)) + { /* Process per-byte until reach word boundary. XXX This loop could be * eliminated if we knew that this platform had fast unaligned reads */ @@ -411,8 +431,9 @@ S_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep) x++; } - /* Process per-word as long as we have at least a full word left */ - while (x + PERL_WORDSIZE <= send) { + /* Here, we know we have at least one full word to process. Process + * per-word as long as we have at least a full word left */ + do { if ((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) { /* Found a variant. Just return if caller doesn't want its @@ -425,12 +446,9 @@ S_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep) break; } x += PERL_WORDSIZE; - } + } while (x + PERL_WORDSIZE <= send); } -# undef PERL_WORDSIZE -# undef PERL_WORD_BOUNDARY_MASK -# undef PERL_VARIANTS_WORD_MASK #endif /* Process per-byte */ @@ -449,6 +467,146 @@ S_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep) return TRUE; } +#if defined(PERL_CORE) || defined(PERL_EXT) + +/* +=for apidoc variant_under_utf8_count + +This function looks at the sequence of bytes between C<s> and C<e>, which are +assumed to be encoded in ASCII/Latin1, and returns how many of them would +change should the string be translated into UTF-8. Due to the nature of UTF-8, +each of these would occupy two bytes instead of the single one in the input +string. Thus, this function returns the precise number of bytes the string +would expand by when translated to UTF-8. + +Unlike most of the other functions that have C<utf8> in their name, the input +to this function is NOT a UTF-8-encoded string. The function name is slightly +I<odd> to emphasize this. + +This function is internal to Perl because khw thinks that any XS code that +would want this is probably operating too close to the internals. Presenting a +valid use case could change that. + +See also +C<L<perlapi/is_utf8_invariant_string>> +and +C<L<perlapi/is_utf8_invariant_string_loc>>, + +=cut + +*/ + +PERL_STATIC_INLINE Size_t +S_variant_under_utf8_count(const U8* const s, const U8* const e) +{ + const U8* x = s; + Size_t count = 0; + + PERL_ARGS_ASSERT_VARIANT_UNDER_UTF8_COUNT; + +# ifndef EBCDIC + + if ((STRLEN) (e - x) >= PERL_WORDSIZE + + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x) + - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK)) + { + + /* Process per-byte until reach word boundary. XXX This loop could be + * eliminated if we knew that this platform had fast unaligned reads */ + while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) { + count += ! UTF8_IS_INVARIANT(*x++); + } + + /* Process per-word as long as we have at least a full word left */ + do { + + /* It's easier to look at a 16-bit word size to see how this works. + * The expression would be: + * + * (((*x & 0x8080) >> 7) * 0x0101) >> 8; + * + * Suppose the value of *x is the 16 bits + * + * 0by_______z_______ + * + * where the 14 bits represented by '_' could be any combination of + * 0's or 1's (we don't care), and 'y' is the high bit of one byte, + * and 'z' is the high bit for the other (endianness doesn't + * matter). On ASCII platforms a byte is variant if the high bit + * is set; invariant otherwise. Thus, our goal, the count of + * variants in this 2-byte word is + * + * y + z + * + * To turn 0by_______z_______ into (y + z) we mask the intial value + * with 0x8080 to turn it into + * + * 0by0000000z0000000 + * + * Then right shifting by 7 yields + * + * 0by0000000z + * + * Viewed as a number, this is + * + * 2**8 * y + z + * + * We then multiply by 0x0101 (which is = 2**8 + 1), so + * + * (2**8 * y + z) * (2**8 + 1) + * = (2**8 * y * 2**8) + (z * 2**8) + (2**8 * y * 1) + (z * 1) + * = (2**16 * y) + (2**8 * (y + z)) + z + * + * However (2**16 * y) doesn't fit in a 16-bit word (unless 'y' is + * zero in which case it is 0), and since this is unsigned + * multiplication, the C standard says that this component just + * gets ignored, so we are left with + * + * = 2**8 * (y + z) + z + * + * We then shift right by 8 bits, which divides by 2**8, and gets + * rid of the lone 'z', leaving us with + * + * = y + z + * + * The same principles apply for longer word sizes. For 32 bit + * words we end up with + * + * = 2**24 * (w + x + y + z) + (lots of other expressions + * below 2**24) + * + * with anything above 2**24 having overflowed and been chopped + * off. Shifting right by 24 yields (w + x + y + z) + */ + + count += ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 7) + * PERL_COUNT_MULTIPLIER) + >> ((PERL_WORDSIZE - 1) * CHARBITS); + x += PERL_WORDSIZE; + } while (x + PERL_WORDSIZE <= e); + } + +# endif + + /* Process per-byte */ + while (x < e) { + if (! UTF8_IS_INVARIANT(*x)) { + count++; + } + + x++; + } + + return count; +} + +#endif + +#undef PERL_WORDSIZE +#undef PERL_COUNT_MULTIPLIER +#undef PERL_WORD_BOUNDARY_MASK +#undef PERL_VARIANTS_WORD_MASK + /* =for apidoc is_utf8_string diff --git a/proto.h b/proto.h index 9770feee04..2a2f25a3e6 100644 --- a/proto.h +++ b/proto.h @@ -4313,6 +4313,15 @@ PERL_STATIC_INLINE STRLEN S_sv_or_pv_pos_u2b(pTHX_ SV *sv, const char *pv, STRLE #define PERL_ARGS_ASSERT_SV_OR_PV_POS_U2B \ assert(sv); assert(pv) #endif +#endif +#if defined(PERL_CORE) || defined(PERL_EXT) +#ifndef PERL_NO_INLINE_FUNCTIONS +PERL_STATIC_INLINE Size_t S_variant_under_utf8_count(const U8* const s, const U8* const e) + __attribute__warn_unused_result__; +#define PERL_ARGS_ASSERT_VARIANT_UNDER_UTF8_COUNT \ + assert(s); assert(e) +#endif + #endif #if defined(PERL_CR_FILTER) # if defined(PERL_IN_TOKE_C) -- Perl5 Master Repository