In perl.git, the branch blead has been updated

<https://perl5.git.perl.org/perl.git/commitdiff/03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1?hp=9b91093d3e74d13a6e4f67269a587f4ab397998a>

- Log -----------------------------------------------------------------
commit 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1
Author: Karl Williamson <k...@cpan.org>
Date:   Wed Nov 22 22:30:16 2017 -0700

    Add variant_under_utf8_count() core function
    
    This function takes a string that isn't encoded in UTF-8 (hence is
    assumed to be in Latin1), and counts how many of the bytes therein
    would change if it were to be translated into UTF-8.  Each such byte
    would occupy two UTF-8 bytes.
    
    This function is useful for calculating the expansion factor precisely
    when converting to UTF-8, so as to know how much to malloc.
    
    This function uses a non-obvious method to do the calculations
    word-at-a-time, as opposed to the byte-at-a-time method used now, and
    hence should be much faster than the current methods.
    
    The performance change in short string lengths is equivocal.  Here is
    the result for a single character and a 64-bit word.
    
              bytes    words Ratio %
            -------- -------- -------
         Ir    932.0    947.0    98.4
         Dr    325.0    325.0   100.0
         Dw    104.0    104.0   100.0
       COND    136.0    137.0    99.3
        IND     28.0     28.0   100.0
    
     COND_m      1.0      0.0   Inf
      IND_m      6.0      6.0   100.0
    
    There are some extra instructions executed and an extra branch to check
    for and handle the case where we can go word-by-word vs. not.  But the
    one cache miss is removed.
    
    The results are essentially the same until we get to being able to
    handle a full word.  Some of the extra instructions are to ensure that
    if the input is not aligned on a word boundary, that performance doesn't
    suffer.
    
    Here's the results for 8-bytes on a 64-bit system.
    
               bytes    words Ratio %
            -------- -------- -------
         Ir    974.0    955.0   102.0
         Dr    332.0    325.0   102.2
         Dw    104.0    104.0   100.0
       COND    143.0    138.0   103.6
        IND     28.0     28.0   100.0
    
     COND_m      1.0      0.0     Inf
      IND_m      6.0      6.0   100.0
    
    Things keep improving as the strings get longer.  Here's for 24 bytes.
    
               bytes    words Ratio %
            -------- -------- -------
         Ir   1070.0    975.0   109.7
         Dr    348.0    327.0   106.4
         Dw    104.0    104.0   100.0
       COND    159.0    140.0   113.6
        IND     28.0     28.0   100.0
    
     COND_m      2.0      0.0     Inf
      IND_m      6.0      6.0   100.0
    
    And 96:
    
               bytes    words Ratio %
            -------- -------- -------
         Ir   1502.0   1065.0   141.0
         Dr    420.0    336.0   125.0
         Dw    104.0    104.0   100.0
       COND    231.0    149.0   155.0
        IND     28.0     28.0   100.0
    
     COND_m      2.0      1.0   200.0
      IND_m      6.0      6.0   100.0
    
    And 10,000
    
               bytes    words Ratio %
            -------- -------- -------
         Ir  60926.0  13445.0   453.1
         Dr  10324.0   1574.0   655.9
         Dw    104.0    104.0   100.0
       COND  10135.0   1387.0   730.7
        IND     28.0     28.0   100.0
    
     COND_m      2.0      1.0   200.0
      IND_m      6.0      6.0   100.0
    
    I found this trick on the internet many years ago, but I can't seem to
    find it again to give them credit.

commit 099e59a45fd0c4d6657bf384e0539691eb7b1f24
Author: Karl Williamson <k...@cpan.org>
Date:   Mon Dec 11 18:17:29 2017 -0700

    is_utf8_invariant_string(): small speed optimization
    
    This adds a few shifing, masking, and integer arithmetic operations to a
    conditional which in return makes sure that one branch is taken only
    when it is going to do some good, avoiding a conditional in it.

commit 16ef5c6e5c4a7d414ca7ef46cff9f8015fcd9079
Author: Karl Williamson <k...@cpan.org>
Date:   Mon Dec 11 18:26:52 2017 -0700

    APItest/t/handy_base.pl: Avoid uninitialized warning
    
    This .pl in /t is generally called from a test file in that directory,
    but if run by hand, this commit makes sure things are properly
    initialized

-----------------------------------------------------------------------

Summary of changes:
 embed.fnc                      |   4 +
 embed.h                        |   3 +
 ext/XS-APItest/APItest.xs      |  14 +++-
 ext/XS-APItest/t/handy_base.pl |   1 +
 ext/XS-APItest/t/utf8.t        |  57 ++++++++++++++
 inline.h                       | 172 +++++++++++++++++++++++++++++++++++++++--
 proto.h                        |   9 +++
 7 files changed, 252 insertions(+), 8 deletions(-)

diff --git a/embed.fnc b/embed.fnc
index bfbc63af5a..ad4df86324 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -782,6 +782,10 @@ AndmoR     |bool   |is_utf8_invariant_string|NN const U8* 
const s              \
 AnidR  |bool   |is_utf8_invariant_string_loc|NN const U8* const s          \
                |STRLEN len                                                 \
                |NULLOK const U8 ** ep
+#if defined(PERL_CORE) || defined(PERL_EXT)
+EinR   |Size_t |variant_under_utf8_count|NN const U8* const s              \
+               |NN const U8* const e
+#endif
 AmnpdRP        |bool   |is_ascii_string|NN const U8* const s|const STRLEN len
 AmnpdRP        |bool   |is_invariant_string|NN const U8* const s|STRLEN len
 #if defined(PERL_CORE) || defined (PERL_EXT)
diff --git a/embed.h b/embed.h
index 06002a1b9a..fb4832d43e 100644
--- a/embed.h
+++ b/embed.h
@@ -994,6 +994,9 @@
 #define is_utf8_non_invariant_string   S_is_utf8_non_invariant_string
 #define sv_or_pv_pos_u2b(a,b,c,d)      S_sv_or_pv_pos_u2b(aTHX_ a,b,c,d)
 #  endif
+#  if defined(PERL_CORE) || defined(PERL_EXT)
+#define variant_under_utf8_count       S_variant_under_utf8_count
+#  endif
 #  if defined(PERL_IN_REGCOMP_C)
 #define _make_exactf_invlist(a,b)      S__make_exactf_invlist(aTHX_ a,b)
 #define add_above_Latin1_folds(a,b,c)  S_add_above_Latin1_folds(aTHX_ a,b,c)
diff --git a/ext/XS-APItest/APItest.xs b/ext/XS-APItest/APItest.xs
index a2d5d697b6..144d62488d 100644
--- a/ext/XS-APItest/APItest.xs
+++ b/ext/XS-APItest/APItest.xs
@@ -6045,7 +6045,7 @@ test_is_utf8_invariant_string_loc(unsigned char *s, 
STRLEN offset, STRLEN len)
          * is to start at.  Allocate space that does start at the word
          * boundary, and copy 's' to the correct offset past it.  Then call the
          * tested function with that position */
-        Newx(copy, (len + WORDSIZE - 1) / WORDSIZE, PERL_UINTMAX_T);
+        Newx(copy, 1 + ((len + WORDSIZE - 1) / WORDSIZE), PERL_UINTMAX_T);
         Copy(s, (U8 *) copy + offset, len, U8);
         av = newAV();
         av_push(av, newSViv(is_utf8_invariant_string_loc((U8 *) copy + offset, 
len, &ep)));
@@ -6055,6 +6055,18 @@ test_is_utf8_invariant_string_loc(unsigned char *s, 
STRLEN offset, STRLEN len)
     OUTPUT:
         RETVAL
 
+STRLEN
+test_variant_under_utf8_count(unsigned char *s, STRLEN offset, STRLEN len)
+    PREINIT:
+        PERL_UINTMAX_T * copy;
+    CODE:
+        Newx(copy, 1 + ((len + WORDSIZE - 1) / WORDSIZE), PERL_UINTMAX_T);
+        Copy(s, (U8 *) copy + offset, len, U8);
+        RETVAL = variant_under_utf8_count((U8 *) copy + offset, (U8 *) copy + 
offset + len);
+        Safefree(copy);
+    OUTPUT:
+        RETVAL
+
 STRLEN
 test_utf8_length(unsigned char *s, STRLEN offset, STRLEN len)
 CODE:
diff --git a/ext/XS-APItest/t/handy_base.pl b/ext/XS-APItest/t/handy_base.pl
index 676f7dfe6c..7e8194e643 100644
--- a/ext/XS-APItest/t/handy_base.pl
+++ b/ext/XS-APItest/t/handy_base.pl
@@ -168,6 +168,7 @@ my %utf8_param_code = (
 
 # This test is split into this number of files.
 my $num_test_files = $ENV{TEST_JOBS} || 1;
+$::TEST_CHUNK = 0 if $num_test_files == 1 && ! defined $::TEST_CHUNK;
 $num_test_files = 10 if $num_test_files > 10;
 
 my $property_count = -1;
diff --git a/ext/XS-APItest/t/utf8.t b/ext/XS-APItest/t/utf8.t
index 66d36dc005..1edc02d643 100644
--- a/ext/XS-APItest/t/utf8.t
+++ b/ext/XS-APItest/t/utf8.t
@@ -67,6 +67,63 @@ for my $pos (0.. length($all_invariants) - 1) {
     }
 }
 
+# Now work on variant_under_utf8_count().
+pass("The tests below are for variant_under_utf8_count() with string"
+   . " starting $offset bytes after a word boundary");
+is(test_variant_under_utf8_count($all_invariants, $offset,
+                                length $all_invariants),
+                                0,
+                                "$display_all_invariants contains 0 variants");
+
+# First, put a variant in each possible position in the flanking partial words
+for my $pos (0 .. $word_length - $offset,
+             2 * $word_length .. length($all_invariants) - 1)
+{
+    my $test_string = $all_invariants;
+    my $test_display = $display_all_invariants;
+
+    substr($test_string, $pos, 1) = $variant;
+    substr($test_display, $pos * 2, 2) = $display_variant;
+    is(test_variant_under_utf8_count($test_string, $offset, length 
$test_string),
+                                     1,
+                                     "$test_display contains 1 variant");
+}
+
+# Then try all possible combinations of variant/invariant in the full word in
+# the middle  (We've already tested the case with 0 variants, so start at 1.)
+for my $bit_pattern (1 .. (1 << $word_length) - 1) {
+    my $bits = $bit_pattern;
+    my $display_word = "";
+    my $test_word = "";
+    my $count = 0;
+
+    # Every 1 bit gets the variant for this particular $bit_pattern.
+    for my $bit (0 .. 7) {
+        if ($bits & 1) {
+            $count++;
+            $test_word .= $variant;
+            $display_word .= $display_variant;
+        }
+        else {
+            $test_word .= $invariant;
+            $display_word .= $display_invariant;
+        }
+        $bits >>= 1;
+    }
+
+    my $test_string = $variant x ($word_length - 1)
+                    . $test_word
+                    . $variant x ($word_length - 1);
+    my $display_string = $display_variant x ($word_length - 1)
+                        . $display_word
+                        . $display_variant x ($word_length - 1);
+    my $expected_count = $count + 2 * $word_length - 2;
+    is(test_variant_under_utf8_count($test_string, $offset,
+                        length $test_string), $expected_count,
+                        "$display_string contains $expected_count variants");
+}
+
+
 my $pound_sign = chr utf8::unicode_to_native(163);
 
 # This  test file can't use byte_utf8a_to_utf8n() from t/charset_tools.pl
diff --git a/inline.h b/inline.h
index 70504f805c..26a1b5937e 100644
--- a/inline.h
+++ b/inline.h
@@ -396,7 +396,27 @@ S_is_utf8_invariant_string_loc(const U8* const s, STRLEN 
len, const U8 ** ep)
 #define PERL_WORDSIZE            sizeof(PERL_COUNT_MULTIPLIER)
 #define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1)
 
-    if ((STRLEN) (send - x) >= PERL_WORDSIZE) {
+/* Evaluates to 0 if 'x' is at a word boundary; otherwise evaluates to 1, by
+ * or'ing together the lowest bits of 'x'.  Hopefully the final term gets
+ * optimized out completely on a 32-bit system, and its mask gets optimized out
+ * on a 64-bit system */
+#define PERL_IS_SUBWORD_ADDR(x) (1 & (     PTR2nat(x)                      \
+                                      |   (PTR2nat(x) >> 1)                \
+                                      | ( (PTR2nat(x) >> 2)                \
+                                         & PERL_WORD_BOUNDARY_MASK)))
+
+    /* Do the word-at-a-time iff there is at least one usable full word.  That
+     * means that after advancing to a word boundary, there still is at least a
+     * full word left.  The number of bytes needed to advance is 'wordsize -
+     * offset' unless offset is 0. */
+    if ((STRLEN) (send - x) >= PERL_WORDSIZE
+
+                            /* This term is wordsize if subword; 0 if not */
+                          + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
+
+                            /* 'offset' */
+                          - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
+    {
 
         /* Process per-byte until reach word boundary.  XXX This loop could be
          * eliminated if we knew that this platform had fast unaligned reads */
@@ -411,8 +431,9 @@ S_is_utf8_invariant_string_loc(const U8* const s, STRLEN 
len, const U8 ** ep)
             x++;
         }
 
-        /* Process per-word as long as we have at least a full word left */
-        while (x + PERL_WORDSIZE <= send) {
+        /* Here, we know we have at least one full word to process.  Process
+         * per-word as long as we have at least a full word left */
+        do {
             if ((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK)  {
 
                 /* Found a variant.  Just return if caller doesn't want its
@@ -425,12 +446,9 @@ S_is_utf8_invariant_string_loc(const U8* const s, STRLEN 
len, const U8 ** ep)
                 break;
             }
             x += PERL_WORDSIZE;
-        }
+        } while (x + PERL_WORDSIZE <= send);
     }
 
-#  undef PERL_WORDSIZE
-#  undef PERL_WORD_BOUNDARY_MASK
-#  undef PERL_VARIANTS_WORD_MASK
 #endif
 
     /* Process per-byte */
@@ -449,6 +467,146 @@ S_is_utf8_invariant_string_loc(const U8* const s, STRLEN 
len, const U8 ** ep)
     return TRUE;
 }
 
+#if defined(PERL_CORE) || defined(PERL_EXT)
+
+/*
+=for apidoc variant_under_utf8_count
+
+This function looks at the sequence of bytes between C<s> and C<e>, which are
+assumed to be encoded in ASCII/Latin1, and returns how many of them would
+change should the string be translated into UTF-8.  Due to the nature of UTF-8,
+each of these would occupy two bytes instead of the single one in the input
+string.  Thus, this function returns the precise number of bytes the string
+would expand by when translated to UTF-8.
+
+Unlike most of the other functions that have C<utf8> in their name, the input
+to this function is NOT a UTF-8-encoded string.  The function name is slightly
+I<odd> to emphasize this.
+
+This function is internal to Perl because khw thinks that any XS code that
+would want this is probably operating too close to the internals.  Presenting a
+valid use case could change that.
+
+See also
+C<L<perlapi/is_utf8_invariant_string>>
+and
+C<L<perlapi/is_utf8_invariant_string_loc>>,
+
+=cut
+
+*/
+
+PERL_STATIC_INLINE Size_t
+S_variant_under_utf8_count(const U8* const s, const U8* const e)
+{
+    const U8* x = s;
+    Size_t count = 0;
+
+    PERL_ARGS_ASSERT_VARIANT_UNDER_UTF8_COUNT;
+
+#  ifndef EBCDIC
+
+    if ((STRLEN) (e - x) >= PERL_WORDSIZE
+                          + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
+                          - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
+    {
+
+        /* Process per-byte until reach word boundary.  XXX This loop could be
+         * eliminated if we knew that this platform had fast unaligned reads */
+        while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
+            count += ! UTF8_IS_INVARIANT(*x++);
+        }
+
+        /* Process per-word as long as we have at least a full word left */
+        do {
+
+            /* It's easier to look at a 16-bit word size to see how this works.
+             * The expression would be:
+             *
+             *  (((*x & 0x8080) >> 7) * 0x0101) >> 8;
+             *
+             * Suppose the value of *x is the 16 bits
+             *
+             *      0by_______z_______
+             *
+             * where the 14 bits represented by '_' could be any combination of
+             * 0's or 1's (we don't care), and 'y' is the high bit of one byte,
+             * and 'z' is the high bit for the other (endianness doesn't
+             * matter).  On ASCII platforms a byte is variant if the high bit
+             * is set; invariant otherwise.  Thus, our goal, the count of
+             * variants in this 2-byte word is
+             *
+             *      y + z
+             *
+             * To turn 0by_______z_______ into (y + z) we mask the intial value
+             * with 0x8080 to turn it into
+             *
+             *      0by0000000z0000000
+             *
+             * Then right shifting by 7 yields
+             *
+             *      0by0000000z
+             *
+             * Viewed as a number, this is
+             *
+             *      2**8 * y + z
+             *
+             * We then multiply by 0x0101 (which is = 2**8 + 1), so
+             *
+             *       (2**8 * y + z) * (2**8 + 1)
+             *     = (2**8 * y * 2**8) + (z * 2**8) + (2**8 * y * 1) + (z * 1)
+             *     = (2**16 * y) + (2**8 * (y + z)) + z
+             *
+             * However (2**16 * y) doesn't fit in a 16-bit word (unless 'y' is
+             * zero in which case it is 0), and since this is unsigned
+             * multiplication, the C standard says that this component just
+             * gets ignored, so we are left with
+             *
+             *     =  2**8 * (y + z) + z
+             *
+             * We then shift right by 8 bits, which divides by 2**8, and gets
+             * rid of the lone 'z', leaving us with
+             *
+             *     =  y + z
+             *
+             * The same principles apply for longer word sizes.  For 32 bit
+             * words we end up with
+             *
+             *     =  2**24 * (w + x + y + z) + (lots of other expressions
+             *                                   below 2**24)
+             *
+             * with anything above 2**24 having overflowed and been chopped
+             * off.  Shifting right by 24 yields (w + x + y + z)
+             */
+
+            count += ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 
7)
+                      * PERL_COUNT_MULTIPLIER)
+                    >> ((PERL_WORDSIZE - 1) * CHARBITS);
+            x += PERL_WORDSIZE;
+        } while (x + PERL_WORDSIZE <= e);
+    }
+
+#  endif
+
+    /* Process per-byte */
+    while (x < e) {
+       if (! UTF8_IS_INVARIANT(*x)) {
+            count++;
+        }
+
+        x++;
+    }
+
+    return count;
+}
+
+#endif
+
+#undef PERL_WORDSIZE
+#undef PERL_COUNT_MULTIPLIER
+#undef PERL_WORD_BOUNDARY_MASK
+#undef PERL_VARIANTS_WORD_MASK
+
 /*
 =for apidoc is_utf8_string
 
diff --git a/proto.h b/proto.h
index 9770feee04..2a2f25a3e6 100644
--- a/proto.h
+++ b/proto.h
@@ -4313,6 +4313,15 @@ PERL_STATIC_INLINE STRLEN        
S_sv_or_pv_pos_u2b(pTHX_ SV *sv, const char *pv, STRLE
 #define PERL_ARGS_ASSERT_SV_OR_PV_POS_U2B      \
        assert(sv); assert(pv)
 #endif
+#endif
+#if defined(PERL_CORE) || defined(PERL_EXT)
+#ifndef PERL_NO_INLINE_FUNCTIONS
+PERL_STATIC_INLINE Size_t      S_variant_under_utf8_count(const U8* const s, 
const U8* const e)
+                       __attribute__warn_unused_result__;
+#define PERL_ARGS_ASSERT_VARIANT_UNDER_UTF8_COUNT      \
+       assert(s); assert(e)
+#endif
+
 #endif
 #if defined(PERL_CR_FILTER)
 #  if defined(PERL_IN_TOKE_C)

-- 
Perl5 Master Repository

Reply via email to