In perl.git, the branch blead has been updated

<https://perl5.git.perl.org/perl.git/commitdiff/f8edfb87c2ebbcbe6d91f1cc4cbd0f085d3b44c1?hp=03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1>

- Log -----------------------------------------------------------------
commit f8edfb87c2ebbcbe6d91f1cc4cbd0f085d3b44c1
Author: Karl Williamson <k...@cpan.org>
Date:   Thu Dec 7 18:47:34 2017 -0700

    sv.c: White-space only
    
    This outdents some lines that the previous commit removed an enclosing
    block of.

commit c58971e910419d745ca5a1e6c474d930af17738a
Author: Karl Williamson <k...@cpan.org>
Date:   Thu Dec 7 18:42:53 2017 -0700

    utf8_upgrade_flags_grow(): Use faster variant count
    
    Now that we have a much faster way of counting the characters that would
    expand to two bytes when a string is translated into UTF-8, use that,
    instead of trying to choose one of two methods.  This simplifies the
    code.  Note that the faster method doesn't happen on EBCDIC platforms,
    but the simplification is worth the potential loss of being able to
    choose methods.

commit 74472cc263121e60f8787f15c2dc2fc910dd8683
Author: Karl Williamson <k...@cpan.org>
Date:   Mon Dec 11 19:13:45 2017 -0700

    inline.h: Change extensive comment to a link
    
    I'm thinking the explanation of how the algorithm works  doesn't belong
    in the code, so replace it by the commit number so it can be found
    quickly.

-----------------------------------------------------------------------

Summary of changes:
 inline.h |  63 +---------------
 sv.c     | 252 +++++++++++++++++++--------------------------------------------
 2 files changed, 76 insertions(+), 239 deletions(-)

diff --git a/inline.h b/inline.h
index 26a1b5937e..28bc1f5bf9 100644
--- a/inline.h
+++ b/inline.h
@@ -518,67 +518,8 @@ S_variant_under_utf8_count(const U8* const s, const U8* 
const e)
         }
 
         /* Process per-word as long as we have at least a full word left */
-        do {
-
-            /* It's easier to look at a 16-bit word size to see how this works.
-             * The expression would be:
-             *
-             *  (((*x & 0x8080) >> 7) * 0x0101) >> 8;
-             *
-             * Suppose the value of *x is the 16 bits
-             *
-             *      0by_______z_______
-             *
-             * where the 14 bits represented by '_' could be any combination of
-             * 0's or 1's (we don't care), and 'y' is the high bit of one byte,
-             * and 'z' is the high bit for the other (endianness doesn't
-             * matter).  On ASCII platforms a byte is variant if the high bit
-             * is set; invariant otherwise.  Thus, our goal, the count of
-             * variants in this 2-byte word is
-             *
-             *      y + z
-             *
-             * To turn 0by_______z_______ into (y + z) we mask the intial value
-             * with 0x8080 to turn it into
-             *
-             *      0by0000000z0000000
-             *
-             * Then right shifting by 7 yields
-             *
-             *      0by0000000z
-             *
-             * Viewed as a number, this is
-             *
-             *      2**8 * y + z
-             *
-             * We then multiply by 0x0101 (which is = 2**8 + 1), so
-             *
-             *       (2**8 * y + z) * (2**8 + 1)
-             *     = (2**8 * y * 2**8) + (z * 2**8) + (2**8 * y * 1) + (z * 1)
-             *     = (2**16 * y) + (2**8 * (y + z)) + z
-             *
-             * However (2**16 * y) doesn't fit in a 16-bit word (unless 'y' is
-             * zero in which case it is 0), and since this is unsigned
-             * multiplication, the C standard says that this component just
-             * gets ignored, so we are left with
-             *
-             *     =  2**8 * (y + z) + z
-             *
-             * We then shift right by 8 bits, which divides by 2**8, and gets
-             * rid of the lone 'z', leaving us with
-             *
-             *     =  y + z
-             *
-             * The same principles apply for longer word sizes.  For 32 bit
-             * words we end up with
-             *
-             *     =  2**24 * (w + x + y + z) + (lots of other expressions
-             *                                   below 2**24)
-             *
-             * with anything above 2**24 having overflowed and been chopped
-             * off.  Shifting right by 24 yields (w + x + y + z)
-             */
-
+        do {    /* Commit 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1 contains an
+                   explanation of how this works */
             count += ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 
7)
                       * PERL_COUNT_MULTIPLIER)
                     >> ((PERL_WORDSIZE - 1) * CHARBITS);
diff --git a/sv.c b/sv.c
index a40d0aa5f0..924a7e3058 100644
--- a/sv.c
+++ b/sv.c
@@ -3401,11 +3401,7 @@ if all the bytes are invariant in UTF-8.
 If C<flags> has C<SV_GMAGIC> bit set,
 will C<mg_get> on C<sv> if appropriate, else not.
 
-If C<flags> has C<SV_FORCE_UTF8_UPGRADE> set, this function assumes that the PV
-will expand when converted to UTF-8, and skips the extra work of checking for
-that.  Typically this flag is used by a routine that has already parsed the
-string and found such characters, and passes this information on so that the
-work doesn't have to be repeated.
+The C<SV_FORCE_UTF8_UPGRADE> flag is now ignored.
 
 Returns the number of bytes in the converted string.
 
@@ -3426,22 +3422,10 @@ Returns the number of bytes in the converted string 
(not including the spares).
 
 =cut
 
-(One might think that the calling routine could pass in the position of the
-first variant character when it has set SV_FORCE_UTF8_UPGRADE, so it wouldn't
-have to be found again.  But that is not the case, because typically when the
-caller is likely to use this flag, it won't be calling this routine unless it
-finds something that won't fit into a byte.  Otherwise it tries to not upgrade
-and just use bytes.  But some things that do fit into a byte are variants in
-utf8, and the caller may not have been keeping track of these.)
-
 If the routine itself changes the string, it adds a trailing C<NUL>.  Such a
 C<NUL> isn't guaranteed due to having other routines do the work in some input
 cases, or if the input is already flagged as being in utf8.
 
-The speed of this could perhaps be improved for many cases if someone wanted to
-write a fast function that counts the number of variant characters in a string,
-especially if it could return the position of the first one.
-
 */
 
 STRLEN
@@ -3484,183 +3468,96 @@ Perl_sv_utf8_upgrade_flags_grow(pTHX_ SV *const sv, 
const I32 flags, STRLEN extr
        /* This function could be much more efficient if we
         * had a FLAG in SVs to signal if there are any variant
         * chars in the PV.  Given that there isn't such a flag
-        * make the loop as fast as possible (although there are certainly ways
-        * to speed this up, eg. through vectorization) */
+        * make the loop as fast as possible. */
        U8 * s = (U8 *) SvPVX_const(sv);
-       U8 * e = (U8 *) SvEND(sv);
        U8 *t = s;
-       STRLEN two_byte_count;
        
-       if (flags & SV_FORCE_UTF8_UPGRADE) {
-            two_byte_count = 0;
-        }
-        else {
-            if (is_utf8_invariant_string_loc(s, SvCUR(sv), (const U8 **) &t)) {
-
-                /* utf8 conversion not needed because all are invariants.  Mark
-                 * as UTF-8 even if no variant - saves scanning loop */
-                SvUTF8_on(sv);
-                if (extra) SvGROW(sv, SvCUR(sv) + extra);
-                return SvCUR(sv);
-            }
+        if (is_utf8_invariant_string_loc(s, SvCUR(sv), (const U8 **) &t)) {
 
-            /* Here, there is at least one variant, and t points to the first
-             * one */
-            two_byte_count = 1;
+            /* utf8 conversion not needed because all are invariants.  Mark
+             * as UTF-8 even if no variant - saves scanning loop */
+            SvUTF8_on(sv);
+            if (extra) SvGROW(sv, SvCUR(sv) + extra);
+            return SvCUR(sv);
         }
 
-        /* Note that the incoming SV may not have a trailing '\0', as certain
-         * code in pp_formline can send us partially built SVs.
+        /* Here, there is at least one variant (t points to the first one), so
+         * the string should be converted to utf8.  Everything from 's' to
+         * 't - 1' will occupy only 1 byte each on output.
          *
-        * Here, the string should be converted to utf8, either because of an
-         * input flag (which causes two_byte_count to be set to 0), or because
-         * a character that requires 2 bytes was found (two_byte_count = 1).  t
-         * points either to the beginning of the string (if we didn't examine
-         * anything), or to the first variant.  In either case, everything from
-         * s to t - 1 will occupy only 1 byte each on output.
+         * Note that the incoming SV may not have a trailing '\0', as certain
+         * code in pp_formline can send us partially built SVs.
         *
         * There are two main ways to convert.  One is to create a new string
         * and go through the input starting from the beginning, appending each
-        * converted value onto the new string as we go along.  It's probably
-        * best to allocate enough space in the string for the worst possible
-        * case rather than possibly running out of space and having to
-        * reallocate and then copy what we've done so far.  Since everything
-        * from s to t - 1 is invariant, the destination can be initialized
-        * with these using a fast memory copy
+         * converted value onto the new string as we go along.  Going this
+         * route, it's probably best to initially allocate enough space in the
+         * string rather than possibly running out of space and having to
+         * reallocate and then copy what we've done so far.  Since everything
+         * from 's' to 't - 1' is invariant, the destination can be initialized
+         * with these using a fast memory copy.  To be sure to allocate enough
+         * space, one could use the worst case scenario, where every remaining
+         * byte expands to two under UTF-8, or one could parse it and count
+         * exactly how many do expand.
         *
-         * The other way is to figure out exactly how big the string should be,
-        * by parsing the entire input.  Then you don't have to make it big
-        * enough to handle the worst possible case, and more importantly, if
-        * the string you already have is large enough, you don't have to
-        * allocate a new string, you can copy the last character in the input
-        * string to the final position(s) that will be occupied by the
-        * converted string and go backwards, stopping at t, since everything
-        * before that is invariant.
+         * The other way is to unconditionally parse the remainder of the
+         * string to figure out exactly how big the expanded string will be,
+         * growing if needed.  Then start at the end of the string and place
+         * the character there at the end of the unfilled space in the expanded
+         * one, working backwards until reaching 't'.
         *
-        * There are advantages and disadvantages to each method.
-        *
-        * In the first method, we can allocate a new string, do the memory
-        * copy from the s to t - 1, and then proceed through the rest of the
-        * string byte-by-byte.
-        *
-        * In the second method, we proceed through the rest of the input
-        * string just calculating how big the converted string will be.  Then
-        * there are two cases:
-        *  1)  if the string has enough extra space to handle the converted
-        *      value.  We go backwards through the string, converting until we
-        *      get to the position we are at now, and then stop.  If this
-        *      position is far enough along in the string, this method is
-         *     faster than the first method above.  If the memory copy were
-         *     the same speed as the byte-by-byte loop, that position would be
-         *     about half-way, as at the half-way mark, parsing to the end and
-         *     back is one complete string's parse, the same amount as
-         *     starting over and going all the way through.  Actually, it
-         *     would be somewhat less than half-way, as it's faster to just
-         *     count bytes than to also copy, and we don't have the overhead
-         *     of allocating a new string, changing the scalar to use it, and
-         *     freeing the existing one.  But if the memory copy is fast, the
-         *     break-even point is somewhere after half way.  The counting
-         *     loop could be sped up by vectorization, etc, to move the
-         *     break-even point further towards the beginning.
-        *  2)  if the string doesn't have enough space to handle the converted
-        *      value.  A new string will have to be allocated, and one might
-        *      as well, given that, start from the beginning doing the first
-        *      method.  We've spent extra time parsing the string and in
-        *      exchange all we've gotten is that we know precisely how big to
-        *      make the new one.  Perl is more optimized for time than space,
-        *      so this case is a loser.
-        * So what I've decided to do is not use the 2nd method unless it is
-        * guaranteed that a new string won't have to be allocated, assuming
-        * the worst case.  I also decided not to put any more conditions on it
-        * than this, for now.  It seems likely that, since the worst case is
-        * twice as big as the unknown portion of the string (plus 1), we won't
-        * be guaranteed enough space, causing us to go to the first method,
-        * unless the string is short, or the first variant character is near
-        * the end of it.  In either of these cases, it seems best to use the
-        * 2nd method.  The only circumstance I can think of where this would
-        * be really slower is if the string had once had much more data in it
-        * than it does now, but there is still a substantial amount in it  */
+         * The problem with assuming the worst case scenario is that for very
+         * long strings, we could allocate much more memory than actually
+         * needed, which can create performance problems.  If we have to parse
+         * anyway, the second method is the winner as it may avoid an extra
+         * copy.  The code used to use the first method under some
+         * circumstances, but now that there is faster variant counting on
+         * ASCII platforms, the second method is used exclusively, eliminating
+         * some code that no longer has to be maintained. */
 
        {
-           STRLEN invariant_head = t - s;
-           STRLEN size = invariant_head + (e - t) * 2 + 1 + extra;
-           if (SvLEN(sv) < size) {
-
-               /* Here, have decided to allocate a new string */
-
-               U8 *dst;
-               U8 *d;
-
-               Newx(dst, size, U8);
-
-               /* If no known invariants at the beginning of the input string,
-                * set so starts from there.  Otherwise, can use memory copy to
-                * get up to where we are now, and then start from here */
-
-               if (invariant_head == 0) {
-                   d = dst;
-               } else {
-                   Copy(s, dst, invariant_head, char);
-                   d = dst + invariant_head;
-               }
-
-               while (t < e) {
-                    append_utf8_from_native_byte(*t, &d);
-                    t++;
-               }
-               *d = '\0';
-               SvPV_free(sv); /* No longer using pre-existing string */
-               SvPV_set(sv, (char*)dst);
-               SvCUR_set(sv, d - dst);
-               SvLEN_set(sv, size);
-           } else {
-
-               /* Here, have decided to get the exact size of the string.
-                * Currently this happens only when we know that there is
-                * guaranteed enough space to fit the converted string, so
-                * don't have to worry about growing.  If two_byte_count is 0,
-                * then t points to the first byte of the string which hasn't
-                * been examined yet.  Otherwise two_byte_count is 1, and t
-                * points to the first byte in the string that will expand to
-                * two.  Depending on this, start examining at t or 1 after t.
-                * */
-
-               U8 *d = t + two_byte_count;
-
-
-               /* Count up the remaining bytes that expand to two */
-
-               while (d < e) {
-                   const U8 chr = *d++;
-                   if (! NATIVE_BYTE_IS_INVARIANT(chr)) two_byte_count++;
-               }
-
-               /* The string will expand by just the number of bytes that
-                * occupy two positions.  But we are one afterwards because of
-                * the increment just above.  This is the place to put the
-                * trailing NUL, and to set the length before we decrement */
-
-               d += two_byte_count;
-               SvCUR_set(sv, d - s);
-               *d-- = '\0';
+            /* Count the total number of variants there are.  We can start
+             * just beyond the first one, which is known to be at 't' */
+            const Size_t invariant_length = t - s;
+            U8 * e = (U8 *) SvEND(sv);
+
+            /* The length of the left overs, plus 1. */
+            const Size_t remaining_length_p1 = e - t;
+
+            /* We expand by 1 for the variant at 't' and one for each remaining
+             * variant (we start looking at 't+1') */
+            Size_t expansion = 1 + variant_under_utf8_count(t + 1, e);
+
+            /* +1 = trailing NUL */
+            Size_t need = SvCUR(sv) + expansion + extra + 1;
+            U8 * d;
+
+            /* Grow if needed */
+            if (SvLEN(sv) < need) {
+                t = invariant_length + (U8*) SvGROW(sv, need);
+                e = t + remaining_length_p1;
+            }
+            SvCUR_set(sv, invariant_length + remaining_length_p1 + expansion);
 
+            /* Set the NUL at the end */
+            d = (U8 *) SvEND(sv);
+            *d-- = '\0';
 
-               /* Having decremented d, it points to the position to put the
-                * very last byte of the expanded string.  Go backwards through
-                * the string, copying and expanding as we go, stopping when we
-                * get to the part that is invariant the rest of the way down */
+            /* Having decremented d, it points to the position to put the
+             * very last byte of the expanded string.  Go backwards through
+             * the string, copying and expanding as we go, stopping when we
+             * get to the part that is invariant the rest of the way down */
 
-               e--;
-               while (e >= t) {
-                   if (NATIVE_BYTE_IS_INVARIANT(*e)) {
-                       *d-- = *e;
-                   } else {
-                       *d-- = UTF8_EIGHT_BIT_LO(*e);
-                       *d-- = UTF8_EIGHT_BIT_HI(*e);
-                   }
-                    e--;
-               }
-           }
+            e--;
+            while (e >= t) {
+                if (NATIVE_BYTE_IS_INVARIANT(*e)) {
+                    *d-- = *e;
+                } else {
+                    *d-- = UTF8_EIGHT_BIT_LO(*e);
+                    *d-- = UTF8_EIGHT_BIT_HI(*e);
+                }
+                e--;
+            }
 
            if (SvTYPE(sv) >= SVt_PVMG && SvMAGIC(sv)) {
                /* Update pos. We do it at the end rather than during
@@ -3679,7 +3576,6 @@ Perl_sv_utf8_upgrade_flags_grow(pTHX_ SV *const sv, const 
I32 flags, STRLEN extr
        }
     }
 
-    /* Mark as UTF-8 even if no variant - saves scanning loop */
     SvUTF8_on(sv);
     return SvCUR(sv);
 }

-- 
Perl5 Master Repository

Reply via email to