In perl.git, the branch blead has been updated

<http://perl5.git.perl.org/perl.git/commitdiff/da42332b10691ba7af7550035ffc7f46c87e4e66?hp=fd609c8d943a834b7d5da7251eb47c05d7f8f9fc>

- Log -----------------------------------------------------------------
commit da42332b10691ba7af7550035ffc7f46c87e4e66
Author: Yves Orton <[email protected]>
Date:   Thu Oct 27 13:52:24 2016 +0200

    regcomp.c: fix perl #129950 - fix firstchar bitmap under utf8 with prefix 
optimisation
    
    The trie code contains a number of sub optimisations, one of which
    extracts common prefixes from alternations, and another which isa
    bitmap of the possible matching first chars.
    
    The bitmap needs to contain the possible first octets of the string
    which the trie can match, and for codepoints which might have a different
    first octet under utf8 or non-utf8 need to register BOTH codepoints.
    
    So for instance in the pattern (?:a|a\x{E4}) we should restructure this
    as a(|\x{E4), and the bitmap for the trie should contain both \x{E4} AND
    \x{C3} as \x{C3} is the first byte of \x{EF} expressed as utf8.
-----------------------------------------------------------------------

Summary of changes:
 regcomp.c  | 14 ++++++++++++++
 t/re/pat.t | 11 +++++++++--
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/regcomp.c b/regcomp.c
index 832c6783e0..a0782d3ff1 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -3264,6 +3264,13 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode 
*startbranch,
                                     TRIE_BITMAP_SET(trie,*ch);
                                     if ( folder )
                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
+                                    if ( !UTF ) {
+                                        /* store first byte of utf8 
representation of
+                                           variant codepoints */
+                                        if (! UVCHR_IS_INVARIANT(*ch)) {
+                                            TRIE_BITMAP_SET(trie, 
UTF8_TWO_BYTE_HI(*ch));
+                                        }
+                                    }
                                     DEBUG_OPTIMISE_r(
                                         Perl_re_printf( aTHX_  "%s", (char*)ch)
                                     );
@@ -3272,6 +3279,13 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode 
*startbranch,
                            TRIE_BITMAP_SET(trie,*ch);
                            if ( folder )
                                TRIE_BITMAP_SET(trie,folder[ *ch ]);
+                            if ( !UTF ) {
+                                /* store first byte of utf8 representation of
+                                   variant codepoints */
+                                if (! UVCHR_IS_INVARIANT(*ch)) {
+                                    TRIE_BITMAP_SET(trie, 
UTF8_TWO_BYTE_HI(*ch));
+                                }
+                            }
                             DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch));
                        }
                         idx = ofs;
diff --git a/t/re/pat.t b/t/re/pat.t
index 8a562270f1..2a8fe859fa 100644
--- a/t/re/pat.t
+++ b/t/re/pat.t
@@ -23,7 +23,7 @@ BEGIN {
     skip_all('no re module') unless defined &DynaLoader::boot_DynaLoader;
     skip_all_without_unicode_tables();
 
-plan tests => 800;  # Update this when adding/deleting tests.
+plan tests => 802;  # Update this when adding/deleting tests.
 
 run_tests() unless caller;
 
@@ -1799,7 +1799,14 @@ EOP
     TODO: {
         local $::TODO = "RT #21491: m'' interpolates escape sequences";
         is(0+("\n" =~ m'\n'), 0, q|RT #21491: m'\n' should not interpolate|);
-    }
+        }
+
+        {
+            my $str = "a\xE4";
+            ok( $str =~ m{^(a|a\x{e4})$}, "fix [perl #129950] - latin1 case" );
+            utf8::upgrade($str);
+            ok( $str =~ m{^(a|a\x{e4})$}, "fix [perl #129950] - utf8 case" );
+        }
 } # End of sub run_tests
 
 1;

--
Perl5 Master Repository

Reply via email to