In perl.git, the branch blead has been updated

<http://perl5.git.perl.org/perl.git/commitdiff/4ee329b3000cfabea3246d7ff8a5cb2511455807?hp=a42d02426c51c2ef1bdefac84284a828de703cda>

- Log -----------------------------------------------------------------
commit 4ee329b3000cfabea3246d7ff8a5cb2511455807
Author: Karl Williamson <pub...@khwilliamson.com>
Date:   Wed Dec 15 15:34:48 2010 -0700

    regexec.c: white space changes due to prev. commit
    
    This is essentially a white-space patch.  Commit
    dbaae6dafbe2bb8bf9fcd1e83874b9a683e0290f effectively removed two nests
    of blocks.  This finishes that removal, outdents the previous innter
    blocks, and reformats the comments for the extra space.
    
    One declaration was moved as a result of removing a block.

M       regexec.c

commit 5a6441acb59053a086abbc64bfec4353a69aff0c
Author: Karl Williamson <pub...@khwilliamson.com>
Date:   Wed Dec 15 15:22:38 2010 -0700

    regex: Multi-char /i shouldnt match single char []
    
    ":\N{LATIN SMALL LIGATURE ST}:" !~ /:[_st]:/i
    
    because it is two character, but previously it did match.  The cause of
    this is that the code was using swash_fetch() to test if the fold
    matches.  But swash_fetch() only checks if the first character matches,
    not all characters, so it was falsely returning true.
    
    This is an intermediate commit, with some cleanup of blocks, comments,
    and accompanying indentation of regexec.c to follow immediately.

M       regexec.c
M       t/uni/fold.t

commit daf3b8d4938645bc97bae0c97b089ea40463c913
Author: Karl Williamson <pub...@khwilliamson.com>
Date:   Wed Dec 15 09:57:25 2010 -0700

    Revamp t/uni/fold.t
    
    This patch revamps fold.t but using essentially the same tests on
    essentially the same character set.  It:
        1) Works on EBCDIC
        2) Uses test.pl
        3) Separates out the 8 tests per character that previously were all
            combined into a single test per character
        4) Outputs on each line the actual test performed
        5) Corrects and hardens some tests on multi-character folding
            characters.
    
    To expand on point 5):  Previously, the wrong behavior was tested for;
    correct behavior failed.  For example,
        ":\N{LATIN SMALL LIGATURE ST}:" =~ /:[_st]:/i
    previously passed.  But the fold of the string is two characters, and so
    should not match a one-character long character class.  Instead it
    should match:
        ":\N{LATIN SMALL LIGATURE ST}:" =~ /:[_st]{2}:/i
    The new test includes TODO tests for both of them.
        ":\N{LATIN SMALL LIGATURE ST}:" !~ /:[_st]:/i
        ":\N{LATIN SMALL LIGATURE ST}:" =~ /:[_st]{2}:/i
    
    Also the inverse relation
        ":st:" =~ /:[_\N{LATIN SMALL LIGATURE ST}]:/i
    passes, semi-coincidentally, for some.  By changing the test to
        ":ST:" =~ /:[_\N{LATIN SMALL LIGATURE ST}]:/i
    they all fail, (and are made TODO's).

M       t/uni/fold.t
-----------------------------------------------------------------------

Summary of changes:
 regexec.c    |  149 ++++++++++++++++++++++++----------------------------------
 t/uni/fold.t |  142 +++++++++++++++++++++++++++++++++++++++++--------------
 2 files changed, 166 insertions(+), 125 deletions(-)

diff --git a/regexec.c b/regexec.c
index f2723e4..b818aa1 100644
--- a/regexec.c
+++ b/regexec.c
@@ -6438,104 +6438,77 @@ S_reginclass(pTHX_ const regexp * const prog, register 
const regnode * const n,
                            }
                        }
                    }
-                   if (!match) {
+                   if (!match) { /* See if the folded version matches */
                        U8 folded[UTF8_MAXBYTES_CASE+1];
-
-                       /* See if the folded version matches */
+                       SV** listp;
                        STRLEN foldlen;
+
                        to_utf8_fold(utf8_p, folded, &foldlen);
-                       if (swash_fetch(sw, folded, 1)) {   /* 1 => is utf8 */
-                           match = TRUE;
-                       }
-                       else {
-                           /* The fold in a few cases  of an above Latin1 char
-                            * is in the Latin1 range, and hence may be in the
-                            * bitmap */
-                           if (UTF8_IS_INVARIANT(*folded)
-                               && ANYOF_BITMAP_TEST(n, UNI_TO_NATIVE(*folded)))
-                           {
-                               match = TRUE;
-                           }
-                           else if (UTF8_IS_DOWNGRADEABLE_START(*folded)
-                                    && ANYOF_BITMAP_TEST(n,
-                                         UNI_TO_NATIVE(
-                                            TWO_BYTE_UTF8_TO_UNI(folded[0],
-                                                                  folded[1]))))
-                           { /* Since the fold comes from internally
-                              * generated data, we can safely assume it is
-                              * valid utf8 in the test above */
 
-                               match = TRUE;
+                       /* Consider "k" =~ /[K]/i.  The line above would have
+                        * just folded the 'k' to itself, and that isn't going
+                        * to match 'K'.  So we look through the closure of
+                        * everything that folds to 'k'.  That will find the
+                        * 'K'.  Initialize the list, if necessary */
+                       if (! PL_utf8_foldclosures) {
+
+                           /* If the folds haven't been read in, call a fold
+                            * function to force that */
+                           if (! PL_utf8_tofold) {
+                               U8 dummy[UTF8_MAXBYTES+1];
+                               STRLEN dummy_len;
+                               to_utf8_fold((U8*) "A", dummy, &dummy_len);
                            }
-                            if (! match) {
-                               SV** listp;
-
-                               /* Consider "k" =~ /[K]/i.  The line above
-                                * would have just folded the 'k' to itself,
-                                * and that isn't going to match 'K'.  So we
-                                * look through the closure of everything that
-                                * folds to 'k'.  That will find the 'K'.
-                                * Initialize the list, if necessary */
-                               if (! PL_utf8_foldclosures) {
-
-                                   /* If the folds haven't been read in, call a
-                                   * fold function to force that */
-                                   if (! PL_utf8_tofold) {
-                                       U8 dummy[UTF8_MAXBYTES+1];
-                                       STRLEN dummy_len;
-                                       to_utf8_fold((U8*) "A",
-                                                           dummy, &dummy_len);
-                                   }
-                                   PL_utf8_foldclosures =
-                                         _swash_inversion_hash(PL_utf8_tofold);
-                               }
+                           PL_utf8_foldclosures =
+                                 _swash_inversion_hash(PL_utf8_tofold);
+                       }
 
-                               /* The data structure is a hash with the keys
-                                * every character that is folded to, like 'k',
-                                * and the values each an array of everything
-                                * that folds to its key.  e.g. [ 'k', 'K',
-                                * KELVIN_SIGN ] */
-                               if ((listp = hv_fetch(PL_utf8_foldclosures,
-                                             (char *) folded, foldlen, FALSE)))
+                       /* The data structure is a hash with the keys every
+                        * character that is folded to, like 'k', and the
+                        * values each an array of everything that folds to its
+                        * key.  e.g. [ 'k', 'K', KELVIN_SIGN ] */
+                       if ((listp = hv_fetch(PL_utf8_foldclosures,
+                                     (char *) folded, foldlen, FALSE)))
+                       {
+                           AV* list = (AV*) *listp;
+                           IV i;
+                           for (i = 0; i <= av_len(list); i++) {
+                               SV** try_p = av_fetch(list, i, FALSE);
+                               char* try_c;
+                               if (try_p == NULL) {
+                                   Perl_croak(aTHX_ "panic: invalid 
PL_utf8_foldclosures structure");
+                               }
+                               /* Don't have to worry about embeded nulls
+                                * since NULL isn't folded or foldable */
+                               try_c = SvPVX(*try_p);
+
+                               /* The fold in a few cases  of an above Latin1
+                                * char is in the Latin1 range, and hence may
+                                * be in the bitmap */
+                               if (UTF8_IS_INVARIANT(*try_c)
+                                   && ANYOF_BITMAP_TEST(n,
+                                                   UNI_TO_NATIVE(*try_c)))
                                {
-                                   AV* list = (AV*) *listp;
-                                   IV i;
-                                   for (i = 0; i <= av_len(list); i++) {
-                                       SV** try_p = av_fetch(list, i, FALSE);
-                                       char* try_c;
-                                       if (try_p == NULL) {
-                                           Perl_croak(aTHX_ "panic: invalid 
PL_utf8_foldclosures structure");
-                                       }
-                                       /* Don't have to worry about embeded
-                                        * nulls since NULL isn't folded or
-                                        * foldable */
-                                       try_c = SvPVX(*try_p);
-                                       if (UTF8_IS_INVARIANT(*try_c)
-                                           && ANYOF_BITMAP_TEST(n,
-                                                           
UNI_TO_NATIVE(*try_c)))
-                                       {
-                                           match = TRUE;
-                                           break;
-                                       }
-                                       else if
-                                           (UTF8_IS_DOWNGRADEABLE_START(*try_c)
-                                            && ANYOF_BITMAP_TEST(n,
-                                            UNI_TO_NATIVE(
+                                   match = TRUE;
+                                   break;
+                               }
+                               else if
+                                   (UTF8_IS_DOWNGRADEABLE_START(*try_c)
+                                    && ANYOF_BITMAP_TEST(n, UNI_TO_NATIVE(
                                                TWO_BYTE_UTF8_TO_UNI(try_c[0],
-                                                                    
try_c[1]))))
-                                       {
-                                           match = TRUE;
-                                           break;
-                                       } else if (swash_fetch(sw,
-                                                               (U8*) try_c, 1))
-                                       {
-                                           match = TRUE;
-                                           break;
-                                       }
-                                   }
+                                                                   try_c[1]))))
+                               {
+                                  /* Since the fold comes from internally
+                                   * generated data, we can safely assume it
+                                   * is valid utf8 in the test above */
+                                   match = TRUE;
+                                   break;
+                               } else if (swash_fetch(sw, (U8*) try_c, 1)) {
+                                   match = TRUE;
+                                   break;
                                }
                            }
-                        }
+                       }
                    }
                }
 
diff --git a/t/uni/fold.t b/t/uni/fold.t
index 0f71c80..52417c1 100644
--- a/t/uni/fold.t
+++ b/t/uni/fold.t
@@ -1,54 +1,122 @@
+use strict;
+use warnings;
+
+# re/fold_grind.t has more complex tests, but doesn't test every fold
+
 BEGIN {
     chdir 't' if -d 't';
     @INC = '../lib';
+    require './test.pl';
 }
 
+binmode *STDOUT, ":utf8";
+
 use File::Spec;
+our $TODO;
 
+plan("no_plan");
+
+# Read in the official case folding definitions.
 my $CF = File::Spec->catfile(File::Spec->catdir(File::Spec->updir,
                                               "lib", "unicore"),
                            "CaseFolding.txt");
 
-use constant EBCDIC => ord 'A' == 193;
+die qq[$0: failed to open "$CF": $!\n] if ! open(my $fh, "<", $CF);
+
+my @CF;
+my %reverse_fold;
+while (<$fh>) {
+    # Skip S since we are going for 'F'ull case folding.  I is obsolete 
starting
+    # with Unicode 3.2, but leaving it in does no harm, and allows backward
+    # compatibility
+    next unless my ($code, $type, $mapping, $name) = $_ =~
+            /^([0-9A-F]+); ([CFI]); ((?:[0-9A-F]+)(?: [0-9A-F]+)*); \# (.+)/;
+
+    # Convert any 0-255 range chars to native.
+    $code = sprintf("%04X", ord_latin1_to_native(hex $code)) if hex $code < 
0x100;
+    $mapping = join " ", map { $_ =
+                                sprintf("%04X", ord_latin1_to_native(hex $_)) }
+                                                            split / /, 
$mapping;
+
+    push @CF, [$code, $mapping, $type, $name];
+
+    # Get the inverse fold for single-char mappings.
+    $reverse_fold{pack "U0U*", hex $mapping} = pack "U0U*", hex $code if $type 
ne 'F';
+}
+
+close($fh) or die "$0 Couldn't close $CF";
+
+foreach my $test_ref (@CF) {
+    my ($code, $mapping, $type, $name) = @$test_ref;
+    my $c = pack("U0U*", hex $code);
+    my $f = pack("U0U*", map { hex } split " ", $mapping);
+    my $f_length = length $f;
+    foreach my $test (
+            qq[":$c:" =~ /:$c:/],
+            qq[":$c:" =~ /:$c:/i],
+            qq[":$c:" =~ /:[_$c]:/], # Place two chars in [] so doesn't get
+                                     # optimized to a non-charclass
+            qq[":$c:" =~ /:[_$c]:/i],
+            qq[":$c:" =~ /:$f:/i],
+            qq[":$f:" =~ /:$c:/i],
+    ) {
+        ok eval $test, "$code - $name - $mapping - $type - $test";
+    }
 
-if (open(CF, $CF)) {
-    my @CF;
+    # Certain tests weren't convenient to put in the list above since they are
+    # TODO's in multi-character folds.
+    if ($f_length == 1) {
 
-    while (<CF>) {
-       # Skip S since we are going for 'F'ull case folding.  I is obsolete 
starting
-       # with Unicode 3.2, but leaving it in does no harm, and allows backward
-       # compatibility
-        if (/^([0-9A-F]+); ([CFI]); ((?:[0-9A-F]+)(?: [0-9A-F]+)*); \# (.+)/) {
-           next if EBCDIC && hex $1 < 0x100;
-           push @CF, [$1, $2, $3, $4];
-       }
+        # The qq loses the utf8ness of ":$f:".  These tests are not about
+        # finding bugs in utf8ness, so make sure it's utf8.
+        my $test = qq[my \$s = ":$f:"; utf8::upgrade(\$s); \$s =~ /:[_$c]:/i];
+        ok eval $test, "$code - $name - $mapping - $type - $test";
+        $test = qq[":$c:" =~ /:[_$f]:/i];
+        ok eval $test, "$code - $name - $mapping - $type - $test";
     }
+    else {
+
+        # There are two classes of multi-char folds that don't pass.  For
+        # example,
+        #   ":ß:" =~ /:[_s]{2}:/i
+        #   ":ss:" =~ /:[_ß]:/i
+        #
+        # Some of the old tests for the second case happened to pass somewhat
+        # coincidentally.  But none would pass if changed to this.
+        #   ":SS:" =~ /:[_ß]:/i
+        #
+        # As the capital SS doesn't get folded.  When those pass, it means
+        # that the code has been changed to take into account folding in the
+        # string, and all should pass, capitalized or not.  So, what is done
+        # is to essentially upper-case the string for this class (but use the
+        # reverse fold not uc(), as that is more correct)
+        my $u;
+        for my $i (0 .. $f_length - 1) {
+            my $cur_char = substr($f, $i, 1);
+            $u .= $reverse_fold{$cur_char} || $cur_char;
+        }
+        my $test;
+
+        # A multi-char fold should not match just one char;
+        # e.g., ":ß:" !~ /:[_s]:/i
+        $test = qq[":$c:" !~ /:[_$f]:/i];
+        ok eval $test, "$code - $name - $mapping - $type - $test";
 
-    close(CF);
-
-    die qq[$0: failed to find casefoldings from "$CF"\n] unless @CF;
-
-    print "1..", scalar @CF, "\n";
-
-    my $i = 0;
-    for my $cf (@CF) {
-       my ($code, $status, $mapping, $name) = @$cf;
-       $i++;
-       my $a = pack("U0U*", hex $code);
-       my $b = pack("U0U*", map { hex } split " ", $mapping);
-       my $t0 = ":$a:" =~ /:$a:/    ? 1 : 0;
-       my $t1 = ":$a:" =~ /:$a:/i   ? 1 : 0;
-       my $t2 = ":$a:" =~ /:[_$a]:/  ? 1 : 0; # Two chars in [] so doesn't get
-                                               # optimized to a non-charclass
-       my $t3 = ":$a:" =~ /:[_$a]:/i ? 1 : 0;
-       my $t4 = ":$a:" =~ /:$b:/i   ? 1 : 0;
-       my $t5 = ":$a:" =~ /:[_$b]:/i ? 1 : 0;
-       my $t6 = ":$b:" =~ /:$a:/i   ? 1 : 0;
-       my $t7 = ":$b:" =~ /:[_$a]:/i ? 1 : 0;
-       print $t0 && $t1 && $t2 && $t3 && $t4 && $t5 && $t6 && $t7 ?
-           "ok $i \# - $code - $name - $mapping - $status\n" :
-           "not ok $i \# - $code - $name - $mapping - $status - $t0 $t1 $t2 
$t3 $t4 $t5 $t6 $t7\n";
+        local $TODO = 'Multi-char fold in [character class]';
+
+        TODO: { # e.g., ":ß:" =~ /:[_s]{2}:/i
+            $test = qq[":$c:" =~ /:[_$f]{$f_length}:/i];
+            ok eval $test, "$code - $name - $mapping - $type - $test";
+        }
+        TODO: { # e.g., ":SS:" =~ /:[_ß]:/i
+            $test = qq[ my \$s = ":$u:"; utf8::upgrade(\$s); \$s =~ 
/:[_$c]:/i];
+            ok eval $test, "$code - $name - $mapping - $type - $test";
+        }
     }
-} else {
-    die qq[$0: failed to open "$CF": $!\n];
 }
+
+my $num_tests = curr_test() - 1;
+
+die qq[$0: failed to find casefoldings from "$CF"\n] unless $num_tests > 0;
+
+plan($num_tests);

--
Perl5 Master Repository

Reply via email to