In perl.git, the branch blead has been updated

<https://perl5.git.perl.org/perl.git/commitdiff/8c20b11d5137b21b9b47609f4e97e1d5c3d355f1?hp=9cfa7410526b2e7b3ea7f31a338c74ab86906ac6>

- Log -----------------------------------------------------------------
commit 8c20b11d5137b21b9b47609f4e97e1d5c3d355f1
Author: Karl Williamson <[email protected]>
Date:   Tue Nov 28 17:44:02 2017 -0700

    perluniprops: Improve sorting
    
    Unicode has some property values that should be sorted numerically, but
    have prefixes that make them not currently appear to be numbers.  For
    example, CCC101 and V10_5.  This commit changes the sort so that they
    are ordered by their numeric parts.

-----------------------------------------------------------------------

Summary of changes:
 charclass_invlists.h |  2 +-
 lib/unicore/mktables | 74 ++++++++++++++++++++++++++++++++++++++++------------
 regcharclass.h       |  2 +-
 3 files changed, 60 insertions(+), 18 deletions(-)

diff --git a/charclass_invlists.h b/charclass_invlists.h
index 69b42b36f0..841820512c 100644
--- a/charclass_invlists.h
+++ b/charclass_invlists.h
@@ -97454,7 +97454,7 @@ static const U8 WB_table[24][24] = {
  * be0f129691d479aa38646e4ca0ec1ee576ae7f75b0300a5624a7fa862fa8abba 
lib/unicore/extracted/DLineBreak.txt
  * 92449d354d9f6b6f2f97a292ebb59f6344ffdeb83d120d7d23e569c43ba67cd5 
lib/unicore/extracted/DNumType.txt
  * e3a319527153b0c6c0c549b40fc6f3a01a7a0dcd6620784391db25901df3b154 
lib/unicore/extracted/DNumValues.txt
- * c145400bb109fc28371c48e1fcde2e55d44c35ac561b38613cfd7ed235ceead3 
lib/unicore/mktables
+ * 7e82d9210fb1c8ffadda5a3a04912fc34a165bfe98ac80c1669c1e67c3de044a 
lib/unicore/mktables
  * 21653d2744fdd071f9ef138c805393901bb9547cf3e777ebf50215a191f986ea 
lib/unicore/version
  * 913d2f93f3cb6cdf1664db888bf840bc4eb074eef824e082fceda24a9445e60c 
regen/charset_translations.pl
  * 48418cbf454eb9ef35c73468ed5ef72ad8603490eabe74181ce4fae42ec72579 
regen/mk_invlists.pl
diff --git a/lib/unicore/mktables b/lib/unicore/mktables
index be4d81dc02..8a7be25759 100644
--- a/lib/unicore/mktables
+++ b/lib/unicore/mktables
@@ -16581,27 +16581,69 @@ sub pod_alphanumeric_sort {
         return -1
     }
 
-    # Determine if the two operands are numeric property values or not.
-    # A numeric property will look like xyz: 3.  But the number
-    # can begin with an optional minus sign, and may have a
-    # fraction or rational component, like xyz: 3/2.  If either
-    # isn't numeric, use alphabetic sort.
-    my ($a_initial, $a_number) =
-        ($a =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
-    return $a cmp $b unless defined $a_number;
-    my ($b_initial, $b_number) =
-        ($b =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
-    return $a cmp $b unless defined $b_number;
-
-    # Here they are both numeric, but use alphabetic sort if the
-    # initial parts don't match
-    return $a cmp $b if $a_initial ne $b_initial;
+    # Determine if the two operands are compound or not, and if so, whether
+    # they are "numeric" property values, like \p{Age: 3.0}.  There are also
+    # things like \p{Canonical_Combining_Class: CCC133} and \p{Age: V10_0},
+    # all of which this considers numeric, and for sorting, looks just at the
+    # numeric parts.  It can also be a rational like \p{Numeric Value=-1/2}.
+    my $split_re = qr/
+        ^ ( [^:=]+ ) # $1 is undef if not a compound form, otherwise is the
+                     # property name
+        [:=] \s*     # The syntax for the compound form
+        (?:          # followed by ...
+            (        # $2 gets defined if what follows is a "numeric"
+                     # expression, which is ...
+              ( -? \d+ (?: [.\/] \d+)?  # An integer, float, or rational
+                                        # number, optionally signed
+               | [[:alpha:]]{2,} \d+ $ ) # or something like CCC131.  Either
+                                         # of these go into $3
+             | ( V \d+ _ \d+ )           # or a Unicode Age property version
+                                         # number, into $4
+            )
+            | .* $    # If not "numeric", accept anything so that $1 gets
+                      # defined if it is any compound form
+        ) /ix;
+    my ($a_initial, $a_numeric, $a_number, $a_version) = ($a =~ $split_re);
+    my ($b_initial, $b_numeric, $b_number, $b_version) = ($b =~ $split_re);
+
+    # Sort alphabetically on the whole string if either operand isn't
+    # compound, or if their property names differ.
+    return $a cmp $b if   ! defined $a_initial
+                       || ! defined $b_initial
+                       || $a_initial ne $b_initial;
+
+    if (! defined $a_numeric) {
+
+        # If neither is numeric, use alpha sort
+        return $a cmp $b if ! defined $b_numeric;
+        return 1;  # Sort numeric ahead of alpha
+    }
+
+    # Here $a is numeric
+    return -1 if ! defined $b_numeric;  # Numeric sorts before alpha
+
+    # Here they are both numeric in the same property.
+    # Convert version numbers into regular numbers
+    if (defined $a_version) {
+        ($a_number = $a_version) =~ s/^V//i;
+        $a_number =~ s/_/./;
+    }
+    else {  # Otherwise strip the alphabetic prefix, e.g., the CCC in CCC9
+        $a_number =~ s/ ^ [[:alpha:]]+ //x;
+    }
+    if (defined $b_version) {
+        ($b_number = $b_version) =~ s/^V//i;
+        $b_number =~ s/_/./;
+    }
+    else {
+        $b_number =~ s/ ^ [[:alpha:]]+ //x;
+    }
 
     # Convert rationals to floating for the comparison.
     $a_number = eval $a_number if $a_number =~ qr{/};
     $b_number = eval $b_number if $b_number =~ qr{/};
 
-    return $a_number <=> $b_number;
+    return $a_number <=> $b_number || $a cmp $b;
 }
 
 sub make_pod () {
diff --git a/regcharclass.h b/regcharclass.h
index a03eacc71f..9ca56f3306 100644
--- a/regcharclass.h
+++ b/regcharclass.h
@@ -1898,7 +1898,7 @@
  * be0f129691d479aa38646e4ca0ec1ee576ae7f75b0300a5624a7fa862fa8abba 
lib/unicore/extracted/DLineBreak.txt
  * 92449d354d9f6b6f2f97a292ebb59f6344ffdeb83d120d7d23e569c43ba67cd5 
lib/unicore/extracted/DNumType.txt
  * e3a319527153b0c6c0c549b40fc6f3a01a7a0dcd6620784391db25901df3b154 
lib/unicore/extracted/DNumValues.txt
- * c145400bb109fc28371c48e1fcde2e55d44c35ac561b38613cfd7ed235ceead3 
lib/unicore/mktables
+ * 7e82d9210fb1c8ffadda5a3a04912fc34a165bfe98ac80c1669c1e67c3de044a 
lib/unicore/mktables
  * 21653d2744fdd071f9ef138c805393901bb9547cf3e777ebf50215a191f986ea 
lib/unicore/version
  * 913d2f93f3cb6cdf1664db888bf840bc4eb074eef824e082fceda24a9445e60c 
regen/charset_translations.pl
  * 9ea6338945a7d70e5ea4b31ac7856c0b521df96be002e94b4b3b7d31debbf3ab 
regen/regcharclass.pl

-- 
Perl5 Master Repository

Reply via email to