In perl.git, the branch blead has been updated <https://perl5.git.perl.org/perl.git/commitdiff/8c20b11d5137b21b9b47609f4e97e1d5c3d355f1?hp=9cfa7410526b2e7b3ea7f31a338c74ab86906ac6>
- Log ----------------------------------------------------------------- commit 8c20b11d5137b21b9b47609f4e97e1d5c3d355f1 Author: Karl Williamson <[email protected]> Date: Tue Nov 28 17:44:02 2017 -0700 perluniprops: Improve sorting Unicode has some property values that should be sorted numerically, but have prefixes that make them not currently appear to be numbers. For example, CCC101 and V10_5. This commit changes so they are sorted by their numeric parts. ----------------------------------------------------------------------- Summary of changes: charclass_invlists.h | 2 +- lib/unicore/mktables | 74 ++++++++++++++++++++++++++++++++++++++++------------ regcharclass.h | 2 +- 3 files changed, 60 insertions(+), 18 deletions(-) diff --git a/charclass_invlists.h b/charclass_invlists.h index 69b42b36f0..841820512c 100644 --- a/charclass_invlists.h +++ b/charclass_invlists.h @@ -97454,7 +97454,7 @@ static const U8 WB_table[24][24] = { * be0f129691d479aa38646e4ca0ec1ee576ae7f75b0300a5624a7fa862fa8abba lib/unicore/extracted/DLineBreak.txt * 92449d354d9f6b6f2f97a292ebb59f6344ffdeb83d120d7d23e569c43ba67cd5 lib/unicore/extracted/DNumType.txt * e3a319527153b0c6c0c549b40fc6f3a01a7a0dcd6620784391db25901df3b154 lib/unicore/extracted/DNumValues.txt - * c145400bb109fc28371c48e1fcde2e55d44c35ac561b38613cfd7ed235ceead3 lib/unicore/mktables + * 7e82d9210fb1c8ffadda5a3a04912fc34a165bfe98ac80c1669c1e67c3de044a lib/unicore/mktables * 21653d2744fdd071f9ef138c805393901bb9547cf3e777ebf50215a191f986ea lib/unicore/version * 913d2f93f3cb6cdf1664db888bf840bc4eb074eef824e082fceda24a9445e60c regen/charset_translations.pl * 48418cbf454eb9ef35c73468ed5ef72ad8603490eabe74181ce4fae42ec72579 regen/mk_invlists.pl diff --git a/lib/unicore/mktables b/lib/unicore/mktables index be4d81dc02..8a7be25759 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -16581,27 +16581,69 @@ sub pod_alphanumeric_sort { return -1 } - # Determine if the two operands are numeric property values or not. - # A numeric property will look like xyz: 3. But the number - # can begin with an optional minus sign, and may have a - # fraction or rational component, like xyz: 3/2. If either - # isn't numeric, use alphabetic sort. - my ($a_initial, $a_number) = - ($a =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix); - return $a cmp $b unless defined $a_number; - my ($b_initial, $b_number) = - ($b =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix); - return $a cmp $b unless defined $b_number; - - # Here they are both numeric, but use alphabetic sort if the - # initial parts don't match - return $a cmp $b if $a_initial ne $b_initial; + # Determine if the two operands are compound or not, and if so if are + # "numeric" property values or not, like \p{Age: 3.0}. But there are also + # things like \p{Canonical_Combining_Class: CCC133} and \p{Age: V10_0}, + # all of which this considers numeric, and for sorting, looks just at the + # numeric parts. It can also be a rational like \p{Numeric Value=-1/2}. + my $split_re = qr/ + ^ ( [^:=]+ ) # $1 is undef if not a compound form, otherwise is the + # property name + [:=] \s* # The syntax for the compound form + (?: # followed by ... + ( # $2 gets defined if what follows is a "numeric" + # expression, which is ... + ( -? \d+ (?: [.\/] \d+)? # An integer, float, or rational + # number, optionally signed + | [[:alpha:]]{2,} \d+ $ ) # or something like CCC131. Either + # of these go into $3 + | ( V \d+ _ \d+ ) # or a Unicode's Age property version + # number, into $4 + ) + | .* $ # If not "numeric", accept anything so that $1 gets + # defined if it is any compound form + ) /ix; + my ($a_initial, $a_numeric, $a_number, $a_version) = ($a =~ $split_re); + my ($b_initial, $b_numeric, $b_number, $b_version) = ($b =~ $split_re); + + # Sort alphabeticlly on the whole property name if either operand isn't + # compound, or they differ. + return $a cmp $b if ! defined $a_initial + || ! defined $b_initial + || $a_initial ne $b_initial; + + if (! defined $a_numeric) { + + # If neither is numeric, use alpha sort + return $a cmp $b if ! defined $b_numeric; + return 1; # Sort numeric ahead of alpha + } + + # Here $a is numeric + return -1 if ! defined $b_numeric; # Numeric sorts before alpha + + # Here they are both numeric in the same property. + # Convert version numbers into regular numbers + if (defined $a_version) { + ($a_number = $a_version) =~ s/^V//i; + $a_number =~ s/_/./; + } + else { # Otherwise get rid of the, e.g., CCC in CCC9 */ + $a_number =~ s/ ^ [[:alpha:]]+ //x; + } + if (defined $b_version) { + ($b_number = $b_version) =~ s/^V//i; + $b_number =~ s/_/./; + } + else { + $b_number =~ s/ ^ [[:alpha:]]+ //x; + } # Convert rationals to floating for the comparison. $a_number = eval $a_number if $a_number =~ qr{/}; $b_number = eval $b_number if $b_number =~ qr{/}; - return $a_number <=> $b_number; + return $a_number <=> $b_number || $a cmp $b; } sub make_pod () { diff --git a/regcharclass.h b/regcharclass.h index a03eacc71f..9ca56f3306 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -1898,7 +1898,7 @@ * be0f129691d479aa38646e4ca0ec1ee576ae7f75b0300a5624a7fa862fa8abba lib/unicore/extracted/DLineBreak.txt * 92449d354d9f6b6f2f97a292ebb59f6344ffdeb83d120d7d23e569c43ba67cd5 lib/unicore/extracted/DNumType.txt * e3a319527153b0c6c0c549b40fc6f3a01a7a0dcd6620784391db25901df3b154 lib/unicore/extracted/DNumValues.txt - * c145400bb109fc28371c48e1fcde2e55d44c35ac561b38613cfd7ed235ceead3 lib/unicore/mktables + * 7e82d9210fb1c8ffadda5a3a04912fc34a165bfe98ac80c1669c1e67c3de044a lib/unicore/mktables * 21653d2744fdd071f9ef138c805393901bb9547cf3e777ebf50215a191f986ea lib/unicore/version * 913d2f93f3cb6cdf1664db888bf840bc4eb074eef824e082fceda24a9445e60c regen/charset_translations.pl * 9ea6338945a7d70e5ea4b31ac7856c0b521df96be002e94b4b3b7d31debbf3ab regen/regcharclass.pl -- Perl5 Master Repository
