In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/a9ad02a848bdc09f2db9ffd8a955709e81afce74?hp=405350d5a880c875cd450beac2a60417ebc58f01>
- Log ----------------------------------------------------------------- commit a9ad02a848bdc09f2db9ffd8a955709e81afce74 Author: Karl Williamson <k...@cpan.org> Date: Thu Mar 17 16:11:04 2016 -0600 locale.c: Add comment M locale.c commit 289ce9ccf71a0b3a1f849f181c591ff860ad74ed Author: Karl Williamson <k...@cpan.org> Date: Mon Mar 14 20:49:00 2016 -0600 regen/mk_invlists.pl: Revamp so works on earlier Unicodes The code that generates the tables for the \b{foo} handling (in regexec.c) did not correctly work when compiled on an earlier Unicode. This fixes things up to do that, consolidating some common code into a common function and making the generated hdr file look nice, with the tables taking fewer columns of screen space M charclass_invlists.h M lib/unicore/mktables M regcharclass.h M regen/mk_invlists.pl commit c84b473c717dfd2cd2c9d1e61d70721565c537a7 Author: Karl Williamson <k...@cpan.org> Date: Thu Mar 17 15:25:09 2016 -0600 mktables: Use correct structure to look up data There are two types of tables in mktables: Map tables map code points to the values a property has for those code points; and match tables, which are booleans, answer "does a code point match a given property value?". There are different data structures to encapsulate each. This code was using the wrong structure to look something up. Usually this failed, and a fall-back value was used instead. When compiling an early Unicode release, I discovered that there could be a conflict. M charclass_invlists.h M lib/unicore/mktables M regcharclass.h commit 4ed2b786f0a44b01cfc49ae1a48bd9745aac2d6f Author: Karl Williamson <k...@cpan.org> Date: Thu Mar 17 14:27:34 2016 -0600 mktables: Fix bug with early Unicode versions An array had 2 optional elements at the end. I got confused about handling them. This change first deals with the final one, pops it and saves it separately if found. Then only one optional element needs to be dealt with in the course of the code. 
This only gets executed for very early Unicode versions M charclass_invlists.h M lib/unicore/mktables M regcharclass.h commit 933afa2c456ccd7f9ecc3376f645028e485c874f Author: Karl Williamson <k...@cpan.org> Date: Tue Mar 15 16:19:30 2016 -0600 mktables: Unicode 1.5 only had 2**16 code points Therefore, we shouldn't add any above that. M charclass_invlists.h M lib/unicore/mktables M regcharclass.h ----------------------------------------------------------------------- Summary of changes: charclass_invlists.h | 194 +++++------ lib/unicore/mktables | 132 +++++--- locale.c | 4 +- regcharclass.h | 2 +- regen/mk_invlists.pl | 932 ++++++++++++++++++++++++++++++++------------------- 5 files changed, 790 insertions(+), 474 deletions(-) diff --git a/charclass_invlists.h b/charclass_invlists.h index 9c8dbac..6abd325 100644 --- a/charclass_invlists.h +++ b/charclass_invlists.h @@ -87736,109 +87736,109 @@ static const UV XPosixXDigit_invlist[] = { /* for EBCDIC 037 */ #if defined(PERL_IN_REGEXEC_C) static const bool GCB_table[14][14] = { -/* XX CR CN EX L LF LV LVT PP RI SM T V edge*/ -/* XX*/ { 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1}, -/* CR*/ { 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1}, -/* CN*/ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, -/* EX*/ { 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1}, -/* L*/ { 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1}, -/* LF*/ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, -/* LV*/ { 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1}, -/* LVT*/ { 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1}, -/* PP*/ { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1}, -/* RI*/ { 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1}, -/* SM*/ { 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1}, -/* T*/ { 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1}, -/* V*/ { 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1}, -/*edge*/ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0} + /* 'edg' stands for 'EDGE' */ +/* XX CR CN EX L LF LV LVT PP RI SM T V edg */ +/* XX */ { 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1 }, +/* CR */ { 1, 1, 1, 
1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1 }, +/* CN */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, +/* EX */ { 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1 }, +/* L */ { 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1 }, +/* LF */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, +/* LV */ { 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1 }, +/* LVT*/ { 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1 }, +/* PP */ { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1 }, +/* RI */ { 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1 }, +/* SM */ { 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1 }, +/* T */ { 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1 }, +/* V */ { 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1 }, +/* edg*/ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0 } }; -#define LB_NOBREAK 0 -#define LB_BREAKABLE 1 -#define LB_NOBREAK_EVEN_WITH_SP_BETWEEN 2 -#define LB_CM_foo 3 -#define LB_SP_foo 6 -#define LB_PR_or_PO_then_OP_or_HY 9 -#define LB_SY_or_IS_then_various 11 -#define LB_HY_or_BA_then_foo 13 -#define LB_various_then_PO_or_PR 16 +#define LB_NOBREAK 0 +#define LB_BREAKABLE 1 +#define LB_NOBREAK_EVEN_WITH_SP_BETWEEN 2 +#define LB_CM_foo 3 +#define LB_SP_foo 6 +#define LB_PR_or_PO_then_OP_or_HY 9 +#define LB_SY_or_IS_then_various 11 +#define LB_HY_or_BA_then_foo 13 +#define LB_various_then_PO_or_PR 16 static const U8 LB_table[36][36] = { - -/* 'ed' stands for 'edge' */ -/* AL BA BB B2 SY CR CP CL CM CB EX GL H2 H3 HL HY ID IS IN JL JT JV LF BK NL NS NU OP PO PR QU RI SP WJ ZW ed */ -/* AL */ { 0, 0, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 0, 0, 1, 2, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1 }, -/* BA */ { 14, 0, 14, 14, 2, 0, 2, 2, 0, 1, 2, 14, 14, 14, 14, 0, 14, 2, 14, 14, 14, 14, 0, 0, 0, 0, 14, 14, 14, 14, 0, 14, 0, 0, 0, 1 }, -/* BB */ { 0, 0, 0, 0, 2, 0, 2, 2, 0, 1, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 }, -/* B2 */ { 1, 0, 1, 2, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 1, 0, 1, 2, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1 }, -/* SY */ { 1, 0, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 
0, 0, 1, 2, 1, 1, 1, 1, 0, 0, 0, 0, 12, 1, 17, 17, 0, 1, 0, 0, 0, 1 }, -/* CR */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, -/* CP */ { 0, 0, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 0, 0, 1, 2, 1, 1, 1, 1, 0, 0, 0, 2, 0, 1, 17, 17, 0, 1, 0, 0, 0, 1 }, -/* CL */ { 1, 0, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 1, 0, 1, 2, 1, 1, 1, 1, 0, 0, 0, 2, 1, 1, 17, 17, 0, 1, 0, 0, 0, 1 }, -/* CM */ { 3, 3, 3, 3, 3, 0, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 0, 3, 0, 1 }, -/* CB */ { 1, 1, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1 }, -/* EX */ { 1, 0, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 1, 0, 1, 2, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1 }, -/* GL */ { 0, 0, 0, 0, 2, 0, 2, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 }, -/* H2 */ { 1, 0, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 1, 0, 1, 2, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1 }, -/* H3 */ { 1, 0, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 1, 0, 1, 2, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1 }, -/* HL */ { 0, 0, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 0, 0, 1, 2, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1 }, -/* HY */ { 14, 0, 14, 14, 2, 0, 2, 2, 0, 1, 2, 14, 14, 14, 14, 0, 14, 2, 14, 14, 14, 14, 0, 0, 0, 0, 13, 14, 14, 14, 0, 14, 0, 0, 0, 1 }, -/* ID */ { 1, 0, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 1, 0, 1, 2, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1 }, -/* IS */ { 0, 0, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 0, 0, 1, 2, 1, 1, 1, 1, 0, 0, 0, 0, 12, 1, 17, 17, 0, 1, 0, 0, 0, 1 }, -/* IN */ { 1, 0, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 1, 0, 1, 2, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1 }, -/* JL */ { 1, 0, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 0, 0, 1, 0, 1, 2, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1 }, -/* JT */ { 1, 0, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 1, 0, 1, 2, 0, 1, 0, 
1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1 }, -/* JV */ { 1, 0, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 1, 0, 1, 2, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1 }, -/* LF */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, -/* BK */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, -/* NL */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, -/* NS */ { 1, 0, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 1, 0, 1, 2, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1 }, -/* NU */ { 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 2, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1 }, -/* OP */ { 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 1 }, -/* PO */ { 0, 0, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 0, 0, 1, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 10, 1, 1, 0, 1, 0, 0, 0, 1 }, -/* PR */ { 0, 0, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 10, 1, 1, 0, 1, 0, 0, 0, 1 }, -/* QU */ { 0, 0, 0, 0, 2, 0, 2, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1 }, -/* RI */ { 1, 0, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 1, 0, 1, 2, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1 }, -/* SP */ { 7, 7, 7, 7, 8, 0, 8, 8, 7, 7, 8, 7, 7, 7, 7, 7, 7, 8, 7, 7, 7, 7, 0, 0, 0, 7, 7, 7, 7, 7, 7, 7, 0, 8, 0, 1 }, -/* WJ */ { 0, 0, 0, 0, 2, 0, 2, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 }, -/* ZW */ { 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1 }, -/* ed */ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } + /* 'edg' stands for 'EDGE' */ +/* AL BA BB B2 SY CR CP CL CM CB EX GL H2 H3 HL HY ID IS IN JL JT JV LF BK NL NS NU OP 
PO PR QU RI SP WJ ZW edg */ +/* AL */ { 0, 0, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 0, 0, 1, 2, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1 }, +/* BA */ {14, 0,14,14, 2, 0, 2, 2, 0, 1, 2,14,14,14,14, 0,14, 2,14,14,14,14, 0, 0, 0, 0,14,14,14,14, 0,14, 0, 0, 0, 1 }, +/* BB */ { 0, 0, 0, 0, 2, 0, 2, 2, 0, 1, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 }, +/* B2 */ { 1, 0, 1, 2, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 1, 0, 1, 2, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1 }, +/* SY */ { 1, 0, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 0, 0, 1, 2, 1, 1, 1, 1, 0, 0, 0, 0,12, 1,17,17, 0, 1, 0, 0, 0, 1 }, +/* CR */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, +/* CP */ { 0, 0, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 0, 0, 1, 2, 1, 1, 1, 1, 0, 0, 0, 2, 0, 1,17,17, 0, 1, 0, 0, 0, 1 }, +/* CL */ { 1, 0, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 1, 0, 1, 2, 1, 1, 1, 1, 0, 0, 0, 2, 1, 1,17,17, 0, 1, 0, 0, 0, 1 }, +/* CM */ { 3, 3, 3, 3, 3, 0, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 0, 3, 0, 1 }, +/* CB */ { 1, 1, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1 }, +/* EX */ { 1, 0, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 1, 0, 1, 2, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1 }, +/* GL */ { 0, 0, 0, 0, 2, 0, 2, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 }, +/* H2 */ { 1, 0, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 1, 0, 1, 2, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1 }, +/* H3 */ { 1, 0, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 1, 0, 1, 2, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1 }, +/* HL */ { 0, 0, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 0, 0, 1, 2, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1 }, +/* HY */ {14, 0,14,14, 2, 0, 2, 2, 0, 1, 2,14,14,14,14, 0,14, 2,14,14,14,14, 0, 0, 0, 0,13,14,14,14, 0,14, 0, 0, 0, 1 }, +/* ID */ { 1, 0, 
1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 1, 0, 1, 2, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1 }, +/* IS */ { 0, 0, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 0, 0, 1, 2, 1, 1, 1, 1, 0, 0, 0, 0,12, 1,17,17, 0, 1, 0, 0, 0, 1 }, +/* IN */ { 1, 0, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 1, 0, 1, 2, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1 }, +/* JL */ { 1, 0, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 0, 0, 1, 0, 1, 2, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1 }, +/* JT */ { 1, 0, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 1, 0, 1, 2, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1 }, +/* JV */ { 1, 0, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 1, 0, 1, 2, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1 }, +/* LF */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, +/* BK */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, +/* NL */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, +/* NS */ { 1, 0, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 1, 0, 1, 2, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1 }, +/* NU */ { 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 2, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1 }, +/* OP */ { 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 1 }, +/* PO */ { 0, 0, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 0, 0, 1, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0,10, 1, 1, 0, 1, 0, 0, 0, 1 }, +/* PR */ { 0, 0, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0,10, 1, 1, 0, 1, 0, 0, 0, 1 }, +/* QU */ { 0, 0, 0, 0, 2, 0, 2, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1 }, +/* RI */ { 1, 0, 1, 1, 2, 0, 2, 2, 0, 1, 2, 0, 1, 1, 1, 0, 1, 2, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1 }, +/* SP */ { 7, 7, 7, 7, 8, 0, 8, 8, 7, 7, 8, 7, 7, 7, 7, 7, 7, 8, 
7, 7, 7, 7, 0, 0, 0, 7, 7, 7, 7, 7, 7, 7, 0, 8, 0, 1 }, +/* WJ */ { 0, 0, 0, 0, 2, 0, 2, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 }, +/* ZW */ { 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1 }, +/* edg*/ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }; -#define WB_NOBREAK 0 -#define WB_BREAKABLE 1 -#define WB_hs_then_hs 2 -#define WB_Ex_or_FO_then_foo 3 -#define WB_DQ_then_HL 4 -#define WB_HL_then_DQ 6 -#define WB_LE_or_HL_then_MB_or_ML_or_SQ 8 -#define WB_MB_or_ML_or_SQ_then_LE_or_HL 10 -#define WB_MB_or_MN_or_SQ_then_NU 12 -#define WB_NU_then_MB_or_MN_or_SQ 14 +#define WB_NOBREAK 0 +#define WB_BREAKABLE 1 +#define WB_hs_then_hs 2 +#define WB_Ex_or_FO_then_foo 3 +#define WB_DQ_then_HL 4 +#define WB_HL_then_DQ 6 +#define WB_LE_or_HL_then_MB_or_ML_or_SQ 8 +#define WB_MB_or_ML_or_SQ_then_LE_or_HL 10 +#define WB_MB_or_MN_or_SQ_then_NU 12 +#define WB_NU_then_MB_or_MN_or_SQ 14 static const U8 WB_table[19][19] = { - -/* 'Ex' stands for 'Extend'; 'hs' for 'Perl_Tailored_HSpace'; 'ed' for 'edge' */ -/* XX LE CR DQ Ex EX FO HL KA LF ML MN MB NL NU hs RI SQ ed */ -/* XX */ { 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, -/* LE */ { 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 9, 1, 9, 1, 0, 1, 1, 9, 1 }, -/* CR */ { 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1 }, -/* DQ */ { 1, 1, 1, 1, 0, 1, 0, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, -/* Ex */ { 3, 3, 1, 3, 0, 3, 0, 3, 3, 1, 3, 3, 3, 1, 3, 1, 3, 3, 1 }, -/* EX */ { 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1 }, -/* FO */ { 3, 3, 1, 3, 0, 3, 0, 3, 3, 1, 3, 3, 3, 1, 3, 1, 3, 3, 1 }, -/* HL */ { 1, 0, 1, 7, 0, 0, 0, 0, 1, 1, 9, 1, 9, 1, 0, 1, 1, 8, 1 }, -/* KA */ { 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, -/* LF */ { 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1 }, -/* ML */ { 1, 11, 1, 1, 0, 1, 0, 11, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1 }, -/* MN */ { 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 13, 1, 1, 1, 1 }, -/* MB */ { 1, 11, 1, 1, 0, 1, 0, 11, 1, 1, 1, 1, 1, 1, 13, 1, 1, 1, 1 }, -/* NL */ { 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1 }, -/* NU */ { 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 15, 15, 1, 0, 1, 1, 15, 1 }, -/* hs */ { 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 2, 1, 1, 1 }, -/* RI */ { 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1 }, -/* SQ */ { 1, 11, 1, 1, 0, 1, 0, 11, 1, 1, 1, 1, 1, 1, 13, 1, 1, 1, 1 }, -/* ed */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0 } + /* 'Ext' stands for 'Extend'; 'edg' stands for 'EDGE'; 'hs' stands + * for 'Perl_Tailored_HSpace'; 'unk' stands for 'UNKNOWN' */ +/* XX LE CR DQ Ext EX FO HL KA LF ML MN MB NL NU hs RI SQ edg */ +/* XX */ { 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, +/* LE */ { 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 9, 1, 9, 1, 0, 1, 1, 9, 1 }, +/* CR */ { 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1 }, +/* DQ */ { 1, 1, 1, 1, 0, 1, 0, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, +/* Ext*/ { 3, 3, 1, 3, 0, 3, 0, 3, 3, 1, 3, 3, 3, 1, 3, 1, 3, 3, 1 }, +/* EX */ { 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1 }, +/* FO */ { 3, 3, 1, 3, 0, 3, 0, 3, 3, 1, 3, 3, 3, 1, 3, 1, 3, 3, 1 }, +/* HL */ { 1, 0, 1, 7, 0, 0, 0, 0, 1, 1, 9, 1, 9, 1, 0, 1, 1, 8, 1 }, +/* KA */ { 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, +/* LF */ { 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1 }, +/* ML */ { 1,11, 1, 1, 0, 1, 0,11, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, +/* MN */ { 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1,13, 1, 1, 1, 1 }, +/* MB */ { 1,11, 1, 1, 0, 1, 0,11, 1, 1, 1, 1, 1, 1,13, 1, 1, 1, 1 }, +/* NL */ { 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1 }, +/* NU */ { 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1,15,15, 1, 0, 1, 1,15, 1 }, +/* hs */ { 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 2, 1, 1, 1 }, +/* RI */ { 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 
1, 1 }, +/* SQ */ { 1,11, 1, 1, 0, 1, 0,11, 1, 1, 1, 1, 1, 1,13, 1, 1, 1, 1 }, +/* edg*/ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0 } }; #endif /* defined(PERL_IN_REGEXEC_C) */ @@ -87887,8 +87887,8 @@ static const U8 WB_table[19][19] = { * 1a0687fb9c6c4567e853913549df0944fe40821279a3e9cdaa6ab8679bc286fd lib/unicore/extracted/DLineBreak.txt * 40bcfed3ca727c19e1331f6c33806231d5f7eeeabd2e6a9e06a3740c85d0c250 lib/unicore/extracted/DNumType.txt * a18d502bad39d527ac5586d7bc93e29f565859e3bcc24ada627eff606d6f5fed lib/unicore/extracted/DNumValues.txt - * fdff462ac2c512b5990e6276d8175d6a511c14654c31dc2fcfb2f802b9fa5c8e lib/unicore/mktables + * 285aef7ed2bf69724b1fa9bba177640636f666e1a5dd0ba5e538d4790129bbfe lib/unicore/mktables * 462c9aaa608fb2014cd9649af1c5c009485c60b9c8b15b89401fdc10cf6161c6 lib/unicore/version * 913d2f93f3cb6cdf1664db888bf840bc4eb074eef824e082fceda24a9445e60c regen/charset_translations.pl - * 5774f77d07a81945b6a679ecce07ad90cdb334f3fb402ff63bdbecd2ec67da05 regen/mk_invlists.pl + * 12bd58cb9d5a99f631ca95e269f7f9c90dacaf81020efa5d95a995f3cdc19200 regen/mk_invlists.pl * ex: set ro: */ diff --git a/lib/unicore/mktables b/lib/unicore/mktables index d5bf009..0e70a78 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -2289,10 +2289,10 @@ sub trace { return main::trace(@_); } # points not given in the input. If not present, the default from the # normal property is used # - # [4] if present must be the string 'ONLY_EARLY'. Normally, when - # compiling Unicode versions that don't invoke the early handling, the - # name in [1] is added as an alias to the property name used for these. - # This parameter says to not do this. + # [-1] If there is an extra final element that is the string 'ONLY_EARLY'. + # it means to not add the name in [1] as an alias to the property name + # used for these. Normally, when compiling Unicode versions that don't + # invoke the early handling, the name is added as a synonym. 
# # Not all files can be handled in the above way, and so the code ref # alternative is available. It can do whatever it needs to. The other @@ -2304,6 +2304,9 @@ sub trace { return main::trace(@_); } # makes for easier testing later on. main::set_access('early', \%early, 'c'); + my %only_early; + main::set_access('only_early', \%only_early, 'c'); + my %required_even_in_debug_skip; # debug_skip is used to speed up compilation during debugging by skipping # processing files that are not needed for the task at hand. However, @@ -2414,9 +2417,9 @@ sub trace { return main::trace(@_); } my $progress; my $function_instead_of_file = 0; - if ($early{$addr}->@* > 4 && $early{$addr}[4] ne 'ONLY_EARLY') { - Carp::my_carp_bug("If present, element [4] in 'Early => [ ... ]'" - . " must be the string 'ONLY_EARLY'"); + if ($early{$addr}->@* && $early{$addr}[-1] eq 'ONLY_EARLY') { + $only_early{$addr} = 1; + pop $early{$addr}->@*; } # If we are compiling a Unicode release earlier than the file became @@ -2427,6 +2430,8 @@ sub trace { return main::trace(@_); } unshift $early{$addr}->@*, 1; # See the definition of %early for what the array elements mean. + # Note that we have just unshifted onto the array, so the numbers + # below are +1 of those in the %early description. # If we have a property this defines, create a table and default # map for it now (at essentially compile time), so that it will be # available for the whole of run time. (We will want to add this @@ -2445,8 +2450,8 @@ sub trace { return main::trace(@_); } # If not specified by the constructor, use the default mapping # for the regular property for this substitute one. 
- if ($early{$addr}[3]) { - $prop_object->set_default_map($early{$addr}[3]); + if ($early{$addr}[4]) { + $prop_object->set_default_map($early{$addr}[4]); } elsif ( defined $property{$addr} && defined $default_mapping{$property{$addr}}) @@ -2497,7 +2502,7 @@ END push $each_line_handler{$addr}->@*, \&_exclude_unassigned; if ( $v_version lt v2.0 # Hanguls in this release ... - && defined $early{$addr}[4]) # ... need special treatment + && defined $early{$addr}[3]) # ... need special treatment { push $eof_handler{$addr}->@*, \&_fixup_obsolete_hanguls; } @@ -2717,11 +2722,12 @@ END # official property, we still have to allow the publicly # inaccessible early name so that the core code which uses it # will work regardless. - if (! $early{$addr}[0] && $early{$addr}->@* > 2) { + if ( ! $only_early{$addr} + && ! $early{$addr}[0] + && $early{$addr}->@* > 2) + { my $early_property_name = $early{$addr}[2]; - if ( $property{$addr} ne $early_property_name - && $early{$addr}->@* < 5) - { + if ($property{$addr} ne $early_property_name) { main::property_ref($property{$addr}) ->add_alias($early_property_name); } @@ -6977,18 +6983,29 @@ sub trace { return main::trace(@_); } sub set_default_map { # Define what code points that are missing from the input files should - # map to + # map to. The optional second parameter 'full_name' indicates to + # force using the full name of the map instead of its standard name. my $self = shift; my $map = shift; + my $use_full_name = shift // 0; Carp::carp_extra_args(\@_) if main::DEBUG && @_; + if ($use_full_name && $use_full_name ne 'full_name') { + Carp::my_carp_bug("Second parameter to set_default_map() if" + . 
" present, must be 'full_name'"); + } + my $addr = do { no overloading; pack 'J', $self; }; # Convert the input to the standard equivalent, if any (won't have any # for $STRING properties) - my $standard = $self->_find_table_from_alias->{$map}; - $map = $standard->name if defined $standard; + my $standard = $self->property->table($map); + if (defined $standard) { + $map = ($use_full_name) + ? $standard->full_name + : $standard->name; + } # Warn if there already is a non-equivalent default map for this # property. Note that a default map can be a ref, which means that @@ -10604,8 +10621,8 @@ ea ; W ; Wide END } - if (-e 'LineBreak.txt') { - push @return, split /\n/, <<'END'; + if (-e 'LineBreak.txt' || -e 'LBsubst.txt') { + my @lb = split /\n/, <<'END'; lb ; AI ; Ambiguous lb ; AL ; Alphabetic lb ; B2 ; Break_Both @@ -10636,6 +10653,12 @@ lb ; SY ; Break_Symbols lb ; XX ; Unknown lb ; ZW ; ZWSpace END + # If this Unicode version predates the lb property, we use our + # substitute one + if (-e 'LBsubst.txt') { + $_ = s/^lb/_Perl_LB/r for @lb; + } + push @return, @lb; } if (-e 'DNormalizationProps.txt') { @@ -12149,6 +12172,24 @@ sub process_NamedSequences { } } +sub filter_substitute_lb { + # Used on Unicodes that predate the LB property, where there is a + # substitute file. This just does the regular ea_lb handling for such + # files, and then substitutes the long property value name for the short + # one that comes with the file. (The other break files have the long + # names in them, so this is the odd one out.) The reason for doing this + # kludge is that regen/mk_invlists.pl is expecting the long name. This + # also fixes the typo 'Inseperable' that leads to problems. 
+ + filter_early_ea_lb; + return unless $_; + + my @fields = split /\s*;\s*/; + $fields[1] = property_ref('_Perl_LB')->table($fields[1])->full_name; + $fields[1] = 'Inseparable' if lc $fields[1] eq 'inseperable'; + $_ = join '; ', @fields; +} + sub filter_old_style_arabic_shaping { # Early versions used a different term for the later one. @@ -15051,8 +15092,8 @@ END 0x2060 .. 0x206F, 0xFE00 .. 0xFE0F, 0xFFF0 .. 0xFFFB, - 0xE0000 .. 0xE0FFF, ]); + $temp->add_range(0xE0000, 0xE0FFF) if $v_version ge v2.0; $quotemeta += $temp; } calculate_DI(); @@ -15111,6 +15152,10 @@ END # SA CM Only Mn or Mc # SA AL Any except Mn and Mc # CJ NS Any + # + # All property values are also written out in their long form, as + # regen/mk_invlist.pl expects that. This also fixes occurrences of the + # typo in early Unicode versions: 'inseperable'. my $perl_lb = property_ref('_Perl_LB'); if (! defined $perl_lb) { $perl_lb = Property->new('_Perl_LB', @@ -15119,36 +15164,40 @@ END Directory => $map_directory, Type => $STRING); my $lb = property_ref('Line_Break'); - $perl_lb->initialize($lb); + + # Populate from $lb, but use full name and fix typo. 
+ foreach my $range ($lb->ranges) { + my $full_name = $lb->table($range->value)->full_name; + $full_name = 'Inseparable' + if standardize($full_name) eq 'inseperable'; + $perl_lb->add_map($range->start, $range->end, $full_name); + } } - $perl_lb->set_default_map('AL'); - # It's a little iffy relying on Unicode to not change which property value - # synonym they use, but if they do, tests should start failing and we can - # fix this up + $perl_lb->set_default_map('Alphabetic', 'full_name'); # XX -> AL + for my $range ($perl_lb->ranges) { my $value = standardize($range->value); if ( $value eq standardize('Unknown') - || $value eq standardize('XX') - || $value eq standardize('AI') - || $value eq standardize('SG')) + || $value eq standardize('Ambiguous') + || $value eq standardize('Surrogate')) { - $perl_lb->add_map($range->start, $range->end, 'AL', + $perl_lb->add_map($range->start, $range->end, 'Alphabetic', Replace => $UNCONDITIONALLY); } - elsif ($value eq standardize('CJ')) { - $perl_lb->add_map($range->start, $range->end, 'NS', + elsif ($value eq standardize('Conditional_Japanese_Starter')) { + $perl_lb->add_map($range->start, $range->end, 'Nonstarter', Replace => $UNCONDITIONALLY); } - elsif ($value eq standardize('SA')) { + elsif ($value eq standardize('Complex_Context')) { for my $i ($range->start .. $range->end) { my $gc_val = $gc->value_of($i); if ($gc_val eq 'Mn' || $gc_val eq 'Mc') { - $perl_lb->add_map($i, $i, 'CM', + $perl_lb->add_map($i, $i, 'Combining_Mark', Replace => $UNCONDITIONALLY); } else { - $perl_lb->add_map($i, $i, 'AL', + $perl_lb->add_map($i, $i, 'Alphabetic', Replace => $UNCONDITIONALLY); } } @@ -18915,11 +18964,16 @@ my @input_file_objects = ( Has_Missings_Defaults => $NOT_IGNORED, Property => 'Line_Break', # Early versions had problematic syntax - Each_Line_Handler => (($v_version lt v3.1.0) - ? 
\&filter_early_ea_lb - : undef), - Early => [ "LBsubst.txt", '_Perl_LB', 'AL', - 'AL', # default + Each_Line_Handler => ($v_version ge v3.1.0) + ? undef + : ($v_version lt v3.0.0) + ? \&filter_substitute_lb + : \&filter_early_ea_lb, + # Must use long names for property values see comments at + # sub filter_substitute_lb + Early => [ "LBsubst.txt", '_Perl_LB', 'Alphabetic', + 'Alphabetic', # default to this because XX -> + # AL # Don't use _Perl_LB as a synonym for # Line_Break in later perls, as it is tailored diff --git a/locale.c b/locale.c index 20e5e98..6de9893 100644 --- a/locale.c +++ b/locale.c @@ -29,7 +29,9 @@ * in such scope than if not. However, various libc functions called by Perl * are affected by the LC_NUMERIC category, so there are macros in perl.h that * are used to toggle between the current locale and the C locale depending on - * the desired behavior of those functions at the moment. + * the desired behavior of those functions at the moment. And, LC_MESSAGES is + * switched to the C locale for outputting the message unless within the scope + * of 'use locale'. 
*/ #include "EXTERN.h" diff --git a/regcharclass.h b/regcharclass.h index d7925a2..36fa1fd 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -1895,7 +1895,7 @@ * 1a0687fb9c6c4567e853913549df0944fe40821279a3e9cdaa6ab8679bc286fd lib/unicore/extracted/DLineBreak.txt * 40bcfed3ca727c19e1331f6c33806231d5f7eeeabd2e6a9e06a3740c85d0c250 lib/unicore/extracted/DNumType.txt * a18d502bad39d527ac5586d7bc93e29f565859e3bcc24ada627eff606d6f5fed lib/unicore/extracted/DNumValues.txt - * fdff462ac2c512b5990e6276d8175d6a511c14654c31dc2fcfb2f802b9fa5c8e lib/unicore/mktables + * 285aef7ed2bf69724b1fa9bba177640636f666e1a5dd0ba5e538d4790129bbfe lib/unicore/mktables * 462c9aaa608fb2014cd9649af1c5c009485c60b9c8b15b89401fdc10cf6161c6 lib/unicore/version * 913d2f93f3cb6cdf1664db888bf840bc4eb074eef824e082fceda24a9445e60c regen/charset_translations.pl * d9c04ac46bdd81bb3e26519f2b8eb6242cb12337205add3f7cf092b0c58dccc4 regen/regcharclass.pl diff --git a/regen/mk_invlists.pl b/regen/mk_invlists.pl index d508e9f..09d2961 100644 --- a/regen/mk_invlists.pl +++ b/regen/mk_invlists.pl @@ -40,6 +40,8 @@ my $out_fh = open_new('charclass_invlists.h', '>', my $in_file_pound_if = 0; +my $max_hdr_len = 3; # In headings, how wide a name is allowed? + print $out_fh "/* See the generating file for comments */\n\n"; # The symbols generated by this program are all currently defined only in a @@ -57,12 +59,15 @@ my %exceptions_to_where_to_define = ); # This hash contains the properties with enums that have hard-coded references -# to them in C code. Its only use is to make sure that if perl is compiled +# to them in C code. It is neeed to make sure that if perl is compiled # with an older Unicode data set, that all the enum values the code is # expecting will still be in the enum typedef. Thus the code doesn't have to -# change. The Unicode version won't have any code points that have these enum -# values, so the code that handles them will not get exercised. This is far -# better than having to #ifdef things. 
+# change. The Unicode version won't have any code points that have the enum +# values not in that version, so the code that handles them will not get +# exercised. This is far better than having to #ifdef things. The names here +# should be the long names of the respective property values. The reason for +# this is because regexec.c uses them as case labels, and the long name is +# generally more understandable than the short. my %hard_coded_enums = ( gcb => [ 'Control', @@ -157,10 +162,13 @@ my %hard_coded_enums = my %gcb_enums; my @gcb_short_enums; +my %gcb_abbreviations; my %lb_enums; my @lb_short_enums; +my %lb_abbreviations; my %wb_enums; my @wb_short_enums; +my %wb_abbreviations; my @a2n; @@ -288,33 +296,37 @@ sub output_invmap ($$$$$$$) { else { @enums = uniques(@$invmap); } + if (! @enums) { die "Only enum properties are currently handled; '$prop_name' isn't one"; } else { - - # Convert short names to long - @enums = map { (prop_value_aliases($prop_name, $_))[1] } @enums; - my @expected_enums = @{$hard_coded_enums{lc $short_name}}; - die 'You need to update %hard_coded_enums to reflect new entries in this Unicode version' - if @expected_enums < @enums; - - # Remove the enums found in the input from the ones we expect - for (my $i = @expected_enums - 1; $i >= 0; $i--) { - splice(@expected_enums, $i, 1) - if grep { $expected_enums[$i] eq $_ } @enums; - } + my @canonical_input_enums; + if (@expected_enums) { + if (@expected_enums < @enums) { + die 'You need to update %hard_coded_enums to reflect new' + . " entries in this Unicode version\n" + . "Expected: " . join(", ", sort @expected_enums) . "\n" + . " Got: " . join(", ", sort @enums); + } - # The ones remaining must be because we're using an older - # Unicode version. Add them to the list. - push @enums, @expected_enums; + if (! defined prop_aliases($prop_name)) { - # Add in the extra values coded into this program, and sort. 
- @enums = sort @enums; + # Convert the input enums into canonical form and + # save for use below + @canonical_input_enums = map { lc ($_ =~ s/_//gr) } + @enums; + } + @enums = sort @expected_enums; + } - # The internal enums comes last. - push @enums, split /,/, $extra_enums if $extra_enums ne ""; + # The internal enums come last, and in the order specified + my @extras; + if ($extra_enums ne "") { + @extras = split /,/, $extra_enums; + push @enums, @extras; + } # Assign a value to each element of the enum. The default # value always gets 0; the others are arbitrarily assigned. @@ -326,35 +338,102 @@ sub output_invmap ($$$$$$$) { $enums{$enum} = $enum_val++ unless exists $enums{$enum}; } - # Calculate the enum values for properties _Perl_GCB and - # _Perl_LB because we output special tables for them - if ($name eq '_Perl_GCB' && ! %gcb_enums) { - while (my ($enum, $value) = each %enums) { - my ($short) = prop_value_aliases('GCB', $enum); - $short = lc $enum unless defined $short; - $gcb_enums{$short} = $value; - @gcb_short_enums[$value] = $short; - } - } - elsif ($name eq '_Perl_LB' && ! %lb_enums) { - while (my ($enum, $value) = each %enums) { - my ($short) = prop_value_aliases('LB', $enum); - $short = substr(lc $enum, 0, 2) unless defined $short; - $lb_enums{$short} = $value; - @lb_short_enums[$value] = $short; - } - } - elsif ($name eq '_Perl_WB' && ! %wb_enums) { - while (my ($enum, $value) = each %enums) { - my ($short) = prop_value_aliases('WB', $enum); - $short = lc $enum unless defined $short; - $short = substr($short, 0, 2); - - # Special case a better name than the kludgy one - $short = 'hs' if $short eq 'pe'; - - $wb_enums{$short} = $value; - @wb_short_enums[$value] = $short; + # Calculate the enum values for certain properties like + # _Perl_GCB and _Perl_LB, because we output special tables for + # them. + if ($name =~ / ^ _Perl_ (?: GCB | LB | WB ) $ /x) { + + # We use string evals to allow the same code to work on + # all tables we're doing. 
+ my $type = lc $prop_name; + + # We use lowercase single letter names for any property + # values not in the release of Unicode being compiled now. + my $placeholder = "a"; + + # Skip if we've already done this code, which populated + # this hash + if (eval "! \%${type}_enums") { + + # For each enum ... + foreach my $enum (sort keys %enums) { + my $value = $enums{$enum}; + my $short; + my $abbreviated_from; + + # Special case this wb property value to make the + # name more clear + if ($enum eq 'Perl_Tailored_HSpace') { + $short = 'hs'; + $abbreviated_from = $enum; + } + elsif (grep { $_ eq $enum } @extras) { + + # The 'short' name for one of the property + # values added by this file is just the + # lowercase of it + $short = lc $enum; + } + elsif (grep {$_ eq lc ( $enum =~ s/_//gr) } + @canonical_input_enums) + { # On Unicode versions that predate the + # official property, we have set up this array + # to be the canonical form of each enum in the + # substitute property. If the enum we're + # looking at is canonically the same as one of + # these, use its name instead of generating a + # placeholder one in the next clause (which + # will happen because prop_value_aliases() + # will fail because it only works on official + # properties) + $short = $enum; + } + else { + # Use the official short name for the other + # property values, which should all be + # official ones. + ($short) = prop_value_aliases($type, $enum); + + # But create a placeholder for ones not in + # this Unicode version. 
+ $short = $placeholder++ unless defined $short; + } + + # If our short name is too long, or we already + # know that the name is an abbreviation, truncate + # to make sure it's short enough, and remember + # that we did this so we can later place in a + # comment in the generated file + if ( $abbreviated_from + || length $short > $max_hdr_len) + { + $short = substr($short, 0, $max_hdr_len); + $abbreviated_from = $enum + unless $abbreviated_from; + # If the name we are to display conflicts, try + # another. + while (eval "exists + \$${type}_abbreviations{$short}") + { + die $@ if $@; + $short++; + } + + eval "\$${type}_abbreviations{$short} = '$enum'"; + die $@ if $@; + } + + # Remember the mapping from the property value + # (enum) name to its value. + eval "\$${type}_enums{$enum} = $value"; + die $@ if $@; + + # Remember the inverse mapping to the short name + # so that we can properly label the generated + # table's rows and columns + eval "\$${type}_short_enums[$value] = '$short'"; + die $@ if $@; + } } } } @@ -480,6 +559,163 @@ sub UpperLatin1 { return mk_invlist_from_sorted_cp_list([ 128 .. 255 ]); } +sub output_table_common { + + # Common subroutine to actually output the generated rules table. + + my ($property, + $table_value_defines_ref, + $table_ref, + $names_ref, + $abbreviations_ref) = @_; + my $size = @$table_ref; + + # Output the #define list, sorted by numeric value + if ($table_value_defines_ref) { + my $max_name_length = 0; + my @defines; + + # Put in order, and at the same time find the longest name + while (my ($enum, $value) = each %$table_value_defines_ref) { + $defines[$value] = $enum; + + my $length = length $enum; + $max_name_length = $length if $length > $max_name_length; + } + + print $out_fh "\n"; + + # Output, so that the values are vertically aligned in a column after + # the longest name + foreach my $i (0 .. 
@defines - 1) { + next unless defined $defines[$i]; + printf $out_fh "#define %-*s %2d\n", + $max_name_length, + $defines[$i], + $i; + } + } + + my $column_width = 2; # We currently allow 2 digits for the number + + # If the maximum value in the table is 1, it can be a bool. (Being above + # a U8 is not currently handled + my $max_element = 0; + for my $i (0 .. $size - 1) { + for my $j (0 .. $size - 1) { + next if $max_element >= $table_ref->[$i][$j]; + $max_element = $table_ref->[$i][$j]; + } + } + die "Need wider table column width given '$max_element" + if length $max_element > $column_width; + + my $table_type = ($max_element == 1) + ? 'bool' + : 'U8'; + + # If a name is longer than the width set aside for a column, its column + # needs to have increased spacing so that the name doesn't get truncated + # nor run into an adjacent column + my @spacers; + + # If we are being compiled on a Unicode version earlier than that which + # this file was designed for, it may be that some of the property values + # aren't in the current release, and so would be undefined if we didn't + # define them ourselves. Earlier code has done this, making them + # lowercase characters of length one. We look to see if any exist, so + # that we can add an annotation to the output table + my $has_placeholder = 0; + + for my $i (0 .. $size - 1) { + no warnings 'numeric'; + $has_placeholder = 1 if $names_ref->[$i] =~ / ^ [[:lower:]] $ /ax; + $spacers[$i] = " " x (length($names_ref->[$i]) - $column_width); + } + + print $out_fh "\nstatic const $table_type ${property}_table[$size][$size] = {\n"; + + # Calculate the column heading line + my $header_line = "/* " + . (" " x $max_hdr_len) # We let the row heading meld to + # the '*/' for those that are at + # the max + . " " x 3; # Space for '*/ ' + # Now each column + for my $i (0 .. 
$size - 1) { + $header_line .= sprintf "%s%*s", + $spacers[$i], + $column_width + 1, # 1 for the ',' + $names_ref->[$i]; + } + $header_line .= " */\n"; + + # If we have annotations, output it now. + if ($has_placeholder || scalar %$abbreviations_ref) { + my $text = ""; + foreach my $abbr (sort keys %$abbreviations_ref) { + $text .= "; " if $text; + $text .= "'$abbr' stands for '$abbreviations_ref->{$abbr}'"; + } + if ($has_placeholder) { + $text .= "; other " if $text; + $text .= "lowercase names are placeholders for" + . " property values not defined until a later Unicode" + . " release, so are irrelevant in this one, as they are" + . " not assigned to any code points"; + } + + my $indent = " " x 3; + $text = $indent . "/* $text */"; + + # Wrap the text so that it is no wider than the table, which the + # header line gives. + my $output_width = length $header_line; + while (length $text > $output_width) { + my $cur_line = substr($text, 0, $output_width); + + # Find the first blank back from the right end to wrap at. + for (my $i = $output_width -1; $i > 0; $i--) { + if (substr($text, $i, 1) eq " ") { + print $out_fh substr($text, 0, $i), "\n"; + + # Set so will look at just the remaining tail (which will + # be indented and have a '*' after the indent + $text = $indent . " * " . substr($text, $i + 1); + last; + } + } + } + + # And any remaining + print $out_fh $text, "\n" if $text; + } + + # We calculated the header line earlier just to get its width so that we + # could make sure the annotations fit into that. + print $out_fh $header_line; + + # Now output the bulk of the table. + for my $i (0 .. $size - 1) { + + # First the row heading. + printf $out_fh "/* %-*s*/ ", $max_hdr_len, $names_ref->[$i]; + print $out_fh "{"; # Then the brace for this row + + # Then each column + for my $j (0 .. 
$size -1) { + print $out_fh $spacers[$j]; + printf $out_fh "%*d", $column_width, $table_ref->[$i][$j]; + print $out_fh "," if $j < $size - 1; + } + print $out_fh " }"; + print $out_fh "," if $i < $size - 1; + print $out_fh "\n"; + } + + print $out_fh "};\n"; +} + sub output_GCB_table() { # Create and output the pair table for use in determining Grapheme Cluster @@ -506,14 +742,15 @@ sub output_GCB_table() { # GB9a à SpacingMark # GB9b Prepend à for my $i (0 .. @gcb_table - 1) { - $gcb_table[$i][$gcb_enums{'EX'}] = 0; - $gcb_table[$i][$gcb_enums{'SM'}] = 0; - $gcb_table[$gcb_enums{'PP'}][$i] = 0; + $gcb_table[$i][$gcb_enums{'Extend'}] = 0; + $gcb_table[$i][$gcb_enums{'SpacingMark'}] = 0; + $gcb_table[$gcb_enums{'Prepend'}][$i] = 0; } # Do not break between regional indicator symbols. # GB8a Regional_Indicator à Regional_Indicator - $gcb_table[$gcb_enums{'RI'}][$gcb_enums{'RI'}] = 0; + $gcb_table[$gcb_enums{'Regional_Indicator'}] + [$gcb_enums{'Regional_Indicator'}] = 0; # Do not break Hangul syllable sequences. # GB8 ( LVT | T) à T @@ -532,14 +769,15 @@ sub output_GCB_table() { $gcb_table[$gcb_enums{'L'}][$gcb_enums{'LV'}] = 0; $gcb_table[$gcb_enums{'L'}][$gcb_enums{'LVT'}] = 0; - # Do not break between a CR and LF. Otherwise, break before and after controls. + # Do not break between a CR and LF. Otherwise, break before and after + # controls. # GB5 ÷ ( Control | CR | LF ) # GB4 ( Control | CR | LF ) ÷ for my $i (0 .. @gcb_table - 1) { - $gcb_table[$i][$gcb_enums{'CN'}] = 1; + $gcb_table[$i][$gcb_enums{'Control'}] = 1; $gcb_table[$i][$gcb_enums{'CR'}] = 1; $gcb_table[$i][$gcb_enums{'LF'}] = 1; - $gcb_table[$gcb_enums{'CN'}][$i] = 1; + $gcb_table[$gcb_enums{'Control'}][$i] = 1; $gcb_table[$gcb_enums{'CR'}][$i] = 1; $gcb_table[$gcb_enums{'LF'}][$i] = 1; } @@ -551,30 +789,15 @@ sub output_GCB_table() { # GB1 sot ÷ # GB2 ÷ eot for my $i (0 .. 
@gcb_table - 1) { - $gcb_table[$i][$gcb_enums{'edge'}] = 1; - $gcb_table[$gcb_enums{'edge'}][$i] = 1; + $gcb_table[$i][$gcb_enums{'EDGE'}] = 1; + $gcb_table[$gcb_enums{'EDGE'}][$i] = 1; } # But, unspecified by Unicode, we shouldn't break on an empty string. - $gcb_table[$gcb_enums{'edge'}][$gcb_enums{'edge'}] = 0; - - print $out_fh "\nstatic const bool GCB_table[$table_size][$table_size] = {\n"; - print $out_fh "/* "; - for my $i (0 .. @gcb_table - 1) { - printf $out_fh "%5s", $gcb_short_enums[$i]; - } - print $out_fh "*/\n"; + $gcb_table[$gcb_enums{'EDGE'}][$gcb_enums{'EDGE'}] = 0; - for my $i (0 .. @gcb_table - 1) { - printf $out_fh "/*%4s*/ ", $gcb_short_enums[$i]; - print $out_fh "{"; - print $out_fh join ", ", map sprintf("%3d", $_), @{ $gcb_table[$i] }; - print $out_fh "}"; - print $out_fh "," if $i < @gcb_table - 1; - print $out_fh "\n"; - } - - print $out_fh "};\n"; + output_table_common('GCB', undef, + \@gcb_table, \@gcb_short_enums, \%gcb_abbreviations); } sub output_LB_table() { @@ -614,19 +837,6 @@ sub output_LB_table() { LB_various_then_PO_or_PR => (1<<4), # Rule 25 ); - # Output the #define list, sorted by numeric value - my @defines; - while (my ($enum, $value) = each %lb_actions) { - $defines[$value] = $enum; - } - - print $out_fh "\n"; - - foreach my $i (0 .. @defines - 1) { - next unless defined $defines[$i]; - print $out_fh "#define $defines[$i]\t$i\n"; - } - # Construct the LB pair table. This is based on the rules in # http://www.unicode.org/reports/tr14/, but modified as those rules are # designed for someone taking a string of text and sequentially going @@ -649,53 +859,81 @@ sub output_LB_table() { } # LB30a. Don't break between Regional Indicators - $lb_table[$lb_enums{'RI'}][$lb_enums{'RI'}] = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'Regional_Indicator'}] + [$lb_enums{'Regional_Indicator'}] = $lb_actions{'LB_NOBREAK'}; # LB30 Do not break between letters, numbers, or ordinary symbols and # opening or closing parentheses. 
# (AL | HL | NU) × OP - $lb_table[$lb_enums{'AL'}][$lb_enums{'OP'}] = $lb_actions{'LB_NOBREAK'}; - $lb_table[$lb_enums{'HL'}][$lb_enums{'OP'}] = $lb_actions{'LB_NOBREAK'}; - $lb_table[$lb_enums{'NU'}][$lb_enums{'OP'}] = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Open_Punctuation'}] + = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Open_Punctuation'}] + = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Open_Punctuation'}] + = $lb_actions{'LB_NOBREAK'}; # CP × (AL | HL | NU) - $lb_table[$lb_enums{'CP'}][$lb_enums{'AL'}] = $lb_actions{'LB_NOBREAK'}; - $lb_table[$lb_enums{'CP'}][$lb_enums{'HL'}] = $lb_actions{'LB_NOBREAK'}; - $lb_table[$lb_enums{'CP'}][$lb_enums{'NU'}] = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Alphabetic'}] + = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Hebrew_Letter'}] + = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Numeric'}] + = $lb_actions{'LB_NOBREAK'}; # LB29 Do not break between numeric punctuation and alphabetics (“e.g.”). # IS × (AL | HL) - $lb_table[$lb_enums{'IS'}][$lb_enums{'AL'}] = $lb_actions{'LB_NOBREAK'}; - $lb_table[$lb_enums{'IS'}][$lb_enums{'HL'}] = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Alphabetic'}] + = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Hebrew_Letter'}] + = $lb_actions{'LB_NOBREAK'}; # LB28 Do not break between alphabetics (“at”). 
# (AL | HL) à (AL | HL) - $lb_table[$lb_enums{'AL'}][$lb_enums{'AL'}] = $lb_actions{'LB_NOBREAK'}; - $lb_table[$lb_enums{'HL'}][$lb_enums{'AL'}] = $lb_actions{'LB_NOBREAK'}; - $lb_table[$lb_enums{'AL'}][$lb_enums{'HL'}] = $lb_actions{'LB_NOBREAK'}; - $lb_table[$lb_enums{'HL'}][$lb_enums{'HL'}] = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Alphabetic'}] + = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Alphabetic'}] + = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Hebrew_Letter'}] + = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Hebrew_Letter'}] + = $lb_actions{'LB_NOBREAK'}; # LB27 Treat a Korean Syllable Block the same as ID. # (JL | JV | JT | H2 | H3) à IN - $lb_table[$lb_enums{'JL'}][$lb_enums{'IN'}] = $lb_actions{'LB_NOBREAK'}; - $lb_table[$lb_enums{'JV'}][$lb_enums{'IN'}] = $lb_actions{'LB_NOBREAK'}; - $lb_table[$lb_enums{'JT'}][$lb_enums{'IN'}] = $lb_actions{'LB_NOBREAK'}; - $lb_table[$lb_enums{'H2'}][$lb_enums{'IN'}] = $lb_actions{'LB_NOBREAK'}; - $lb_table[$lb_enums{'H3'}][$lb_enums{'IN'}] = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'JL'}][$lb_enums{'Inseparable'}] + = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'JV'}][$lb_enums{'Inseparable'}] + = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'JT'}][$lb_enums{'Inseparable'}] + = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'H2'}][$lb_enums{'Inseparable'}] + = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'H3'}][$lb_enums{'Inseparable'}] + = $lb_actions{'LB_NOBREAK'}; # (JL | JV | JT | H2 | H3) à PO - $lb_table[$lb_enums{'JL'}][$lb_enums{'PO'}] = $lb_actions{'LB_NOBREAK'}; - $lb_table[$lb_enums{'JV'}][$lb_enums{'PO'}] = $lb_actions{'LB_NOBREAK'}; - $lb_table[$lb_enums{'JT'}][$lb_enums{'PO'}] = $lb_actions{'LB_NOBREAK'}; - $lb_table[$lb_enums{'H2'}][$lb_enums{'PO'}] = $lb_actions{'LB_NOBREAK'}; - $lb_table[$lb_enums{'H3'}][$lb_enums{'PO'}] = 
$lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'JL'}][$lb_enums{'Postfix_Numeric'}] + = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'JV'}][$lb_enums{'Postfix_Numeric'}] + = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'JT'}][$lb_enums{'Postfix_Numeric'}] + = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'H2'}][$lb_enums{'Postfix_Numeric'}] + = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'H3'}][$lb_enums{'Postfix_Numeric'}] + = $lb_actions{'LB_NOBREAK'}; # PR à (JL | JV | JT | H2 | H3) - $lb_table[$lb_enums{'PR'}][$lb_enums{'JL'}] = $lb_actions{'LB_NOBREAK'}; - $lb_table[$lb_enums{'PR'}][$lb_enums{'JV'}] = $lb_actions{'LB_NOBREAK'}; - $lb_table[$lb_enums{'PR'}][$lb_enums{'JT'}] = $lb_actions{'LB_NOBREAK'}; - $lb_table[$lb_enums{'PR'}][$lb_enums{'H2'}] = $lb_actions{'LB_NOBREAK'}; - $lb_table[$lb_enums{'PR'}][$lb_enums{'H3'}] = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'JL'}] + = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'JV'}] + = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'JT'}] + = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'H2'}] + = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'H3'}] + = $lb_actions{'LB_NOBREAK'}; # LB26 Do not break a Korean syllable. # JL à (JL | JV | H2 | H3) @@ -719,135 +957,165 @@ sub output_LB_table() { # http://www.unicode.org/reports/tr14/#Examples # We follow that tailoring because Unicode's test cases expect it # (PR | PO) à ( OP | HY )? NU - $lb_table[$lb_enums{'PR'}][$lb_enums{'NU'}] = $lb_actions{'LB_NOBREAK'}; - $lb_table[$lb_enums{'PO'}][$lb_enums{'NU'}] = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Numeric'}] + = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Numeric'}] + = $lb_actions{'LB_NOBREAK'}; # Given that (OP | HY )? is optional, we have to test for it in code. 
# We add in the action (instead of overriding) for this, so that in # the code we can recover the underlying break value. - $lb_table[$lb_enums{'PR'}][$lb_enums{'OP'}] + $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Open_Punctuation'}] += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'}; - $lb_table[$lb_enums{'PO'}][$lb_enums{'OP'}] + $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Open_Punctuation'}] += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'}; - $lb_table[$lb_enums{'PR'}][$lb_enums{'HY'}] + $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Hyphen'}] += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'}; - $lb_table[$lb_enums{'PO'}][$lb_enums{'HY'}] + $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Hyphen'}] += $lb_actions{'LB_PR_or_PO_then_OP_or_HY'}; # ( OP | HY ) à NU - $lb_table[$lb_enums{'OP'}][$lb_enums{'NU'}] = $lb_actions{'LB_NOBREAK'}; - $lb_table[$lb_enums{'HY'}][$lb_enums{'NU'}] = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'Open_Punctuation'}][$lb_enums{'Numeric'}] + = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'Hyphen'}][$lb_enums{'Numeric'}] + = $lb_actions{'LB_NOBREAK'}; # NU (NU | SY | IS)* à (NU | SY | IS | CL | CP ) # which can be rewritten as: # NU (SY | IS)* à (NU | SY | IS | CL | CP ) - $lb_table[$lb_enums{'NU'}][$lb_enums{'NU'}] = $lb_actions{'LB_NOBREAK'}; - $lb_table[$lb_enums{'NU'}][$lb_enums{'SY'}] = $lb_actions{'LB_NOBREAK'}; - $lb_table[$lb_enums{'NU'}][$lb_enums{'IS'}] = $lb_actions{'LB_NOBREAK'}; - $lb_table[$lb_enums{'NU'}][$lb_enums{'CL'}] = $lb_actions{'LB_NOBREAK'}; - $lb_table[$lb_enums{'NU'}][$lb_enums{'CP'}] = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Numeric'}] + = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Break_Symbols'}] + = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Infix_Numeric'}] + = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Close_Punctuation'}] + = $lb_actions{'LB_NOBREAK'}; + 
$lb_table[$lb_enums{'Numeric'}][$lb_enums{'Close_Parenthesis'}] + = $lb_actions{'LB_NOBREAK'}; # Like earlier where we have to test in code, we add in the action so # that we can recover the underlying values. This is done in rules # below, as well. The code assumes that we haven't added 2 actions. # Shoul a later Unicode release break that assumption, then tests # should start failing. - $lb_table[$lb_enums{'SY'}][$lb_enums{'NU'}] + $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Numeric'}] += $lb_actions{'LB_SY_or_IS_then_various'}; - $lb_table[$lb_enums{'SY'}][$lb_enums{'SY'}] + $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Break_Symbols'}] += $lb_actions{'LB_SY_or_IS_then_various'}; - $lb_table[$lb_enums{'SY'}][$lb_enums{'IS'}] + $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Infix_Numeric'}] += $lb_actions{'LB_SY_or_IS_then_various'}; - $lb_table[$lb_enums{'SY'}][$lb_enums{'CL'}] + $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Close_Punctuation'}] += $lb_actions{'LB_SY_or_IS_then_various'}; - $lb_table[$lb_enums{'SY'}][$lb_enums{'CP'}] + $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Close_Parenthesis'}] += $lb_actions{'LB_SY_or_IS_then_various'}; - $lb_table[$lb_enums{'IS'}][$lb_enums{'NU'}] + $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Numeric'}] += $lb_actions{'LB_SY_or_IS_then_various'}; - $lb_table[$lb_enums{'IS'}][$lb_enums{'SY'}] + $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Break_Symbols'}] += $lb_actions{'LB_SY_or_IS_then_various'}; - $lb_table[$lb_enums{'IS'}][$lb_enums{'IS'}] + $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Infix_Numeric'}] += $lb_actions{'LB_SY_or_IS_then_various'}; - $lb_table[$lb_enums{'IS'}][$lb_enums{'CL'}] + $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Close_Punctuation'}] += $lb_actions{'LB_SY_or_IS_then_various'}; - $lb_table[$lb_enums{'IS'}][$lb_enums{'CP'}] + $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Close_Parenthesis'}] += $lb_actions{'LB_SY_or_IS_then_various'}; # NU (NU | SY | IS)* (CL 
| CP)? × (PO | PR) # which can be rewritten as: # NU (SY | IS)* (CL | CP)? × (PO | PR) - $lb_table[$lb_enums{'NU'}][$lb_enums{'PO'}] = $lb_actions{'LB_NOBREAK'}; - $lb_table[$lb_enums{'NU'}][$lb_enums{'PR'}] = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Postfix_Numeric'}] + = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Prefix_Numeric'}] + = $lb_actions{'LB_NOBREAK'}; - $lb_table[$lb_enums{'CP'}][$lb_enums{'PO'}] + $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Postfix_Numeric'}] += $lb_actions{'LB_various_then_PO_or_PR'}; - $lb_table[$lb_enums{'CL'}][$lb_enums{'PO'}] + $lb_table[$lb_enums{'Close_Punctuation'}][$lb_enums{'Postfix_Numeric'}] += $lb_actions{'LB_various_then_PO_or_PR'}; - $lb_table[$lb_enums{'IS'}][$lb_enums{'PO'}] + $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Postfix_Numeric'}] += $lb_actions{'LB_various_then_PO_or_PR'}; - $lb_table[$lb_enums{'SY'}][$lb_enums{'PO'}] + $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Postfix_Numeric'}] += $lb_actions{'LB_various_then_PO_or_PR'}; - $lb_table[$lb_enums{'CP'}][$lb_enums{'PR'}] + $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Prefix_Numeric'}] += $lb_actions{'LB_various_then_PO_or_PR'}; - $lb_table[$lb_enums{'CL'}][$lb_enums{'PR'}] + $lb_table[$lb_enums{'Close_Punctuation'}][$lb_enums{'Prefix_Numeric'}] += $lb_actions{'LB_various_then_PO_or_PR'}; - $lb_table[$lb_enums{'IS'}][$lb_enums{'PR'}] + $lb_table[$lb_enums{'Infix_Numeric'}][$lb_enums{'Prefix_Numeric'}] += $lb_actions{'LB_various_then_PO_or_PR'}; - $lb_table[$lb_enums{'SY'}][$lb_enums{'PR'}] + $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Prefix_Numeric'}] += $lb_actions{'LB_various_then_PO_or_PR'}; # LB24 Do not break between prefix and letters or ideographs. 
# PR à ID - $lb_table[$lb_enums{'PR'}][$lb_enums{'ID'}] = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Ideographic'}] + = $lb_actions{'LB_NOBREAK'}; # PR à (AL | HL) - $lb_table[$lb_enums{'PR'}][$lb_enums{'AL'}] = $lb_actions{'LB_NOBREAK'}; - $lb_table[$lb_enums{'PR'}][$lb_enums{'HL'}] = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Alphabetic'}] + = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'Prefix_Numeric'}][$lb_enums{'Hebrew_Letter'}] + = $lb_actions{'LB_NOBREAK'}; # PO à (AL | HL) - $lb_table[$lb_enums{'PO'}][$lb_enums{'AL'}] = $lb_actions{'LB_NOBREAK'}; - $lb_table[$lb_enums{'PO'}][$lb_enums{'HL'}] = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Alphabetic'}] + = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'Postfix_Numeric'}][$lb_enums{'Hebrew_Letter'}] + = $lb_actions{'LB_NOBREAK'}; # LB23 Do not break within âa9â, â3aâ, or âH%â. # ID à PO - $lb_table[$lb_enums{'ID'}][$lb_enums{'PO'}] = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'Ideographic'}][$lb_enums{'Postfix_Numeric'}] + = $lb_actions{'LB_NOBREAK'}; # (AL | HL) à NU - $lb_table[$lb_enums{'AL'}][$lb_enums{'NU'}] = $lb_actions{'LB_NOBREAK'}; - $lb_table[$lb_enums{'HL'}][$lb_enums{'NU'}] = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Numeric'}] + = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Numeric'}] + = $lb_actions{'LB_NOBREAK'}; # NU à (AL | HL) - $lb_table[$lb_enums{'NU'}][$lb_enums{'AL'}] = $lb_actions{'LB_NOBREAK'}; - $lb_table[$lb_enums{'NU'}][$lb_enums{'HL'}] = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Alphabetic'}] + = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Hebrew_Letter'}] + = $lb_actions{'LB_NOBREAK'}; # LB22 Do not break between two ellipses, or between letters, numbers or # exclamations and ellipsis. 
# (AL | HL) × IN - $lb_table[$lb_enums{'AL'}][$lb_enums{'IN'}] = $lb_actions{'LB_NOBREAK'}; - $lb_table[$lb_enums{'HL'}][$lb_enums{'IN'}] = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'Alphabetic'}][$lb_enums{'Inseparable'}] + = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'Hebrew_Letter'}][$lb_enums{'Inseparable'}] + = $lb_actions{'LB_NOBREAK'}; - # EX × IN - $lb_table[$lb_enums{'EX'}][$lb_enums{'IN'}] = $lb_actions{'LB_NOBREAK'}; + # Exclamation × IN + $lb_table[$lb_enums{'Exclamation'}][$lb_enums{'Inseparable'}] + = $lb_actions{'LB_NOBREAK'}; # ID × IN - $lb_table[$lb_enums{'ID'}][$lb_enums{'IN'}] = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'Ideographic'}][$lb_enums{'Inseparable'}] + = $lb_actions{'LB_NOBREAK'}; # IN × IN - $lb_table[$lb_enums{'IN'}][$lb_enums{'IN'}] = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'Inseparable'}][$lb_enums{'Inseparable'}] + = $lb_actions{'LB_NOBREAK'}; # NU × IN - $lb_table[$lb_enums{'NU'}][$lb_enums{'IN'}] = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'Numeric'}][$lb_enums{'Inseparable'}] + = $lb_actions{'LB_NOBREAK'}; # LB21b Don’t break between Solidus and Hebrew letters. # SY × HL - $lb_table[$lb_enums{'SY'}][$lb_enums{'HL'}] = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'Break_Symbols'}][$lb_enums{'Hebrew_Letter'}] + = $lb_actions{'LB_NOBREAK'}; # LB21a Don't break after Hebrew + Hyphen. # HL (HY | BA) × for my $i (0 .. @lb_table - 1) { - $lb_table[$lb_enums{'HY'}][$i] += $lb_actions{'LB_HY_or_BA_then_foo'}; - $lb_table[$lb_enums{'BA'}][$i] += $lb_actions{'LB_HY_or_BA_then_foo'}; + $lb_table[$lb_enums{'Hyphen'}][$i] + += $lb_actions{'LB_HY_or_BA_then_foo'}; + $lb_table[$lb_enums{'Break_After'}][$i] + += $lb_actions{'LB_HY_or_BA_then_foo'}; } # LB21 Do not break before hyphen-minus, other hyphens, fixed-width @@ -857,10 +1125,10 @@ sub output_LB_table() { # × NS # BB × for my $i (0 .. 
@lb_table - 1) { - $lb_table[$i][$lb_enums{'BA'}] = $lb_actions{'LB_NOBREAK'}; - $lb_table[$i][$lb_enums{'HY'}] = $lb_actions{'LB_NOBREAK'}; - $lb_table[$i][$lb_enums{'NS'}] = $lb_actions{'LB_NOBREAK'}; - $lb_table[$lb_enums{'BB'}][$i] = $lb_actions{'LB_NOBREAK'}; + $lb_table[$i][$lb_enums{'Break_After'}] = $lb_actions{'LB_NOBREAK'}; + $lb_table[$i][$lb_enums{'Hyphen'}] = $lb_actions{'LB_NOBREAK'}; + $lb_table[$i][$lb_enums{'Nonstarter'}] = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'Break_Before'}][$i] = $lb_actions{'LB_NOBREAK'}; } # LB20 Break before and after unresolved CB. @@ -870,47 +1138,49 @@ sub output_LB_table() { # rules. However, the default action is to treat unresolved CB as breaking # before and after. for my $i (0 .. @lb_table - 1) { - $lb_table[$i][$lb_enums{'CB'}] = $lb_actions{'LB_BREAKABLE'}; - $lb_table[$lb_enums{'CB'}][$i] = $lb_actions{'LB_BREAKABLE'}; + $lb_table[$i][$lb_enums{'Contingent_Break'}] + = $lb_actions{'LB_BREAKABLE'}; + $lb_table[$lb_enums{'Contingent_Break'}][$i] + = $lb_actions{'LB_BREAKABLE'}; } # LB19 Do not break before or after quotation marks, such as â â â. # à QU # QU à for my $i (0 .. @lb_table - 1) { - $lb_table[$i][$lb_enums{'QU'}] = $lb_actions{'LB_NOBREAK'}; - $lb_table[$lb_enums{'QU'}][$i] = $lb_actions{'LB_NOBREAK'}; + $lb_table[$i][$lb_enums{'Quotation'}] = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'Quotation'}][$i] = $lb_actions{'LB_NOBREAK'}; } # LB18 Break after spaces # SP ÷ for my $i (0 .. @lb_table - 1) { - $lb_table[$lb_enums{'SP'}][$i] = $lb_actions{'LB_BREAKABLE'}; + $lb_table[$lb_enums{'Space'}][$i] = $lb_actions{'LB_BREAKABLE'}; } # LB17 Do not break within ââââ, even with intervening spaces. # B2 SP* à B2 - $lb_table[$lb_enums{'B2'}][$lb_enums{'B2'}] + $lb_table[$lb_enums{'Break_Both'}][$lb_enums{'Break_Both'}] = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; # LB16 Do not break between closing punctuation and a nonstarter even with # intervening spaces. 
# (CL | CP) SP* × NS - $lb_table[$lb_enums{'CL'}][$lb_enums{'NS'}] + $lb_table[$lb_enums{'Close_Punctuation'}][$lb_enums{'Nonstarter'}] = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; - $lb_table[$lb_enums{'CP'}][$lb_enums{'NS'}] + $lb_table[$lb_enums{'Close_Parenthesis'}][$lb_enums{'Nonstarter'}] = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; # LB15 Do not break within ‘”[’, even with intervening spaces. # QU SP* × OP - $lb_table[$lb_enums{'QU'}][$lb_enums{'OP'}] + $lb_table[$lb_enums{'Quotation'}][$lb_enums{'Open_Punctuation'}] = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; # LB14 Do not break after ‘[’, even after spaces. # OP SP* × for my $i (0 .. @lb_table - 1) { - $lb_table[$lb_enums{'OP'}][$i] + $lb_table[$lb_enums{'Open_Punctuation'}][$i] = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; } @@ -922,18 +1192,18 @@ sub output_LB_table() { # [^NU] × IS # [^NU] × SY for my $i (0 .. @lb_table - 1) { - $lb_table[$i][$lb_enums{'EX'}] + $lb_table[$i][$lb_enums{'Exclamation'}] = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; - next if $i == $lb_enums{'NU'}; + next if $i == $lb_enums{'Numeric'}; - $lb_table[$i][$lb_enums{'CL'}] + $lb_table[$i][$lb_enums{'Close_Punctuation'}] = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; - $lb_table[$i][$lb_enums{'CP'}] + $lb_table[$i][$lb_enums{'Close_Parenthesis'}] = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; - $lb_table[$i][$lb_enums{'IS'}] + $lb_table[$i][$lb_enums{'Infix_Numeric'}] = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; - $lb_table[$i][$lb_enums{'SY'}] + $lb_table[$i][$lb_enums{'Break_Symbols'}] = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; } @@ -941,45 +1211,45 @@ sub output_LB_table() { # spaces and hyphens. # [^SP BA HY] × GL for my $i (0 .. 
@lb_table - 1) { - next if $i == $lb_enums{'SP'} - || $i == $lb_enums{'BA'} - || $i == $lb_enums{'HY'}; + next if $i == $lb_enums{'Space'} + || $i == $lb_enums{'Break_After'} + || $i == $lb_enums{'Hyphen'}; # We don't break, but if a property above has said don't break even # with space between, don't override that (also in the next few rules) - next if $lb_table[$i][$lb_enums{'GL'}] + next if $lb_table[$i][$lb_enums{'Glue'}] == $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; - $lb_table[$i][$lb_enums{'GL'}] = $lb_actions{'LB_NOBREAK'}; + $lb_table[$i][$lb_enums{'Glue'}] = $lb_actions{'LB_NOBREAK'}; } # LB12 Do not break after NBSP and related characters. # GL × for my $i (0 .. @lb_table - 1) { - next if $lb_table[$lb_enums{'GL'}][$i] + next if $lb_table[$lb_enums{'Glue'}][$i] == $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; - $lb_table[$lb_enums{'GL'}][$i] = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'Glue'}][$i] = $lb_actions{'LB_NOBREAK'}; } # LB11 Do not break before or after Word joiner and related characters. # × WJ # WJ × for my $i (0 .. 
@lb_table - 1) { - if ($lb_table[$i][$lb_enums{'WJ'}] + if ($lb_table[$i][$lb_enums{'Word_Joiner'}] != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}) { - $lb_table[$i][$lb_enums{'WJ'}] = $lb_actions{'LB_NOBREAK'}; + $lb_table[$i][$lb_enums{'Word_Joiner'}] = $lb_actions{'LB_NOBREAK'}; } - if ($lb_table[$lb_enums{'WJ'}][$i] + if ($lb_table[$lb_enums{'Word_Joiner'}][$i] != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}) { - $lb_table[$lb_enums{'WJ'}][$i] = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'Word_Joiner'}][$i] = $lb_actions{'LB_NOBREAK'}; } } # Special case this here to avoid having to do a special case in the code, # by making this the same as other things with a SP in front of them that # don't break, we avoid an extra test - $lb_table[$lb_enums{'SP'}][$lb_enums{'WJ'}] + $lb_table[$lb_enums{'Space'}][$lb_enums{'Word_Joiner'}] = $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}; # LB9 and LB10 are done in the same loop @@ -998,27 +1268,29 @@ sub output_LB_table() { # When the CM is the first in the pair, we don't know without looking # behind whether the CM is going to inherit from an earlier character, # or not. So have to figure this out in the code - $lb_table[$lb_enums{'CM'}][$i] = $lb_actions{'LB_CM_foo'}; - - if ( $i == $lb_enums{'BK'} - || $i == $lb_enums{'ed'} - || $i == $lb_enums{'CR'} - || $i == $lb_enums{'LF'} - || $i == $lb_enums{'NL'} - || $i == $lb_enums{'SP'} - || $i == $lb_enums{'ZW'}) + $lb_table[$lb_enums{'Combining_Mark'}][$i] = $lb_actions{'LB_CM_foo'}; + + if ( $i == $lb_enums{'Mandatory_Break'} + || $i == $lb_enums{'EDGE'} + || $i == $lb_enums{'Carriage_Return'} + || $i == $lb_enums{'Line_Feed'} + || $i == $lb_enums{'Next_Line'} + || $i == $lb_enums{'Space'} + || $i == $lb_enums{'ZWSpace'}) { # For these classes, a following CM doesn't combine, and should do - # whatever 'AL' would do. - $lb_table[$i][$lb_enums{'CM'}] = $lb_table[$i][$lb_enums{'AL'}]; + # whatever 'Alphabetic' would do. 
+ $lb_table[$i][$lb_enums{'Combining_Mark'}] + = $lb_table[$i][$lb_enums{'Alphabetic'}]; } else { # For these classes, the CM combines, so doesn't break, inheriting # the type of nobreak from the master character. - if ($lb_table[$i][$lb_enums{'CM'}] + if ($lb_table[$i][$lb_enums{'Combining_Mark'}] != $lb_actions{'LB_NOBREAK_EVEN_WITH_SP_BETWEEN'}) { - $lb_table[$i][$lb_enums{'CM'}] = $lb_actions{'LB_NOBREAK'}; + $lb_table[$i][$lb_enums{'Combining_Mark'}] + = $lb_actions{'LB_NOBREAK'}; } } } @@ -1027,7 +1299,7 @@ sub output_LB_table() { # or more spaces intervene. # ZW SP* ÷ for my $i (0 .. @lb_table - 1) { - $lb_table[$lb_enums{'ZW'}][$i] = $lb_actions{'LB_BREAKABLE'}; + $lb_table[$lb_enums{'ZWSpace'}][$i] = $lb_actions{'LB_BREAKABLE'}; } # Because of LB8-10, we need to look at context for "SP x", and this must @@ -1036,24 +1308,24 @@ sub output_LB_table() { # context. By adding this action instead of replacing the existing one, # we can get back to the original rule if necessary. for my $i (0 .. @lb_table - 1) { - $lb_table[$lb_enums{'SP'}][$i] += $lb_actions{'LB_SP_foo'}; + $lb_table[$lb_enums{'Space'}][$i] += $lb_actions{'LB_SP_foo'}; } # LB7 Do not break before spaces or zero width space. # × SP # × ZW for my $i (0 .. @lb_table - 1) { - $lb_table[$i][$lb_enums{'SP'}] = $lb_actions{'LB_NOBREAK'}; - $lb_table[$i][$lb_enums{'ZW'}] = $lb_actions{'LB_NOBREAK'}; + $lb_table[$i][$lb_enums{'Space'}] = $lb_actions{'LB_NOBREAK'}; + $lb_table[$i][$lb_enums{'ZWSpace'}] = $lb_actions{'LB_NOBREAK'}; } # LB6 Do not break before hard line breaks. # × ( BK | CR | LF | NL ) for my $i (0 .. 
@lb_table - 1) { - $lb_table[$i][$lb_enums{'BK'}] = $lb_actions{'LB_NOBREAK'}; - $lb_table[$i][$lb_enums{'CR'}] = $lb_actions{'LB_NOBREAK'}; - $lb_table[$i][$lb_enums{'LF'}] = $lb_actions{'LB_NOBREAK'}; - $lb_table[$i][$lb_enums{'NL'}] = $lb_actions{'LB_NOBREAK'}; + $lb_table[$i][$lb_enums{'Mandatory_Break'}] = $lb_actions{'LB_NOBREAK'}; + $lb_table[$i][$lb_enums{'Carriage_Return'}] = $lb_actions{'LB_NOBREAK'}; + $lb_table[$i][$lb_enums{'Line_Feed'}] = $lb_actions{'LB_NOBREAK'}; + $lb_table[$i][$lb_enums{'Next_Line'}] = $lb_actions{'LB_NOBREAK'}; } # LB5 Treat CR followed by LF, as well as CR, LF, and NL as hard line breaks. @@ -1062,16 +1334,19 @@ sub output_LB_table() { # LF ! # NL ! for my $i (0 .. @lb_table - 1) { - $lb_table[$lb_enums{'CR'}][$i] = $lb_actions{'LB_BREAKABLE'}; - $lb_table[$lb_enums{'LF'}][$i] = $lb_actions{'LB_BREAKABLE'}; - $lb_table[$lb_enums{'NL'}][$i] = $lb_actions{'LB_BREAKABLE'}; + $lb_table[$lb_enums{'Carriage_Return'}][$i] + = $lb_actions{'LB_BREAKABLE'}; + $lb_table[$lb_enums{'Line_Feed'}][$i] = $lb_actions{'LB_BREAKABLE'}; + $lb_table[$lb_enums{'Next_Line'}][$i] = $lb_actions{'LB_BREAKABLE'}; } - $lb_table[$lb_enums{'CR'}][$lb_enums{'LF'}] = $lb_actions{'LB_NOBREAK'}; + $lb_table[$lb_enums{'Carriage_Return'}][$lb_enums{'Line_Feed'}] + = $lb_actions{'LB_NOBREAK'}; # LB4 Always break after hard line breaks. # BK ! for my $i (0 .. @lb_table - 1) { - $lb_table[$lb_enums{'BK'}][$i] = $lb_actions{'LB_BREAKABLE'}; + $lb_table[$lb_enums{'Mandatory_Break'}][$i] + = $lb_actions{'LB_BREAKABLE'}; } # LB2 Never break at the start of text. @@ -1081,8 +1356,8 @@ sub output_LB_table() { # but these are reversed in the loop below, so that won't break if there # is no text for my $i (0 .. 
@lb_table - 1) { - $lb_table[$i][$lb_enums{'ed'}] = $lb_actions{'LB_BREAKABLE'}; - $lb_table[$lb_enums{'ed'}][$i] = $lb_actions{'LB_NOBREAK'}; + $lb_table[$i][$lb_enums{'EDGE'}] = $lb_actions{'LB_BREAKABLE'}; + $lb_table[$lb_enums{'EDGE'}][$i] = $lb_actions{'LB_NOBREAK'}; } # LB1 Assign a line breaking class to each code point of the input. @@ -1101,24 +1376,8 @@ sub output_LB_table() { # This is done in mktables, so we never see any of the remapped-from # classes. - print $out_fh "\nstatic const U8 LB_table[$table_size][$table_size] = {\n"; - print $out_fh "\n/* 'ed' stands for 'edge' */\n"; - print $out_fh "/* "; - for my $i (0 .. @lb_table - 1) { - print $out_fh " $lb_short_enums[$i]"; - } - print $out_fh " */\n"; - - for my $i (0 .. @lb_table - 1) { - print $out_fh "/* $lb_short_enums[$i] */ "; - print $out_fh "{ "; - print $out_fh join ", ", map sprintf("%2d", $_), @{ $lb_table[$i] }; - print $out_fh " }"; - print $out_fh "," if $i < @lb_table - 1; - print $out_fh "\n"; - } - - print $out_fh "};\n"; + output_table_common('LB', \%lb_actions, + \@lb_table, \@lb_short_enums, \%lb_abbreviations); } sub output_WB_table() { @@ -1143,19 +1402,6 @@ sub output_WB_table() { WB_NU_then_MB_or_MN_or_SQ => 14, ); - # Output the #define list, sorted by numeric value - my @defines; - while (my ($enum, $value) = each %wb_actions) { - $defines[$value] = $enum; - } - - print $out_fh "\n"; - - foreach my $i (0 .. @defines - 1) { - next unless defined $defines[$i]; - print $out_fh "#define $defines[$i]\t$i\n"; - } - # Construct the WB pair table. # The table is constructed in reverse order of the rules, to make the # lower-numbered, higher priority ones override the later ones, as the @@ -1174,110 +1420,135 @@ sub output_WB_table() { # Do not break between regional indicator symbols. 
# WB13c Regional_Indicator × Regional_Indicator - $wb_table[$wb_enums{'RI'}][$wb_enums{'RI'}] = $wb_actions{'WB_NOBREAK'}; + $wb_table[$wb_enums{'Regional_Indicator'}] + [$wb_enums{'Regional_Indicator'}] = $wb_actions{'WB_NOBREAK'}; # Do not break from extenders. # WB13b ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana) - $wb_table[$wb_enums{'EX'}][$wb_enums{'LE'}] = $wb_actions{'WB_NOBREAK'}; - $wb_table[$wb_enums{'EX'}][$wb_enums{'HL'}] = $wb_actions{'WB_NOBREAK'}; - $wb_table[$wb_enums{'EX'}][$wb_enums{'NU'}] = $wb_actions{'WB_NOBREAK'}; - $wb_table[$wb_enums{'EX'}][$wb_enums{'KA'}] = $wb_actions{'WB_NOBREAK'}; + $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'ALetter'}] + = $wb_actions{'WB_NOBREAK'}; + $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Hebrew_Letter'}] + = $wb_actions{'WB_NOBREAK'}; + $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Numeric'}] + = $wb_actions{'WB_NOBREAK'}; + $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Katakana'}] + = $wb_actions{'WB_NOBREAK'}; # WB13a (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) # × # ExtendNumLet - $wb_table[$wb_enums{'LE'}][$wb_enums{'EX'}] = $wb_actions{'WB_NOBREAK'}; - $wb_table[$wb_enums{'HL'}][$wb_enums{'EX'}] = $wb_actions{'WB_NOBREAK'}; - $wb_table[$wb_enums{'NU'}][$wb_enums{'EX'}] = $wb_actions{'WB_NOBREAK'}; - $wb_table[$wb_enums{'KA'}][$wb_enums{'EX'}] = $wb_actions{'WB_NOBREAK'}; - $wb_table[$wb_enums{'EX'}][$wb_enums{'EX'}] = $wb_actions{'WB_NOBREAK'}; + $wb_table[$wb_enums{'ALetter'}][$wb_enums{'ExtendNumLet'}] + = $wb_actions{'WB_NOBREAK'}; + $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'ExtendNumLet'}] + = $wb_actions{'WB_NOBREAK'}; + $wb_table[$wb_enums{'Numeric'}][$wb_enums{'ExtendNumLet'}] + = $wb_actions{'WB_NOBREAK'}; + $wb_table[$wb_enums{'Katakana'}][$wb_enums{'ExtendNumLet'}] + = $wb_actions{'WB_NOBREAK'}; + $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'ExtendNumLet'}] + = $wb_actions{'WB_NOBREAK'}; # Do not break between Katakana. 
# WB13 Katakana × Katakana - $wb_table[$wb_enums{'KA'}][$wb_enums{'KA'}] = $wb_actions{'WB_NOBREAK'}; + $wb_table[$wb_enums{'Katakana'}][$wb_enums{'Katakana'}] + = $wb_actions{'WB_NOBREAK'}; # Do not break within sequences, such as “3.2” or “3,456.789”. # WB12 Numeric × (MidNum | MidNumLet | Single_Quote) Numeric - $wb_table[$wb_enums{'NU'}][$wb_enums{'MB'}] + $wb_table[$wb_enums{'Numeric'}][$wb_enums{'MidNumLet'}] += $wb_actions{'WB_NU_then_MB_or_MN_or_SQ'}; - $wb_table[$wb_enums{'NU'}][$wb_enums{'MN'}] + $wb_table[$wb_enums{'Numeric'}][$wb_enums{'MidNum'}] += $wb_actions{'WB_NU_then_MB_or_MN_or_SQ'}; - $wb_table[$wb_enums{'NU'}][$wb_enums{'SQ'}] + $wb_table[$wb_enums{'Numeric'}][$wb_enums{'Single_Quote'}] += $wb_actions{'WB_NU_then_MB_or_MN_or_SQ'}; # WB11 Numeric (MidNum | (MidNumLet | Single_Quote)) × Numeric - $wb_table[$wb_enums{'MB'}][$wb_enums{'NU'}] + $wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'Numeric'}] += $wb_actions{'WB_MB_or_MN_or_SQ_then_NU'}; - $wb_table[$wb_enums{'MN'}][$wb_enums{'NU'}] + $wb_table[$wb_enums{'MidNum'}][$wb_enums{'Numeric'}] += $wb_actions{'WB_MB_or_MN_or_SQ_then_NU'}; - $wb_table[$wb_enums{'SQ'}][$wb_enums{'NU'}] + $wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'Numeric'}] += $wb_actions{'WB_MB_or_MN_or_SQ_then_NU'}; # Do not break within sequences of digits, or digits adjacent to letters # (“3a”, or “A3”). 
# WB10 Numeric × (ALetter | Hebrew_Letter) - $wb_table[$wb_enums{'NU'}][$wb_enums{'LE'}] = $wb_actions{'WB_NOBREAK'}; - $wb_table[$wb_enums{'NU'}][$wb_enums{'HL'}] = $wb_actions{'WB_NOBREAK'}; + $wb_table[$wb_enums{'Numeric'}][$wb_enums{'ALetter'}] + = $wb_actions{'WB_NOBREAK'}; + $wb_table[$wb_enums{'Numeric'}][$wb_enums{'Hebrew_Letter'}] + = $wb_actions{'WB_NOBREAK'}; # WB9 (ALetter | Hebrew_Letter) × Numeric - $wb_table[$wb_enums{'LE'}][$wb_enums{'NU'}] = $wb_actions{'WB_NOBREAK'}; - $wb_table[$wb_enums{'HL'}][$wb_enums{'NU'}] = $wb_actions{'WB_NOBREAK'}; + $wb_table[$wb_enums{'ALetter'}][$wb_enums{'Numeric'}] + = $wb_actions{'WB_NOBREAK'}; + $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Numeric'}] + = $wb_actions{'WB_NOBREAK'}; # WB8 Numeric × Numeric - $wb_table[$wb_enums{'NU'}][$wb_enums{'NU'}] = $wb_actions{'WB_NOBREAK'}; + $wb_table[$wb_enums{'Numeric'}][$wb_enums{'Numeric'}] + = $wb_actions{'WB_NOBREAK'}; # Do not break letters across certain punctuation. # WB7c Hebrew_Letter Double_Quote × Hebrew_Letter - $wb_table[$wb_enums{'DQ'}][$wb_enums{'HL'}] += $wb_actions{'WB_DQ_then_HL'}; + $wb_table[$wb_enums{'Double_Quote'}][$wb_enums{'Hebrew_Letter'}] + += $wb_actions{'WB_DQ_then_HL'}; # WB7b Hebrew_Letter × Double_Quote Hebrew_Letter - $wb_table[$wb_enums{'HL'}][$wb_enums{'DQ'}] += $wb_actions{'WB_HL_then_DQ'}; + $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Double_Quote'}] + += $wb_actions{'WB_HL_then_DQ'}; # WB7a Hebrew_Letter × Single_Quote - $wb_table[$wb_enums{'HL'}][$wb_enums{'SQ'}] = $wb_actions{'WB_NOBREAK'}; + $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Single_Quote'}] + = $wb_actions{'WB_NOBREAK'}; # WB7 (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) # × (ALetter | Hebrew_Letter) - $wb_table[$wb_enums{'MB'}][$wb_enums{'LE'}] + $wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'ALetter'}] += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'}; - $wb_table[$wb_enums{'MB'}][$wb_enums{'HL'}] + 
$wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'Hebrew_Letter'}] += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'}; - $wb_table[$wb_enums{'ML'}][$wb_enums{'LE'}] + $wb_table[$wb_enums{'MidLetter'}][$wb_enums{'ALetter'}] += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'}; - $wb_table[$wb_enums{'ML'}][$wb_enums{'HL'}] + $wb_table[$wb_enums{'MidLetter'}][$wb_enums{'Hebrew_Letter'}] += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'}; - $wb_table[$wb_enums{'SQ'}][$wb_enums{'LE'}] + $wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'ALetter'}] += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'}; - $wb_table[$wb_enums{'SQ'}][$wb_enums{'HL'}] + $wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'Hebrew_Letter'}] += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'}; # WB6 (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet # | Single_Quote) (ALetter | Hebrew_Letter) - $wb_table[$wb_enums{'LE'}][$wb_enums{'MB'}] + $wb_table[$wb_enums{'ALetter'}][$wb_enums{'MidNumLet'}] += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'}; - $wb_table[$wb_enums{'HL'}][$wb_enums{'MB'}] + $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'MidNumLet'}] += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'}; - $wb_table[$wb_enums{'LE'}][$wb_enums{'ML'}] + $wb_table[$wb_enums{'ALetter'}][$wb_enums{'MidLetter'}] += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'}; - $wb_table[$wb_enums{'HL'}][$wb_enums{'ML'}] + $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'MidLetter'}] += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'}; - $wb_table[$wb_enums{'LE'}][$wb_enums{'SQ'}] + $wb_table[$wb_enums{'ALetter'}][$wb_enums{'Single_Quote'}] += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'}; - $wb_table[$wb_enums{'HL'}][$wb_enums{'SQ'}] + $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Single_Quote'}] += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'}; # Do not break between most letters. 
# WB5 (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter) - $wb_table[$wb_enums{'LE'}][$wb_enums{'LE'}] = $wb_actions{'WB_NOBREAK'}; - $wb_table[$wb_enums{'LE'}][$wb_enums{'HL'}] = $wb_actions{'WB_NOBREAK'}; - $wb_table[$wb_enums{'HL'}][$wb_enums{'LE'}] = $wb_actions{'WB_NOBREAK'}; - $wb_table[$wb_enums{'HL'}][$wb_enums{'HL'}] = $wb_actions{'WB_NOBREAK'}; + $wb_table[$wb_enums{'ALetter'}][$wb_enums{'ALetter'}] + = $wb_actions{'WB_NOBREAK'}; + $wb_table[$wb_enums{'ALetter'}][$wb_enums{'Hebrew_Letter'}] + = $wb_actions{'WB_NOBREAK'}; + $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'ALetter'}] + = $wb_actions{'WB_NOBREAK'}; + $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Hebrew_Letter'}] + = $wb_actions{'WB_NOBREAK'}; # Ignore Format and Extend characters, except when they appear at the # beginning of a region of text. # WB4 X (Extend | Format)* → X for my $i (0 .. @wb_table - 1) { - $wb_table[$wb_enums{'Ex'}][$i] = $wb_actions{'WB_Ex_or_FO_then_foo'}; - $wb_table[$wb_enums{'FO'}][$i] = $wb_actions{'WB_Ex_or_FO_then_foo'}; + $wb_table[$wb_enums{'Extend'}][$i] + = $wb_actions{'WB_Ex_or_FO_then_foo'}; + $wb_table[$wb_enums{'Format'}][$i] + = $wb_actions{'WB_Ex_or_FO_then_foo'}; } # Implied is that these attach to the character before them, except for @@ -1285,15 +1556,15 @@ sub output_WB_table() { # override the ones set up here, for all the characters that need # overriding. for my $i (0 .. @wb_table - 1) { - $wb_table[$i][$wb_enums{'Ex'}] = $wb_actions{'WB_NOBREAK'}; - $wb_table[$i][$wb_enums{'FO'}] = $wb_actions{'WB_NOBREAK'}; + $wb_table[$i][$wb_enums{'Extend'}] = $wb_actions{'WB_NOBREAK'}; + $wb_table[$i][$wb_enums{'Format'}] = $wb_actions{'WB_NOBREAK'}; } # Break before and after white space # WB3b ÷ (Newline | CR | LF) # WB3a (Newline | CR | LF) ÷ # et. al. - for my $i ('CR', 'LF', 'NL', 'hs') { + for my $i ('CR', 'LF', 'Newline', 'Perl_Tailored_HSpace') { for my $j (0 .. 
@wb_table - 1) { $wb_table[$j][$wb_enums{$i}] = $wb_actions{'WB_BREAKABLE'}; $wb_table[$wb_enums{$i}][$j] = $wb_actions{'WB_BREAKABLE'}; @@ -1303,46 +1574,34 @@ sub output_WB_table() { # But do not break within white space. # WB3 CR × LF # et.al. - for my $i ('CR', 'LF', 'NL', 'hs') { - for my $j ('CR', 'LF', 'NL', 'hs') { + for my $i ('CR', 'LF', 'Newline', 'Perl_Tailored_HSpace') { + for my $j ('CR', 'LF', 'Newline', 'Perl_Tailored_HSpace') { $wb_table[$wb_enums{$i}][$wb_enums{$j}] = $wb_actions{'WB_NOBREAK'}; } } # And do not break horizontal space followed by Extend or Format - $wb_table[$wb_enums{'hs'}][$wb_enums{'Ex'}] = $wb_actions{'WB_NOBREAK'}; - $wb_table[$wb_enums{'hs'}][$wb_enums{'FO'}] = $wb_actions{'WB_NOBREAK'}; - $wb_table[$wb_enums{'hs'}][$wb_enums{'hs'}] = $wb_actions{'WB_hs_then_hs'}; + $wb_table[$wb_enums{'Perl_Tailored_HSpace'}][$wb_enums{'Extend'}] + = $wb_actions{'WB_NOBREAK'}; + $wb_table[$wb_enums{'Perl_Tailored_HSpace'}][$wb_enums{'Format'}] + = $wb_actions{'WB_NOBREAK'}; + $wb_table[$wb_enums{'Perl_Tailored_HSpace'}] + [$wb_enums{'Perl_Tailored_HSpace'}] + = $wb_actions{'WB_hs_then_hs'}; # Break at the start and end of text. # WB2 ÷ eot # WB1 sot ÷ for my $i (0 .. @wb_table - 1) { - $wb_table[$i][$wb_enums{'ed'}] = $wb_actions{'WB_BREAKABLE'}; - $wb_table[$wb_enums{'ed'}][$i] = $wb_actions{'WB_BREAKABLE'}; + $wb_table[$i][$wb_enums{'EDGE'}] = $wb_actions{'WB_BREAKABLE'}; + $wb_table[$wb_enums{'EDGE'}][$i] = $wb_actions{'WB_BREAKABLE'}; } # But, unspecified by Unicode, we shouldn't break on an empty string. - $wb_table[$wb_enums{'ed'}][$wb_enums{'ed'}] = 0; - - print $out_fh "\nstatic const U8 WB_table[$table_size][$table_size] = {\n"; - print $out_fh "\n/* 'Ex' stands for 'Extend'; 'hs' for 'Perl_Tailored_HSpace'; 'ed' for 'edge' */\n"; - print $out_fh "/* "; - for my $i (0 .. @wb_table - 1) { - print $out_fh " $wb_short_enums[$i]"; - } - print $out_fh " */\n"; + $wb_table[$wb_enums{'EDGE'}][$wb_enums{'EDGE'}] = 0; - for my $i (0 .. 
@wb_table - 1) { - print $out_fh "/* $wb_short_enums[$i] */ "; - print $out_fh "{"; - print $out_fh join ", ", map sprintf("%2d", $_), @{ $wb_table[$i] }; - print $out_fh " }"; - print $out_fh "," if $i < @wb_table - 1; - print $out_fh "\n"; - } - - print $out_fh "};\n"; + output_table_common('WB', \%wb_actions, + \@wb_table, \@wb_short_enums, \%wb_abbreviations); } output_invlist("Latin1", [ 0, 256 ]); @@ -1442,6 +1701,7 @@ for my $charset (get_supported_code_pages()) { my $to_adjust; if ($is_local_sub) { @invlist = eval $lookup_prop; + die $@ if $@; } else { @invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok'); -- Perl5 Master Repository