> Has anyone had a look at the OpenI18N/ICU locale data? > > The locales there are all UTF-8 and have java rule based collation data, so > they *might* be useful for creating a more comprehensive (and accurate) set > of sort modules? The downside is this data is pretty rough ATM but does > seem to be improving slowly. > > I guess p6 is going to use ICU as the basis for I18N - sure hope the APIs > are easier though :)
The syntax of collation customization (tailoring) in ICU
( http://oss.software.ibm.com/icu/userguide/Collate_Customization.html )
is character-based and may be more intuitive:

for French:
  "[backwards 2]&A << \u00e6/e <<< \u00c6/E"
for Spanish:
  "&N < n\u0303 <<< N\u0303"
  "&C < ch <<< Ch <<< CH"
  "&l < ll <<< Ll <<< LL"

However Unicode::Collate also allows linguistic tailoring.
Certainly its interface requires hard code of weights
and may be less user-friendly.

#!perl
# Test script: Spanish and French tailoring with Unicode::Collate.
# For each language a collator is built from hand-written weight entries,
# then a word list that is already in the expected order is checked two ways:
#   1. every adjacent pair must satisfy $collator->lt(a, b);
#   2. a scrambled copy of the list must sort back to the original order.
use strict;
use warnings;
use utf8;    # the word lists and entry comments below contain non-ASCII literals
use Unicode::Collate;

# Filled in the BEGIN block below so the test plan can be computed from them.
our (@listEs, @listFr);

# Spanish: "ch", "ll" and n + U+0303 are given their own weight entries.
# The weights are specific to the allkeys.txt version named in the comment;
# the commented-out table is the alternative for the older version.
my $objEs = Unicode::Collate->new(
  entry => <<'ENTRY', # for allkeys-4.0.0.txt
0063 0068 ; [.0E6A.0020.0002.0063] # ch
0043 0068 ; [.0E6A.0020.0007.0043] # Ch
0043 0048 ; [.0E6A.0020.0008.0043] # CH
006C 006C ; [.0F4C.0020.0002.006C] # ll
004C 006C ; [.0F4C.0020.0007.004C] # Ll
004C 004C ; [.0F4C.0020.0008.004C] # LL
006E 0303 ; [.0F69.0020.0002.006E] # ñ
004E 0303 ; [.0F69.0020.0008.004E] # Ñ
ENTRY
# entry => <<'ENTRY', # for allkeys-3.1.1.txt
#0063 0068 ; [.0A46.0020.0002.0063] # ch
#0043 0068 ; [.0A46.0020.0007.0043] # Ch
#0043 0048 ; [.0A46.0020.0008.0043] # CH
#006C 006C ; [.0B1C.0020.0002.006C] # ll
#004C 006C ; [.0B1C.0020.0007.004C] # Ll
#004C 004C ; [.0B1C.0020.0008.004C] # LL
#006E 0303 ; [.0B38.0020.0002.006E] # ñ
#004E 0303 ; [.0B38.0020.0008.004E] # Ñ
#ENTRY
);

# French: level 2 is compared backwards (the counterpart of ICU's
# "[backwards 2]" quoted above), and U+00E6 / U+00C6 each get two
# collation elements.
my $objFr = Unicode::Collate->new(
  backwards => 2,
  entry => <<'ENTRY', # for allkeys-4.0.0.txt
00E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae
00C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE
ENTRY
# entry => <<'ENTRY', # for allkeys-3.1.1.txt
#00E6 ; [.0A15.0020.0002.00E6][.0A65.0020.0002.00E6] # ae
#00C6 ; [.0A15.0020.0008.00C6][.0A65.0020.0008.00C6] # AE
#ENTRY
);

BEGIN {
  # NOTE(review): the archived copy of this message had every non-ASCII
  # letter replaced by U+FFFD. The accented letters in both lists were
  # reconstructed from the words and from the order the test expects;
  # confirm against the original post if exact fidelity matters.

  # Spanish words, already in the expected collation order.
  @listEs = qw(
    cambio camelo camella camello Camerún cielo curso chico chile
    Chile CHILE chocolate espacio espanto español esperanza lama
    líquido luz llama Llama LLAMA llamar nos nueve ñu ojo
  );

  # French words, already in the expected collation order. The last two
  # contain a space / hyphen and are therefore listed outside qw().
  @listFr = (
    qw(
      cadurcien cæcum cÆCUM CæCUM CÆCUM caennais cæsium cafard coercitif
      cote côte Côte coté Coté côté Côté coter élève élevé gène gêne
      MÂCON maçon pèche PÈCHE pêche PÊCHE péché PÉCHÉ pécher pêcher
      relève relevé révèle révélé surélévation sûrement suréminent sûreté
      vice-consul vicennal vice-président vice-roi vicésimal),
    "vice versa", "vice-versa",
  );

  use Test;
  # Per language: ($#list) adjacent-pair checks + 2 whole-list checks.
  plan tests => $#listEs + 2 + $#listFr + 2;
}

# randomize(LIST)
# Returns the elements of LIST in whatever order Perl's hash hands its keys
# back. This is not a real shuffle, and duplicate elements collapse into one
# because they become the same hash key.
sub randomize {
  my %hash;
  @hash{@_} = ();    # hash slice: each argument becomes a key
  return keys %hash;
} # ?!

# Every word must collate strictly before its successor.
for my $i (0 .. $#listEs - 1) {
  ok($objEs->lt($listEs[$i], $listEs[$i+1]));
}

for my $i (0 .. $#listFr - 1) {
  ok($objFr->lt($listFr[$i], $listFr[$i+1]));
}

# The scrambled list must differ from the original, and sorting it with the
# tailored collator must restore the original order.
our @randEs = randomize(@listEs);
our @sortEs = $objEs->sort(@randEs);
ok("@randEs" ne "@listEs");
ok("@sortEs" eq "@listEs");

our @randFr = randomize(@listFr);
our @sortFr = $objFr->sort(@randFr);
ok("@randFr" ne "@listFr");
ok("@sortFr" eq "@listFr");

__END__

Regards,
SADAHIRO Tomoyuki
