On Fri, Feb 17, 2012 at 11:43 AM, Orlovsky Alexander <[email protected]> wrote: > Я тут себе IT-бложик завел, заодно, кроме всего прочего, опубликовал там > текст на основе моего декабрьского доклада на Saint Perl "Unicode. Ликбез": > http://nordicdyno.github.com/blog/2012/02/17/unicode-basics/
Кстати, может быть тут не все п5п читают, коротко о поддержке юникода в перл: On Mon, Feb 20, 2012 at 7:27 PM, Tom Christiansen <[email protected]> wrote: > Inspired by how scandalously Unicode-deficient the > otherwise fine 4-way polyglot table comparing PHP, Perl, > Python, and Ruby is at > > http://hyperpolyglot.org/scripting > > I created a quick Unicode cheatsheet for Perl, mostly by > mining the examples in the new 4th edition of the Camel. > > Gee, I foresee a *whole* lot of "impossibles" in the > other three languages' columns, don't you? :) > > Hm, have I left anything out that Perl is especially cool with? > > I almost wonder whether this sort of thing oughtn't be a manpage, > something like perluni{ref,cheat,quick}? > > --tom > > =Characters and their numbers > > # ASCII > ord("A") > chr(65) > > # BMP > ord("Σ") > chr(0x3A3) > > # beyond the BMP > ord("𝑛") > chr(0x1D45B) > > # beyond Unicode (up to MAXINT) > ord("\x{20_0000}") > chr(0x20_0000) > > =Unicode literals by character number > > String: "\x{3a3}" > Regex: /\x{3a3}/ > > String: "\x{1d45b}" > Regex: /\x{1d45b}/ > > # even non-BMP ranges in regex work fine > /[\x{1D434}-\x{1D467}]/ > > =Get character name by number > > use charnames (); > my $name = charnames::viacode(0x03A3); > > =Get character number by name > > use charnames (); > my $number = charnames::vianame("GREEK CAPITAL LETTER SIGMA"); > > =Unicode named characters > > use charnames qw(:full :short greek); > > "\N{MATHEMATICAL ITALIC SMALL N}" > "\N{GREEK CAPITAL LETTER SIGMA}" > "\N{Greek:Sigma}" > "\N{epsilon}" > > =Unicode named sequences > > use charnames qw(:full); > my $seq = "\N{LATIN CAPITAL LETTER A WITH MACRON AND GRAVE}"; > printf "U+%v04X\n", $seq; > U+0100.0300 > > =Custom named characters > > use charnames ":full", ":alias" => { > ecute => "LATIN SMALL LETTER E WITH ACUTE", > "APPLE LOGO" => 0xF8FF, # private use character > }; > > "\N{ecute}" > "\N{APPLE LOGO}" > > =Declare source in utf8 for identifiers and literals > > 
use utf8; > > my $measure = "Ångström"; > my @μsoft = qw( cp852 cp1251 cp1252 ); > my @ὑπέρμεγας = qw( ὑπέρ μεγας ); > my @鯉 = qw( koi8-f koi8-u koi8-r ); > > =Unicode casing > > uc("henry ⅷ") # "HENRY Ⅷ" > uc("tschüß") # "TSCHÜSS" > > # both are true: > "tschüß" =~ /TSCHÜSS/i > "Σίσυφος" =~ /ΣΊΣΥΦΟΣ/i > > =Unicode case-insensitive comparisons > > use utf8; > use feature "fc"; # fc() function is from v5.16 > > # sort case-insensitively > my @sorted = sort { fc($a) cmp fc($b) } @list; > > # both are true: > fc("tschüß") eq fc("TSCHÜSS") > fc("Σίσυφος") eq fc("ΣΊΣΥΦΟΣ") > > =Match Unicode linebreak sequence in regex > > \R > > s/\R/\n/g; # normalize all linebreaks to \n > > =Match Unicode properties in regex with \p, \P > > \pL, \pN, \pS, \pP, \pM, \pZ, \pC > \p{Sk}, \p{Ps}, \p{Lt} > \p{alpha}, \p{upper}, \p{lower} > \p{Latin}, \p{Greek} > \p{script=Latin}, \p{script=Greek} > \p{East_Asian_Width=Wide}, \p{EA=W} > \p{Line_Break=Hyphen}, \p{LB=HY} > \p{Numeric_Value=4}, \p{NV=4} > > =Custom character properties > > # using private-use characters > sub In_Tengwar { "E000\tE07F\n" } > > if (/\p{In_Tengwar}/) { ... } > > # blending existing properties > sub Is_GraecoRoman_Title {<<'END_OF_SET'} > +utf8::IsLatin > +utf8::IsGreek > &utf8::IsTitle > END_OF_SET > > if (/\p{Is_GraecoRoman_Title}/) { ... } > > =Get character category > > use Unicode::UCD qw(charinfo); > my $cat = charinfo(0x3A3)->{category}; # "Lu" > > =Convert non-ASCII Unicode numerics > > # from v5.12 > use Unicode::UCD qw(num); > if (/(\d+|\N)/) { # not just ASCII! 
> $nv = num($1); > } > > use charnames qw(:full); > my $nv = num("\N{RUMI DIGIT ONE}\N{RUMI DIGIT TWO}"); > > =Match Unicode grapheme cluster in regex > > \X > > # match and grab five first graphemes > my($first_five) = /^(\X{5})/; > > # Find vowel plus any diacritics > use Unicode::Normalize; > my $nfd = NFD($orig); > $nfd =~ /(?=[aeiou])\X/i > > =Reverse string by grapheme > > $str = join("", reverse $str =~ /\X/g); > > # OR: cpan -i Unicode::GCString > use Unicode::GCString; > $str = reverse Unicode::GCString->new($str); > > =String length in graphemes > > my $count = 0; > while ($str =~ /\X/g) { $count++ } > > # OR: cpan -i Unicode::GCString > use Unicode::GCString; > $gcs = Unicode::GCString->new($str); > my $count = $gcs->length; > > =Substring by grapheme > > # cpan -i Unicode::GCString > use Unicode::GCString; > $gcs = Unicode::GCString->new($str); > my $piece = $gcs->substr(5, 5); > > =Unicode column-width for printing > > # cpan -i Unicode::GCString > use Unicode::GCString; > $gcs = Unicode::GCString->new($str); > my $cols = $gcs->columns; > printf "%*s\n", $cols, $str; > > =Unicode normalization > > use Unicode::Normalize; > my $nfd = NFD($orig); > my $nfc = NFC($orig); > my $nfkd = NFKD($orig); > my $nfkc = NFKC($orig); > > =Unicode collation > > use Unicode::Collate; > my $col = Unicode::Collate->new(); > my @list = $col->sort(@old_list); > > =Case- *and* accent-insensitive Unicode sort > > use Unicode::Collate; > my $col = Unicode::Collate->new(level => 1); > my @list = $col->sort(@old_list); > > =Unicode locale collation > > # either use v5.12, OR: cpan -i Unicode::Collate::Locale > use Unicode::Collate::Locale; > my $col = Unicode::Collate::Locale->new(locale => "de__phonebook"); > my @list = $col->sort(@old_list); > > =Case- *and* accent-insensitive comparisons > > use utf8; > use Unicode::Collate; > my $coll = Unicode::Collate->new( > level => 1, > normalization => undef > ); > > # now both are true: > $coll->eq("García", "GARCIA" ); > 
$coll->eq("Márquez", "MARQUEZ"); > > =Unicode linebreaking > > # cpan -i Unicode::LineBreak > use Unicode::LineBreak; > use charnames qw(:full); > > my $para = "This is a super\N{HYPHEN}long string. " x 20; > my $fmt = new Unicode::LineBreak; > print $fmt->break($para), "\n"; > > =Declare std streams to be utf8 > > $ perl -CS ... > or > $ export PERL_UNICODE=S > or > use open qw(:std :utf8); > or > binmode(STDIN, ":utf8"); > binmode(STDOUT, ":utf8"); > binmode(STDERR, ":utf8"); > > =Make I/O default to utf8 > > $ perl -CSD ... > or > $ export PERL_UNICODE=SD > or > use open qw(:std :utf8); > > =Open file with implicit encode/decode > > # input file > open(my $in_file, "< :encoding(UTF-16)", "wintext"); > OR > open(my $in_file, "<", "wintext"); > binmode($in_file, ":encoding(UTF-16)"); > THEN > my $line = <$in_file>; > > # output file > open(my $out_file, "> :encoding(cp1252)", "wintext"); > OR > open(my $out_file, ">", "wintext"); > binmode($out_file, ":encoding(cp1252)"); > THEN > print $out_file "some text\n"; > > =Explicit encode/decode [rarely needed, see previous] > > use Encode qw(encode decode); > > my $chars = decode("shiftjis", $bytes); > OR > my $bytes = encode("MIME-Header-ISO_2022_JP", $chars); -- Moscow.pm mailing list [email protected] | http://moscow.pm.org
