Author: simon Date: Sat Jan 24 04:36:25 2009 New Revision: 35956 Added: branches/strings/pseudocode/t/recode.t (contents, props changed) Modified: branches/strings/pseudocode/Encodings.pm branches/strings/pseudocode/ParrotString.pm branches/strings/pseudocode/t/create.t
Log: Some bug fixes, and now we have UTF8->NFG->UTF8 round-tripping. Modified: branches/strings/pseudocode/Encodings.pm ============================================================================== --- branches/strings/pseudocode/Encodings.pm (original) +++ branches/strings/pseudocode/Encodings.pm Sat Jan 24 04:36:25 2009 @@ -42,25 +42,52 @@ if 191 < $c < 224 { return 2 } return 3 } + sub _bytes_needed($c) { + if $c < 0x80 { return 1 } + if $c < 0x0800 { return 2 } + return 3; + } sub char_at_byteoffset ($str, $offset is rw) { # Private helper + if ($offset > $str.strlen) { Parrot_debug_string($str); die "BUG: Asked for a byte "~$offset~" that's not there" }; my $c = $str.buffer.[$offset++]; if 191 < $c < 224 { - # XXX Guard + if ($offset + 1 > $str.strlen) { die "BUG: UTF8 encoding ran off end of string" } $c = (($c +& 31) +< 6) +| ( $str.buffer.[$offset++] +& 63 ); } elsif $c >= 224 { - # XXX Guard + if ($offset + 2 > $str.strlen) { die "BUG: UTF8 encoding ran off end of string" } $c = (($c +& 15) +< 12) +| (( $str.buffer.[$offset++] +& 63 ) +< 6); $c +|= $str.buffer.[$offset++] +& 63; } return $c; } + + method append_char($str, $c) { + $str.bufused += _bytes_needed($c); + $str.strlen += _bytes_needed($c); + if ($c < 0x80) { + push $str.buffer, $c; + } elsif ($c < 0x0800) { + push $str.buffer, $c +> 6 +| 0xc0; + push $str.buffer, $c +& 0x3f +| 0x80; + } else { + push $str.buffer, $c +> 12 +| 0xe0; + push $str.buffer, $c +> 6 +& 0x3f +| 0x80; + push $str.buffer, $c +& 0x3f +| 0x80; + } + } + + method append_grapheme($str, $g) { + for (@($g)) { self.append_char($str, $_) } + } + method string_char_iterate ($str, $callback, $parameter) { my $index = 0; - while ($index < $str.bufused-1) { + while ($index < $str.bufused) { $callback(char_at_byteoffset($str, $index), $parameter); } } + method string_grapheme_iterate ($str, $callback, $parameter) { if ($str.charset !~~ ParrotCharset::Unicode) { # Although why you'd store non-Unicode in UTF8 is beyond me @@ -71,7 +98,7 @@ } # Collect characters into graphemes in a roughly O(n) way... my $index = 0; - while ($index < $str.bufused-1) { + while ($index < $str.bufused) { my $c = char_at_byteoffset($str, $index); # If we're the last character, do the callback and give up @@ -83,7 +110,7 @@ my $next_char; my $nc_index = $index; my $end_of_grapheme_sequence = $index; - while ($nc_index <= $str.bufused and + while ($nc_index < $str.bufused and $next_char = char_at_byteoffset($str, $nc_index) and ParrotCharset::Unicode::is_combining($next_char)) { $end_of_grapheme_sequence = $nc_index; @@ -113,13 +140,13 @@ method setup($str) { $str.normalization = ParrotNormalization::NFG.new(); } method append_grapheme ($str, $g) { - my $item; if (@($g) > 1) { + my $item; $item = $str.normalization.get_grapheme_table_entry(@($g)); + $str.buffer.push($item); } else { - ($item) = @($g); + $str.buffer.push(@( $g )); } - $str.buffer.push($item); $str.bufused++; $str.strlen++; } @@ -146,7 +173,7 @@ } my $c = $str.buffer[$index]; if $c >= 0 { return [ $c ]; } - return $str.normalization.grapheme_table.[-$c]; + return $str.normalization.grapheme_table.[-$c - 1]; # We are allowed to be pally with the normalization internals # because NFG is specific to ParrotEncoding. } Modified: branches/strings/pseudocode/ParrotString.pm ============================================================================== --- branches/strings/pseudocode/ParrotString.pm (original) +++ branches/strings/pseudocode/ParrotString.pm Sat Jan 24 04:36:25 2009 @@ -100,3 +100,29 @@ sub Parrot_string_grapheme_chopn($src, $count) { return Parrot_string_replace($src, Parrot_string_grapheme_length($src) - $count, $count, undef); } + +sub Parrot_debug_string($src) { + say "String charset: "~$src.charset; + say "String encoding: "~$src.encoding; + say "String normalization: "~$src.normalization; + say "String buffer used: "~$src.bufused; + say "String length: "~$src.strlen; + say "String buffer contents: "; + for ( $src.buffer) { print " ["~$_~"]"; } + say ""; +} + +sub Parrot_string_byte_equal($one, $two) { + if ($one.strlen != $two.strlen) { return 0; } + for (0 .. $one.strlen-1) { + if ($one.buffer.[$_] != $two.buffer.[$_]) { + say "Oops, byte "~$_~" differed"; + return 0 + } + } + return 1; +} +sub Parrot_string_character_equal($one, $two) { + say "Not implemented yet"; + return 0; +} Modified: branches/strings/pseudocode/t/create.t ============================================================================== --- branches/strings/pseudocode/t/create.t (original) +++ branches/strings/pseudocode/t/create.t Sat Jan 24 04:36:25 2009 @@ -1,6 +1,6 @@ use Test; use ParrotString; -plan 11; +plan 10; my $str = Parrot_string_new_init("flurble", 4, ParrotCharset::ASCII, ParrotEncoding::Byte); ok($str.charset ~~ ParrotCharset::ASCII, "Charset set properly"); @@ -16,9 +16,3 @@ is(Parrot_string_byte_length($str), 28, "String byte length correct"); is(Parrot_string_length($str), 15, "UTF8 char length correct"); is(Parrot_string_index($str, 3), 0x3ac, "UTF8 string indexing"); - -# The standard NFG example... -$str = Parrot_string_new_init("ABC \xd0\xb8\xcc\x8f", 8, ParrotCharset::Unicode, ParrotEncoding::UTF8); -my $str2 = Parrot_string_new_init("", 0, ParrotCharset::Unicode, ParrotEncoding::ParrotNative); -Parrot_string_grapheme_copy($str, $str2); -is(Parrot_string_grapheme_length($str2), 5, "Four UTF8 bytes = one grapheme"); Added: branches/strings/pseudocode/t/recode.t ============================================================================== --- (empty file) +++ branches/strings/pseudocode/t/recode.t Sat Jan 24 04:36:25 2009 @@ -0,0 +1,15 @@ +use Test; +use ParrotString; +plan 4; + +# The standard NFG example... +my $str = Parrot_string_new_init("ABC \xd0\xb8\xcc\x8f", 8, ParrotCharset::Unicode, ParrotEncoding::UTF8); +my $str2 = Parrot_string_new_init("", 0, ParrotCharset::Unicode, ParrotEncoding::ParrotNative); +Parrot_string_grapheme_copy($str, $str2); +is(Parrot_string_grapheme_length($str2), 5, "Four UTF8 bytes = one grapheme"); +my $str3 = Parrot_string_new_init("", 0, ParrotCharset::Unicode, ParrotEncoding::UTF8); + +Parrot_string_grapheme_copy($str2, $str3); +ok(Parrot_string_byte_equal($str, $str3), "Round-tripping UTF8" ); +ok(Parrot_string_character_equal($str, $str3), "Character equivalence for UTF8" ); +ok(Parrot_string_character_equal($str2, $str3), "Character equivalence between UTF8 and NFG" );