Author: simon
Date: Sat Jan 24 04:36:25 2009
New Revision: 35956

Added:
   branches/strings/pseudocode/t/recode.t   (contents, props changed)
Modified:
   branches/strings/pseudocode/Encodings.pm
   branches/strings/pseudocode/ParrotString.pm
   branches/strings/pseudocode/t/create.t

Log:
Some bug fixes, and now we have UTF8->NFG->UTF8 round-tripping.


Modified: branches/strings/pseudocode/Encodings.pm
==============================================================================
--- branches/strings/pseudocode/Encodings.pm    (original)
+++ branches/strings/pseudocode/Encodings.pm    Sat Jan 24 04:36:25 2009
@@ -42,25 +42,52 @@
         if 191 < $c < 224 { return 2 }
         return 3
     }
+    sub _bytes_needed($c) {
+        if $c < 0x80 { return 1 }
+        if $c < 0x0800 { return 2 }
+        return 3;
+    }
     sub char_at_byteoffset ($str, $offset is rw) { # Private helper
+        if ($offset > $str.strlen) { Parrot_debug_string($str); die "BUG: 
Asked for a byte "~$offset~" that's not there" };
         my $c = $str.buffer.[$offset++];
         if 191 < $c < 224 {
-            # XXX Guard
+            if ($offset + 1 > $str.strlen) { die "BUG: UTF8 encoding ran off 
end of string" }
             $c = (($c +& 31) +< 6) +| ( $str.buffer.[$offset++] +& 63 );
         } elsif $c >= 224 {
-            # XXX Guard
+            if ($offset + 2 > $str.strlen) { die "BUG: UTF8 encoding ran off 
end of string" }
             $c = (($c +& 15) +< 12) 
                 +| (( $str.buffer.[$offset++] +& 63 ) +< 6);
             $c  +|= $str.buffer.[$offset++] +& 63;
         }
         return $c;
     }
+
+    method append_char($str, $c) {
+        $str.bufused += _bytes_needed($c);
+        $str.strlen  += _bytes_needed($c);
+        if ($c < 0x80) {
+            push $str.buffer, $c;
+        } elsif ($c < 0x0800) {
+            push $str.buffer, $c +> 6 +| 0xc0;
+            push $str.buffer, $c +& 0x3f +| 0x80;
+        } else {
+            push $str.buffer, $c +> 12 +| 0xe0;
+            push $str.buffer, $c +> 6 +& 0x3f +| 0x80;
+            push $str.buffer, $c +& 0x3f +| 0x80;
+        }
+    }
+
+    method append_grapheme($str, $g) { 
+        for (@($g)) { self.append_char($str, $_) }
+    }
+
     method string_char_iterate ($str, $callback, $parameter) {
         my $index = 0;
-        while ($index < $str.bufused-1) {
+        while ($index < $str.bufused) {
             $callback(char_at_byteoffset($str, $index), $parameter);
         }
     }
+
     method string_grapheme_iterate ($str, $callback, $parameter) {
         if ($str.charset !~~ ParrotCharset::Unicode) {
             # Although why you'd store non-Unicode in UTF8 is beyond me
@@ -71,7 +98,7 @@
         }
         # Collect characters into graphemes in a roughly O(n) way...
         my $index = 0;
-        while ($index < $str.bufused-1) {
+        while ($index < $str.bufused) {
             my $c = char_at_byteoffset($str, $index);
 
             # If we're the last character, do the callback and give up
@@ -83,7 +110,7 @@
             my $next_char;
             my $nc_index = $index;
             my $end_of_grapheme_sequence = $index;
-            while ($nc_index <= $str.bufused and
+            while ($nc_index < $str.bufused and
                    $next_char = char_at_byteoffset($str, $nc_index)
                    and ParrotCharset::Unicode::is_combining($next_char)) {
                    $end_of_grapheme_sequence = $nc_index;
@@ -113,13 +140,13 @@
 
     method setup($str) { $str.normalization = ParrotNormalization::NFG.new(); }
     method append_grapheme ($str, $g) {
-        my $item;
         if (@($g) > 1) {
+            my $item;
             $item = $str.normalization.get_grapheme_table_entry(@($g));
+            $str.buffer.push($item);
         } else {
-            ($item) = @($g);
+            $str.buffer.push(@( $g ));
         }
-        $str.buffer.push($item);
         $str.bufused++;
         $str.strlen++;
     }
@@ -146,7 +173,7 @@
         }
         my $c = $str.buffer[$index];
         if $c >= 0 { return [ $c ]; }
-        return $str.normalization.grapheme_table.[-$c];
+        return $str.normalization.grapheme_table.[-$c - 1];
         # We are allowed to be pally with the normalization internals
         # because NFG is specific to ParrotEncoding.
     }

Modified: branches/strings/pseudocode/ParrotString.pm
==============================================================================
--- branches/strings/pseudocode/ParrotString.pm (original)
+++ branches/strings/pseudocode/ParrotString.pm Sat Jan 24 04:36:25 2009
@@ -100,3 +100,29 @@
 sub Parrot_string_grapheme_chopn($src, $count) { 
     return Parrot_string_replace($src, Parrot_string_grapheme_length($src) - 
$count, $count, undef);
 }
+
+sub Parrot_debug_string($src) {
+    say "String charset: "~$src.charset;
+    say "String encoding: "~$src.encoding;
+    say "String normalization: "~$src.normalization;
+    say "String buffer used: "~$src.bufused;
+    say "String length: "~$src.strlen;
+    say "String buffer contents: ";
+    for ( $src.buffer) { print " ["~$_~"]"; }
+    say "";
+}
+
+sub Parrot_string_byte_equal($one, $two) {
+    if ($one.strlen != $two.strlen) { return 0; }
+    for (0 .. $one.strlen-1) {
+        if ($one.buffer.[$_] != $two.buffer.[$_]) { 
+            say "Oops, byte "~$_~" differed";
+            return 0 
+        }
+    }
+    return 1;
+}
+sub Parrot_string_character_equal($one, $two) {
+    say "Not implemented yet";
+    return 0;
+}

Modified: branches/strings/pseudocode/t/create.t
==============================================================================
--- branches/strings/pseudocode/t/create.t      (original)
+++ branches/strings/pseudocode/t/create.t      Sat Jan 24 04:36:25 2009
@@ -1,6 +1,6 @@
 use Test;
 use ParrotString;
-plan 11;
+plan 10;
 
 my $str = Parrot_string_new_init("flurble", 4, ParrotCharset::ASCII, 
ParrotEncoding::Byte);
 ok($str.charset ~~ ParrotCharset::ASCII, "Charset set properly");
@@ -16,9 +16,3 @@
 is(Parrot_string_byte_length($str), 28, "String byte length correct");
 is(Parrot_string_length($str), 15, "UTF8 char length correct");
 is(Parrot_string_index($str, 3), 0x3ac, "UTF8 string indexing");
-
-# The standard NFG example...
-$str = Parrot_string_new_init("ABC \xd0\xb8\xcc\x8f", 8, 
ParrotCharset::Unicode, ParrotEncoding::UTF8);
-my $str2 = Parrot_string_new_init("", 0, ParrotCharset::Unicode, 
ParrotEncoding::ParrotNative);
-Parrot_string_grapheme_copy($str, $str2);
-is(Parrot_string_grapheme_length($str2), 5, "Four UTF8 bytes = one grapheme");

Added: branches/strings/pseudocode/t/recode.t
==============================================================================
--- (empty file)
+++ branches/strings/pseudocode/t/recode.t      Sat Jan 24 04:36:25 2009
@@ -0,0 +1,15 @@
+use Test;
+use ParrotString;
+plan 4;
+
+# The standard NFG example...
+my $str = Parrot_string_new_init("ABC \xd0\xb8\xcc\x8f", 8, 
ParrotCharset::Unicode, ParrotEncoding::UTF8);
+my $str2 = Parrot_string_new_init("", 0, ParrotCharset::Unicode, 
ParrotEncoding::ParrotNative);
+Parrot_string_grapheme_copy($str, $str2);
+is(Parrot_string_grapheme_length($str2), 5, "Four UTF8 bytes = one grapheme");
+my $str3 = Parrot_string_new_init("", 0, ParrotCharset::Unicode, 
ParrotEncoding::UTF8);
+
+Parrot_string_grapheme_copy($str2, $str3);
+ok(Parrot_string_byte_equal($str, $str3), "Round-tripping UTF8" );
+ok(Parrot_string_character_equal($str, $str3), "Character equivalence for 
UTF8" );
+ok(Parrot_string_character_equal($str2, $str3), "Character equivalence between 
UTF8 and NFG" );

Reply via email to