MHonArc CharEnt.pm,1.8,1.9

Earl Hood <[EMAIL PROTECTED]> Mon, 02 Dec 2002 16:10:13 -0800

Update of /cvsroot/mhonarc/mhonarc/MHonArc/lib/MHonArc
In directory subversions:/tmp/cvs-serv28809/lib/MHonArc


Modified Files:
        CharEnt.pm 
Log Message:
+ Added Korean character set support to MHonArc::CharEnt.
+ Added a few more charset aliases.


Index: CharEnt.pm
===================================================================
RCS file: /cvsroot/mhonarc/mhonarc/MHonArc/lib/MHonArc/CharEnt.pm,v
retrieving revision 1.8
retrieving revision 1.9
diff -C2 -r1.8 -r1.9
*** CharEnt.pm  1 Dec 2002 03:50:52 -0000       1.8
--- CharEnt.pm  3 Dec 2002 00:09:01 -0000       1.9
***************
*** 66,72 ****
  ##      The CHARSETALIASES resource can be used to map aka names (aliases)
  ##      to the names used here.
- ##
  ##  NOTE: UTF-8 does not require a map since UTF-8 is decoded straight
  ##      to &#xHHHH; entity references.
  
  my %CharsetMaps = (
--- 66,73 ----
  ##      The CHARSETALIASES resource can be used to map aka names (aliases)
  ##      to the names used here.
  ##  NOTE: UTF-8 does not require a map since UTF-8 is decoded straight
  ##      to &#xHHHH; entity references.
+ ##  NOTE: iso-2022-{jp,kr} are translated to euc-{jp,kr} first before
+ ##      conversion.
  
  my %CharsetMaps = (
***************
*** 87,90 ****
--- 88,92 ----
      'iso-8859-16'    =>       'MHonArc/CharEnt/ISO8859_16.pm',
      'cp866'        => 'MHonArc/CharEnt/CP866.pm',
+     'cp949'        => 'MHonArc/CharEnt/CP949.pm', # euc-kr
      'cp936'        => 'MHonArc/CharEnt/CP950.pm', # GBK
      'cp950'        => 'MHonArc/CharEnt/CP950.pm', # Big5
***************
*** 148,201 ****
      my $charset  = lc shift;
      $charset =~ tr/_/-/;
-     my($char, $entstr);
  
      # UTF-8 can be converted algorithmically.
      if ($charset eq 'utf-8') {
!       my($i, $n, $mask);
!       # We do not do full compliant UTF-8 parsing: malformed sequences
!       # will end up being treated as individual octets replaced with the
!       # '?' sign.
!       $data =~ s{([\x00-\x7F]|
!                   [\xC0-\xDF][\x80-\xBF]|
!                    \xE0      [\xA0-\xBF][\x80-\xBF]|
!                   [\xE1-\xEF][\x80-\xBF]{2}|
!                    \xF0      [\x90-\xBF][\x80-\xBF]{2}|
!                   [\xF1-\xF7][\x80-\xBF]{3}|
!                    \xF8      [\x88-\xBF][\x80-\xBF]{3}|
!                   [\xF9-\xFB][\x80-\xBF]{4}|
!                    \xFC      [\x84-\xBF][\x80-\xBF]{4}|
!                    \xFD      [\x80-\xBF]{5}|
!                   .)
!                 }{
!                     if (($n = length($1)) == 1) {
!                         $char = unpack('C',$1);
!                         if ($char <= 0x7F) {
!                             $ASCIIMap{$char}
!                                 ? join('', '&', $ASCIIMap{$char}, ';')
!                                 : pack('C', $char);
!                         } else {
!                           '?';
!                         }
!                     } else {
!                         for ($mask=0x1, $i=$n; $i < 6; ++$i) {
!                             $mask = ($mask << 1) | 0x1;
!                         }
!                         $char = (unpack('C',substr($1,0,1)) & $mask) <<
!                                 ($n-1)*6;
!                         for ($i=1; $i < $n; ++$i) {
!                             $char |= ((unpack('C',substr($1,$i,1)) & 0x3F) <<
!                                      (($n-$i-1)*6))
!                         }
!                         sprintf('&#x%X;',$char);
!                     }
!                  }gxe;
! 
        return $data;
      }
  
!     # If iso-2022-jp, convert to euc-jp first
      if ($charset eq 'iso-2022-jp') {
        _jp_2022_to_euc(\$data);
        $charset = 'euc-jp';
      }
  
--- 150,169 ----
      my $charset  = lc shift;
      $charset =~ tr/_/-/;
  
      # UTF-8 can be converted algorithmically.
      if ($charset eq 'utf-8') {
!       _utf8_to_sgml(\$data);
        return $data;
      }
  
!     # Pre-processing checks
      if ($charset eq 'iso-2022-jp') {
+       # iso-2022-jp, convert to euc-jp first
        _jp_2022_to_euc(\$data);
        $charset = 'euc-jp';
+     } elsif ($charset eq 'iso-2022-kr') {
+       # if iso-2022-kr, convert to euc-kr first
+       _kr_2022_to_euc(\$data);
+       $charset = 'cp949';
      }
  
***************
*** 204,257 ****
      $map = _load_charmap($charset)  unless defined $map;
  
      if ($charset eq 'euc-jp') {
        # Japanese
!       $data =~ s{([\x00-\x7E]|
!                   [\x8E][\xA1-\xDF]|
!                   [\xA1-\xFE][\xA1-\xFE]|
!                   \x8F[\xA2-\xFE][\xA1-\xFE])
!                 }{
!                   $char = unpack('N', ("\0"x(4-length($1))).$1);
!                   ($entstr = $map->{$char})
!                   ? ref($entstr)
!                      ? join('', map { '&'.$_.';' } @{$entstr}) :
!                        join('', '&', $entstr, ';')
!                   : ($entstr = $ASCIIMap{$char})
!                      ? join('', '&', $ASCIIMap{$char}, ';')
!                      : (length($1) > 1 ? '?' : $1)
!                 }gxe;
! 
!     } elsif ($charset eq 'cp950' ||
!            $charset eq 'cp936' ||
!            $charset eq 'gb2312' ||
!            $charset eq 'big5-hkscs') {
! 
        # Chinese
!       $data =~ s{([\x00-\x80]|
!                   [\x81-\xFF][\x00-\xFF])
!                 }{
!                   $char = unpack(length($1)>1?'n':'C',$1);
!                   ($entstr = $map->{$char})
!                   ? ref($entstr)
!                      ? join('', map { '&'.$_.';' } @{$entstr}) :
!                        join('', '&', $entstr, ';')
!                   : ($entstr = $ASCIIMap{$char})
!                      ? join('', '&', $ASCIIMap{$char}, ';')
!                      : (length($1) > 1 ? '?' : $1)
!                 }gxe;
! 
!     } else {
!       # Singly byte charset
!       $data =~ s{([\x00-\xFF])
!                 }{
!                   $char = unpack('C', $1);
!                   ($entstr = $map->{$char})
!                   ? ref($entstr)
!                      ? join('', map { '&'.$_.';' } @{$entstr}) :
!                        join('', '&', $entstr, ';')
!                   : ($entstr = $ASCIIMap{$char})
!                      ? join('', '&', $ASCIIMap{$char}, ';')
!                      : $1
!                 }gxe;
      }
      $data;
  }
--- 172,208 ----
      $map = _load_charmap($charset)  unless defined $map;
  
+     # Convert text
      if ($charset eq 'euc-jp') {
        # Japanese
!       _euc_jp_to_sgml(\$data, $map);
!       return $data;
!     }
!     if ($charset eq 'cp949') {
!       # Korean
!       _euc_kr_to_sgml(\$data, $map);
!       return $data;
!     }
!     if ($charset eq 'cp950' ||
!           $charset eq 'cp936' ||
!           $charset eq 'gb2312' ||
!           $charset eq 'big5-hkscs') {
        # Chinese
!       _chinese_to_sgml(\$data, $map);
!       return $data;
      }
+ 
+     # Singly byte charset
+     my($char, $entstr);
+     $data =~ s{([\x00-\xFF])}
+     {
+       $char = unpack('C', $1);
+       ($entstr = $map->{$char})
+       ? ref($entstr)
+          ? join('', map { '&'.$_.';' } @{$entstr}) :
+            join('', '&', $entstr, ';')
+       : ($entstr = $ASCIIMap{$char})
+          ? join('', '&', $entstr, ';')
+          : $1
+     }gxe;
      $data;
  }
***************
*** 284,287 ****
--- 235,289 ----
  
  ##---------------------------------------------------------------------------##
+ ##  Private Routines.
+ ##  NOTE: Many regex substitute code has been copy-n-pasted.  This
+ ##      was done instead of encapsulating into a function in order
+ ##      to avoid the overhead of a function call.  Since the
+ ##      code block will be executed for all, or nearly all, characters
+ ##      in the input, avoiding the function call gives a speed
+ ##      improvement.  Things are already slow enough.
+ 
+ sub _utf8_to_sgml {
+     my $data_r = shift;
+ 
+     my($i, $n, $mask, $char);
+     # We do not do full compliant UTF-8 parsing: malformed sequences
+     # will end up being treated as individual octets replaced with the
+     # '?' sign instead of using a single '?' for the entire malformed
+     # character sequence.
+     $$data_r =~ s{([\x00-\x7F]|
+                  [\xC0-\xDF][\x80-\xBF]|
+                   \xE0      [\xA0-\xBF][\x80-\xBF]|
+                  [\xE1-\xEF][\x80-\xBF]{2}|
+                   \xF0      [\x90-\xBF][\x80-\xBF]{2}|
+                  [\xF1-\xF7][\x80-\xBF]{3}|
+                   \xF8      [\x88-\xBF][\x80-\xBF]{3}|
+                  [\xF9-\xFB][\x80-\xBF]{4}|
+                   \xFC      [\x84-\xBF][\x80-\xBF]{4}|
+                   \xFD      [\x80-\xBF]{5}|
+                  .)}
+     {
+       if (($n = length($1)) == 1) {
+           $char = unpack('C',$1);
+           if ($char <= 0x7F) {
+               $ASCIIMap{$char}
+                   ? join('', '&', $ASCIIMap{$char}, ';')
+                   : $1;
+           } else {
+               '?';
+           }
+       } else {
+           # XXX: There has got to be a quick way to do this?
+           for ($mask=0x1, $i=$n; $i < 6; ++$i) {
+               $mask = ($mask << 1) | 0x1;
+           }
+           $char = (unpack('C',substr($1,0,1)) & $mask) << ($n-1)*6;
+           for ($i=1; $i < $n; ++$i) {
+               $char |= ((unpack('C',substr($1,$i,1)) & 0x3F) <<
+                        (($n-$i-1)*6));
+           }
+           sprintf('&#x%X;',$char);
+       }
+    }gxse;
+ }
  
  sub _jp_2022_to_euc {
***************
*** 293,313 ****
                     (\e\([BJ])|                  # ISO ASC
                     (\e\(I))                     # JIS KANA
!                    ([^\e]*)
!                }{
!                   ($esc_0212, $esc_asc, $esc_kana, $chunk) =
!                        ($1, $2, $3, $4);
!                   if (!$esc_asc) {
!                       $chunk =~ tr/\x21-\x7e/\xa1-\xfe/;
!                       if ($esc_kana) {
!                           $chunk =~ s/([\xa1-\xdf])/\x8e$1/og;
!                       }
!                       elsif ($esc_0212) {
!                           $chunk =~ s/([\xa1-\xfe][\xa1-\xfe])/\x8f$1/og;
!                       }
!                   }
!                   $chunk;
!                }gex;
  }
  
  sub _load_charmap {
    my $charset = shift;
--- 295,388 ----
                     (\e\([BJ])|                  # ISO ASC
                     (\e\(I))                     # JIS KANA
!                    ([^\e]*)}
!     {
!       ($esc_0212, $esc_asc, $esc_kana, $chunk) =
!           ($1, $2, $3, $4);
!       if (!$esc_asc) {
!           $chunk =~ tr/\x21-\x7e/\xa1-\xfe/;
!           if ($esc_kana) {
!               $chunk =~ s/([\xa1-\xdf])/\x8e$1/og;
!           } elsif ($esc_0212) {
!               $chunk =~ s/([\xa1-\xfe][\xa1-\xfe])/\x8f$1/og;
!           }
!       }
!       $chunk;
!     }gex;
! }
! 
! sub _euc_jp_to_sgml {
!     my $data_r  = shift;
!     my $map   = shift;
!     my($char, $entstr);
!     $$data_r =~ s{([\x00-\x7E]|
!                  [\x8E][\xA1-\xDF]|
!                  [\xA1-\xFE][\xA1-\xFE]|
!                  \x8F[\xA2-\xFE][\xA1-\xFE])}
!     {
!       $char = unpack('N', ("\0"x(4-length($1))).$1);
!       ($entstr = $map->{$char})
!       ? ref($entstr)
!          ? join('', map { '&'.$_.';' } @{$entstr}) :
!            join('', '&', $entstr, ';')
!       : ($entstr = $ASCIIMap{$char})
!          ? join('', '&', $entstr, ';')
!          : (length($1) > 1 ? '?' : $1)
!     }gxe;
  }
  
+ sub _kr_2022_to_euc {
+     # implementation of this function plagerized from Encode::KR::2022_KR.
+     my $data_r        = shift;
+     my($match);
+     $data_r =~ s/\e\$\)\C//gx;              # remove the designator
+     $data_r =~ s{\x0E               # replace characters in GL
+                ([^\x0F]*)           # between SO(\x0e) and SI(\x0f)
+                \x0F}                # with characters in GR
+     {
+       $match = $1;
+       $match =~ tr/\x21-\x7e/\xa1-\xfe/;
+       $match;
+     }gex;
+ }
+ 
+ sub _euc_kr_to_sgml {
+     my $data_r  = shift;
+     my $map   = shift;
+     my($char, $entstr);
+     $$data_r =~ s{([\x00-\x80]|
+                  [\x81-\xFE][\xA1-\xFE])}
+     {
+       $char = unpack(length($1)>1?'n':'C',$1);
+       ($entstr = $map->{$char})
+       ? ref($entstr)
+          ? join('', map { '&'.$_.';' } @{$entstr}) :
+            join('', '&', $entstr, ';')
+       : ($entstr = $ASCIIMap{$char})
+          ? join('', '&', $entstr, ';')
+          : (length($1) > 1 ? '?' : $1)
+     }gxe;
+ }
+ 
+ sub _chinese_to_sgml {
+     my $data_r        = shift;
+     my $map   = shift;
+     my($char, $entstr);
+     $$data_r =~ s{([\x00-\x80]|
+                  [\x81-\xFF][\x00-\xFF])}
+     {
+       $char = unpack(length($1)>1?'n':'C',$1);
+       ($entstr = $map->{$char})
+       ? ref($entstr)
+          ? join('', map { '&'.$_.';' } @{$entstr}) :
+            join('', '&', $entstr, ';')
+       : ($entstr = $ASCIIMap{$char})
+          ? join('', '&', $entstr, ';')
+          : (length($1) > 1 ? '?' : $1)
+     }gxe;
+ }
+ 
+ 
+ ##---------------------------------------------------------------------------##
+ 
  sub _load_charmap {
    my $charset = shift;
***************
*** 422,424 ****
--- 497,501 ----
  
  Earl Hood, [EMAIL PROTECTED]
+ 
+ =cut
  

---------------------------------------------------------------------
To sign-off this list, send email to [EMAIL PROTECTED] with the
message text UNSUBSCRIBE MHONARC-DEV

CVS: mhonarc/MHonArc/lib/MHonArc CharEnt.pm,1.8,1.9

Reply via email to