Package: lookup
Version: 1.08b-10
Severity: wishlist
Tags: patch

Attached is a patch to add UTF-8 output support to lookup. In order to avoid an
iconv dependency, I've instead generated two arrays (JIS X 0208 and JIS X 0212)
with a Perl script.

However, input support without iconv looks painful, so I thought I'd submit
this as it is for consideration.

If you want to include lib/jisucs2tbl.h in the tarball (like commands.h)
then simply generate it once, and move lib/jisucs2tbl.h from the clean target
to the realclean target.

Adding any more output encodings would be difficult, as we just used up all the
input encoding bitfields.

diff -ruN lookup-1.08b.orig/cmds.master lookup-1.08b/cmds.master
--- lookup-1.08b.orig/cmds.master       1996-07-19 01:10:36.000000000 +1000
+++ lookup-1.08b/cmds.master    2005-09-01 18:36:22.000000000 +1000
@@ -322,8 +322,8 @@
 
 CMD_GENERAL|CMD_ENCODING_RELATED
 report or set the output encoding-method
-output encoding [euc|sjis|jis|...]
-output> (encoding>)? 
(euc|sjis|jis-?(78|83|90)?(-(ascii|roman))?)?((<(212|no212|hwk|nohwk|foldhwk|disp|nodisp|code|mark)>|[-,\s]+)*)
+output encoding [euc|sjis|utf8|jis|...]
+output> (encoding>)? 
(euc|sjis|utf8|jis-?(78|83|90)?(-(ascii|roman))?)?((<(212|no212|hwk|nohwk|foldhwk|disp|nodisp|code|mark)>|[-,\s]+)*)
 cmd_output_encoding(\2, \3, \5, \6)
 
 CMD_GENERAL
diff -ruN lookup-1.08b.orig/commands.c lookup-1.08b/commands.c
--- lookup-1.08b.orig/commands.c        2005-09-01 18:48:12.000000000 +1000
+++ lookup-1.08b/commands.c     2005-09-01 18:32:14.000000000 +1000
@@ -1367,6 +1367,7 @@
       default: soft_assert(0); break;
       case SJIS_OUTPUT: output("sjis"); break;
       case EUC_OUTPUT: output("euc"); break;
+      case UTF8_OUTPUT: output("utf8"); break;
 
       case JIS_OUTPUT:
        switch(output_style & _JIS_KANJI_STYLE)
@@ -1424,6 +1425,8 @@
            (void)select_output_style(EUC_OUTPUT);
        else if (main_style[0]=='s' || main_style[0]=='S')
            (void)select_output_style(SJIS_OUTPUT);
+       else if (main_style[0]=='u' || main_style[0]=='U')
+           (void)select_output_style(UTF8_OUTPUT);
        else if (main_style[0]=='j' || main_style[0]=='j')
        {
            if (!jis_year)
diff -ruN lookup-1.08b.orig/commands.h lookup-1.08b/commands.h
--- lookup-1.08b.orig/commands.h        1996-07-21 18:52:23.000000000 +1000
+++ lookup-1.08b/commands.h     2005-09-01 18:36:43.000000000 +1000
@@ -696,9 +696,9 @@
     /* generated from "cmds.master" record at line 320*/
     {
         CMD_GENERAL|CMD_ENCODING_RELATED,
-        (S)"output encoding [euc|sjis|jis|...]",
+        (S)"output encoding [euc|sjis|utf8|jis|...]",
         (S)"report or set the output encoding-method",
-        
(S)"^\\s*output>\\s*(encoding>)?\\s*(euc|sjis|jis-?(78|83|90)?(-(ascii|roman))?)?((<(212|no212|hwk|nohwk|foldhwk|disp|nodisp|code|mark)>|[-,\\s]+)*)\\s*$",
+        
(S)"^\\s*output>\\s*(encoding>)?\\s*(euc|sjis|utf8|jis-?(78|83|90)?(-(ascii|roman))?)?((<(212|no212|hwk|nohwk|foldhwk|disp|nodisp|code|mark)>|[-,\\s]+)*)\\s*$",
          _func41_,
     },
 
diff -ruN lookup-1.08b.orig/lib/output.c lookup-1.08b/lib/output.c
--- lookup-1.08b.orig/lib/output.c      2005-09-01 18:48:12.000000000 +1000
+++ lookup-1.08b/lib/output.c   2005-09-01 18:35:27.000000000 +1000
@@ -509,6 +509,110 @@
     return retval;
 }
 
+#include "jisucs2tbl.h"
+
+/*
+ * Output filter converting the EUC-JP byte stream to UTF-8.  Called once
+ * per byte C; bytes of an incomplete multibyte character are held in
+ * static state until the character is complete.  Converted bytes go to
+ * *nextout++.  Returns the number of display columns output by this call
+ * (0 while a character is still incomplete).
+ */
+static unsigned output_euc_as_utf8(unsigned char c)
+{
+    /* Pending lead byte (0 = none) and pending JIS X 0212 second byte. */
+    static unsigned char hi = 0;
+    static unsigned char mid = 0;
+    u_int16_t ucs2 = 0;   /* stays 0 when the sequence cannot be mapped */
+    int width = 2;        /* kanji take two display columns */
+
+    /*
+     * An EUC-JP character is one, two or three bytes long, as selected
+     * by its first byte:
+     *   HALF_WIDTH_KATA_HI  + 1 byte   half-width kana (JIS X 0201 Kana)
+     *   THREE_BYTE_HI       + 2 bytes  JIS X 0212, via the jis212 table
+     *   0xa0 and above      + 1 byte   JIS X 0208, via the jis208 table
+     *   anything else                  single byte (JIS X 0201 Roman)
+     */
+    if (hi == 0) {
+        if (c < 0xa0 && c != HALF_WIDTH_KATA_HI && c != THREE_BYTE_HI) {
+            /* Single byte: copy it through, except that NUL is dropped. */
+            if (c != 0)
+                *nextout++ = c;
+            if ((c == '\n' && flush_on_newline) || nextout >= bufend)
+                flush_raw_output();
+            /* Crude notion of printable: everything from space up. */
+            return (c >= ' ') ? 1 : 0;
+        }
+        /* Lead byte of a multibyte character: wait for the rest. */
+        hi = c;
+        mid = 0;
+        return 0;
+    }
+    if (hi == THREE_BYTE_HI && mid == 0) {
+        /* Second byte of a JIS X 0212 character: one more to come. */
+        mid = c;
+        return 0;
+    }
+
+    /* C completes the character.  The tables have 95 rows and columns
+     * (bytes 0xa0..0xfe); a lead or trail byte outside 0xa1..0xfe leaves
+     * the character unmapped instead of reading outside the table. */
+    switch (hi) {
+    case HALF_WIDTH_KATA_HI:
+        /* Trail bytes 0xa1..0xdf map linearly onto U+FF61..U+FF9F. */
+        if (c >= 0xa1 && c <= 0xdf)
+            ucs2 = 0xff00 + c - 0x40;
+        width = 1;
+        break;
+    case THREE_BYTE_HI:
+        if (mid >= 0xa1 && mid <= 0xfe && c >= 0xa1 && c <= 0xfe)
+            ucs2 = jis212[mid - 0xa0][c - 0xa0];
+        break;
+    default:
+        if (hi >= 0xa1 && hi <= 0xfe && c >= 0xa1 && c <= 0xfe)
+            ucs2 = jis208[hi - 0xa0][c - 0xa0];
+        break;
+    }
+    hi = 0;
+    mid = 0;
+
+    if (ucs2 == 0) {
+        /* Malformed or unassigned: emit U+FFFD REPLACEMENT CHARACTER
+         * instead of a raw NUL byte.  Assumed to display single width. */
+        ucs2 = 0xfffd;
+        width = 1;
+    }
+
+    /* Make room for the longest encoding (3 bytes) before writing.
+     * NOTE(review): assumes flush_raw_output() empties the buffer and
+     * resets nextout, as its use at the buffer end suggests -- confirm. */
+    if (bufend - nextout < 3)
+        flush_raw_output();
+
+    if (ucs2 < 0x80) {
+        /* The table translated the character down to ASCII. */
+        *nextout++ = ucs2;
+        if ((ucs2 == '\n' && flush_on_newline) || nextout >= bufend)
+            flush_raw_output();
+        return 1;
+    }
+    if (ucs2 < 0x800) {
+        /* Two-byte UTF-8; treated as back in single-width land. */
+        *nextout++ = 0xc0 | (ucs2 >> 6);
+        *nextout++ = 0x80 | (ucs2 & 0x3f);
+        width = 1;
+    } else {
+        /* Three-byte UTF-8 covers the rest of the 16-bit range. */
+        *nextout++ = 0xe0 | (ucs2 >> 12);
+        *nextout++ = 0x80 | ((ucs2 >> 6) & 0x3f);
+        *nextout++ = 0x80 | (ucs2 & 0x3f);
+    }
+    if (nextout >= bufend)
+        flush_raw_output();
+    return width;
+}
+
 /***************************************************************/
 
 unsigned (*_output_char_function)(unsigned char) = output_euc_as_jis;
@@ -986,6 +1090,9 @@
            break;
        }
        break;
+      case UTF8_OUTPUT:
+       *function_pointer = output_euc_as_utf8;
+       break;
     }
 
     if ((output_style & _KATAKANA) != PASS_HW_KATANANA)
@@ -1019,6 +1126,7 @@
          case JIS_ASCII: output(", ASCII)"); break;
        }
        break;
+      case UTF8_OUTPUT: output("UTF-8"); break;
     }
 
     switch (output_style & _0212_1990)
diff -ruN lookup-1.08b.orig/lib/output.h lookup-1.08b/lib/output.h
--- lookup-1.08b.orig/lib/output.h      1996-01-16 04:50:39.000000000 +1100
+++ lookup-1.08b/lib/output.h   2005-09-01 13:48:34.000000000 +1000
@@ -61,7 +61,8 @@
 #define EUC_OUTPUT              0x00000001
 #define SJIS_OUTPUT             0x00000002
 #define JIS_OUTPUT              0x00000004
-#define _BASIC_OUTPUT_TYPE (JIS_OUTPUT|SJIS_OUTPUT|EUC_OUTPUT)
+#define UTF8_OUTPUT             0x00000008
+#define _BASIC_OUTPUT_TYPE (JIS_OUTPUT|SJIS_OUTPUT|EUC_OUTPUT|UTF8_OUTPUT)
 
 #define JIS_1978_OUTPUT                JIS_OUTPUT
 #define JIS_1983_OUTPUT                JIS_OUTPUT|0x00000010
diff -ruN lookup-1.08b.orig/Makefile lookup-1.08b/Makefile
--- lookup-1.08b.orig/Makefile  2005-09-01 18:48:12.000000000 +1000
+++ lookup-1.08b/Makefile       2005-09-01 18:56:15.000000000 +1000
@@ -175,6 +175,9 @@
        -echo '#endif /* file wrapper */'                >> tmp;
        mv tmp lib/system.h
 
+lib/jisucs2tbl.h: /usr/share/i18n/charmaps/EUC-JP.gz mkeucucs2tbl.pl
+       zcat /usr/share/i18n/charmaps/EUC-JP.gz | ./mkeucucs2tbl.pl > lib/jisucs2tbl.h
+
 make.sh: realclean
        @echo ':# script to make lookup'                               > tmp
        @echo '## Can set CC= and CFLAGS= on the command line, just as with 
make' >> tmp
@@ -219,7 +222,7 @@
 
 clean: tidy
        [EMAIL PROTECTED] dummy > dummy.o
-       /bin/rm -f \#* *.o *.d doc/#* $(LOCAL_LIB) lib/system.h
+       /bin/rm -f \#* *.o *.d doc/#* $(LOCAL_LIB) lib/system.h lib/jisucs2tbl.h
 
 realclean: clean
        [EMAIL PROTECTED] dummy > lookup.man.xxx
@@ -261,7 +264,7 @@
  lib/virtfile.h lib/romaji2kana.h lib/jregex.h lib/strsave.h \
  lib/replace.h lib/input.h lookup.h lib/jreadline.h
 output.o: lib/output.c lib/config.h lib/assert.h lib/input.h \
- lib/output.h
+ lib/output.h lib/jisucs2tbl.h
 replace.o: lib/replace.c lib/config.h lib/assert.h lib/jregex.h \
  lib/xmalloc.h lib/replace.h
 romaji2kana.o: lib/romaji2kana.c lib/config.h lib/assert.h \
diff -ruN lookup-1.08b.orig/mkeucucs2tbl.pl lookup-1.08b/mkeucucs2tbl.pl
--- lookup-1.08b.orig/mkeucucs2tbl.pl   1970-01-01 10:00:00.000000000 +1000
+++ lookup-1.08b/mkeucucs2tbl.pl        2005-09-01 18:25:32.000000000 +1000
@@ -0,0 +1,63 @@
+#! /usr/bin/perl
+
+# Generate lib/jisucs2tbl.h: the jis208 and jis212 tables that map an
+# EUC-JP character to its UCS-2 code point, indexed by
+# [lead byte - 0xa0][trail byte - 0xa0].
+#
+# Handy reference: http://czyborra.com/utf/
+# Expects zcat /usr/share/i18n/charmaps/EUC-JP.gz as input
+
+use strict;
+use warnings;
+
+# Bytes run from 0xa0 to 0xfe, so each table has 95 x 95 cells.
+my $LAST = 0xfe - 0xa0;    # highest row/column index
+my $DIM  = $LAST + 1;      # rows and columns per table
+
+# Record one charmap entry in the given table.  Bytes outside 0xa0..0xfe
+# are ignored: a negative index would silently wrap around to the end of
+# the array and an oversized one would not fit the emitted C array.
+sub store {
+    my ($table, $ucs, $hi, $lo) = @_;
+    my ($row, $col) = (hex($hi) - 0xa0, hex($lo) - 0xa0);
+    return if $row < 0 || $row > $LAST || $col < 0 || $col > $LAST;
+    $table->[$row][$col] = hex($ucs);
+}
+
+# Print one table as a C array definition; unassigned cells become 0x0000.
+# Entries are 16 bits wide, which is all that four hex digits can need.
+sub print_table {
+    my ($name, $label, $table) = @_;
+    printf "// %s: %d rows\n", $label, scalar(@$table);
+    printf "static const u_int16_t %s[%d][%d] = {\n", $name, $DIM, $DIM;
+    for my $i (0 .. $LAST) {
+        my $row = $table->[$i] || [];
+        my @cells = map { defined $row->[$_] ? $row->[$_] : 0 } 0 .. $LAST;
+        print "\t{", join(", ", map { sprintf "0x%04x", $_ } @cells), "}";
+        print $i == $LAST ? "\n" : ", \n";
+    }
+    print "};\n";
+}
+
+# Skip everything up to the JIS X 0208 section of the charmap.
+while (<>) {
+    last if /JIS X 0208/;
+}
+
+# Two-byte entries, up to the start of the JIS X 0212 section.
+my @jis208;
+while (<>) {
+    last if /JIS X 0212/;
+    next unless m#^<U(....)>\s+/x(..)/x(..)#;
+    store(\@jis208, $1, $2, $3);
+}
+
+# Three-byte entries: lead byte 0x8f followed by row and column.
+my @jis212;
+while (<>) {
+    next unless m#^<U(....)>\s+/x8f/x(..)/x(..)#;
+    store(\@jis212, $1, $2, $3);
+}
+
+print_table('jis208', 'JIS X 208', \@jis208);
+print_table('jis212', 'JIS X 212', \@jis212);

-- System Information:
Debian Release: testing/unstable
  APT prefers unstable
  APT policy: (990, 'unstable'), (950, 'unstable'), (900, 'experimental')
Architecture: i386 (i686)
Shell:  /bin/sh linked to /bin/bash
Kernel: Linux 2.6.12
Locale: LANG=en_AU.UTF-8, LC_CTYPE=en_AU.UTF-8 (charmap=UTF-8)

Versions of packages lookup depends on:
ii  libc6                         2.3.5-6    GNU C Library: Shared libraries an

lookup recommends no packages.

-- no debconf information

-- 
Paul "TBBle" Hampson, [EMAIL PROTECTED]
8th year CompSci/Asian Studies student, ANU

Shorter .sig for a more eco-friendly paperless office.

Attachment: pgpgYlpjCZKzX.pgp
Description: PGP signature

Reply via email to