Re: libraryform enhancements

Radovan Garabik Sat, 13 Mar 2004 16:30:41 -0800


New version of plucker unicode patch. This one is combined,
contains patches for both python parser and viewer.
Highlights:


parser:

requires python2.0

--charset option is gone, superseded by --input-charset and
--output-charset

you can select anything as --output-charset, characters from
--input-charset that cannot be represented in --output-charset
are included as unicode values - this is why default --output-charset
is ascii, rather than palmos.
Conversion is done by looping through all the characters, which is
kind of inefficient, but works with python2.0 (can be made rather
more speedy with python2.3)

special value "unicode" for output charset is gone too.
(using "unicode" will use python built-in unicode encoding (=UCS2),
and the text will be unreadable). Use "ascii" (the default) instead.

--input-charset for text files is guessed from current locale, if not
explicitly given

input charset for html files (pages) is taken either from headers,
or from META tags. If it is present neither in the headers nor in the
META tags, --input-charset is used.


viewer:

if using documents in a legacy 8-bit encoding (e.g. older plucked
documents), the viewer falls back to using system TxtGlue* functions,
even with gray fonts, so you can read your old cyrillic documents as
before
(if you use an encoding that is not palmos compatible, you need your old
gray fonts too - no surprise here)

if documents contain unicode values, you need proper unicode gray fonts
(i.e. not your old ones with KOI8-R characters pretending to be on
Latin1 positions). No surprise here either, since unicode values were
not displayed properly with previous plucker versions at all.

Documents containing unicode values not present in gray fonts
will result in mojibake. The same when using palm fonts to
read them (unless they fall into ISO-8859-1 range)




-- 
 -----------------------------------------------------------
| Radovan Garabík http://melkor.dnp.fmph.uniba.sk/~garabik/ |
| __..--^^^--..__    garabik @ melkor.dnp.fmph.uniba.sk     |
 -----------------------------------------------------------
Antivirus alert: file .signature infected by signature virus.
Hi! I'm a signature virus! Copy me into your signature file to help me spread!

diff -urN plucker-original/configure.in plucker-new/configure.in
--- plucker-original/configure.in       2004-03-07 13:00:33.000000000 +0100
+++ plucker-new/configure.in    2004-03-08 17:09:20.000000000 +0100
@@ -320,6 +320,7 @@
                           to get the function names included in POSE's
                           profiling output ])
 AC_ARG_ENABLE(imode, [  --enable-imode          to enable i-mode support (also 
requires the imodeicons.pdb database)])
+AC_ARG_ENABLE(unicode, [  --enable-unicode        to enable unicode support])
 AC_ARG_ENABLE(scroll_to_bottom, [  --disable-scroll-to-bottom
                           always scroll even pages instead of stopping when
                           the end of the page is reached (will add some extra
diff -urN plucker-original/parser/python/PyPlucker/Spider.py 
plucker-new/parser/python/PyPlucker/Spider.py
--- plucker-original/parser/python/PyPlucker/Spider.py  2004-02-02 03:31:58.000000000 
+0100
+++ plucker-new/parser/python/PyPlucker/Spider.py       2004-03-10 19:52:53.000000000 
+0100
@@ -1276,8 +1276,10 @@
         message(0, "                   Set or clear the backup bit in the output 
file.")
         message(0, "    --beamable, --not-beamable:")
         message(0, "                   Set or clear the beamable bit in the output 
file.")
-        message(0, "    --charset=<name>:")
-        message(0, "                   Set the default charset to that specified by 
<name>.")
+        message(0, "    --output-charset=<name>:")
+        message(0, "                   Set the output charset of generated document 
to that specified by <name>.")
+        message(0, "    --input-charset=<name>:")
+        message(0, "                   Assume input charset to that specified by 
<name>.")
         message(0, "    --owner-id=<name>:")
         message(0, "                   Set owner-id of the output document to 
<name>.")
         message(0, "    --url-pattern=<regexp-pattern>:")
@@ -1350,7 +1352,8 @@
         backup = None
         copy_protect = None
         iconfile = None
-        default_charset = None
+        output_charset = None
+        input_charset = None
         owner_id = None
         url_pattern = None
         referrer = None
@@ -1376,7 +1379,7 @@
                                         "maxheight=", "maxwidth=", "alt-maxheight=", 
"alt-maxwidth=",
                                         "compression=", "home-url=", "update-cache", 
"launchable",
                                         "not-launchable", "backup", "no-backup", 
"beamable", "not-beamable",
-                                        "icon=", "charset=", "owner-id=", 
"url-pattern=", "referrer=",
+                                        "icon=", "output-charset=", "input-charset=", 
"owner-id=", "url-pattern=", "referrer=",
                                         "user-agent=", "title=", "author=", 
"status-file=", "version",
                                         "tables", "depth-first", "http-proxy=", 
"http-proxy-user=", "http-proxy-pass=",
                                         "fragments=", "creator-id="])
@@ -1494,8 +1497,10 @@
                 copy_protect = 1
             elif opt == "--icon":
                 iconfile = arg
-            elif opt == "--charset":
-                default_charset = arg
+            elif opt == "--output-charset":
+                output_charset = arg
+            elif opt == "--input-charset":
+                input_charset = arg
             elif opt == "--owner-id":
                 owner_id = arg
             elif opt == "--referrer":
@@ -1602,21 +1607,19 @@
         if zlib_compression == 'false':
             message('Specification of an owner-id forces use of zlib compression...')
         zlib_compression = 'true'
-        
-    mibenum = None
-    # if not specified on command line, look in .pluckerrc
-    if default_charset is None:
-        default_charset = config.get_string("default_charset")
-    # if we have one, validate it
-    if default_charset is not None:
-        from PyPlucker.helper.CharsetMapping import charset_name_to_mibenum, 
charset_known_names
-        import string, re
-        mibenum = charset_name_to_mibenum(default_charset)
-        if mibenum:
-            config.set('default_charset', mibenum)
-        else:
-            usage ("Error:  Unsupported charset '" + default_charset + "' specified 
as default charset.\n"
-                   "        Charset must be either a decimal MIBenum value, or one of 
" + str(charset_known_names()))
+
+    if output_charset is None:
+        output_charset = config.get_string("output_charset")
+    if output_charset is None:
+        output_charset = 'palmos'
+    config.set ('output_charset', output_charset)
+
+    if input_charset is None:
+        input_charset = config.get_string("input_charset")
+    if output_charset is None:
+        input_charset = 'utf-8'        
+    config.set ('input_charset', input_charset)
+
 
     # update the config with the user options
     if use_file is not None:
@@ -1696,8 +1699,6 @@
         config.set ('author_md', author)
     if title is not None:
         config.set ('title_md', title)
-    if mibenum is not None:
-        config.set ('default_charset', mibenum)
     if statusfile is not None:
         config.set ('status_file', statusfile)
     if depthfirst is not None:
diff -urN plucker-original/parser/python/PyPlucker/TextParser.py 
plucker-new/parser/python/PyPlucker/TextParser.py
--- plucker-original/parser/python/PyPlucker/TextParser.py      2004-02-27 
23:51:08.000000000 +0100
+++ plucker-new/parser/python/PyPlucker/TextParser.py   2004-03-08 23:45:38.000000000 
+0100
@@ -31,6 +31,8 @@
 ## Now PyPlucker things should generally be importable
 ##
 
+NBSP = u'\u00a0' # non-breaking space
+
 import string
 import re
 try:
@@ -336,6 +338,32 @@
 _entitycharref = re.compile('^(.*)&([#a-zA-Z][-.a-zA-Z0-9]*);(.*)$')
 _html_char_ref_pattern = re.compile('^&#([0-9]+);$')
 
+# this needs to be rewritten
+def text_alternative (uchar):
+    "get text alternative to unicode character uchar"
+    val = ord(uchar)
+    if val == 8211:
+        return "-"
+    elif val == 8212:
+        return "--"
+    elif val == 8216:
+        return "`"
+    elif val == 8217:
+        return "'"
+    elif val == 8220:
+        return "\""
+    elif val == 8230:
+        return "..."
+    elif val == 8221:
+        return "\""
+    elif val == 8226:
+        return "o"
+    elif val == 8482:
+        return "(tm)"
+    else:
+        return "&#%d;" % val
+
+
 # These junk "alt" attribute values are not worth showing.
 junk_alt_attributes = ("img", "[img]", "spacer", "")
 
@@ -374,8 +402,6 @@
     return text
 
 
-
-
 class AttributeStack:
     """A data structure to maintain information about the current
     text attributes.
@@ -525,12 +551,11 @@
         return self._tags[self._stack[-1]]
 
 
-
-
 class TextDocBuilder:
     """Encapsulate the knowledge of when to change styles, add paragraphs, etc."""
 
     def __init__ (self, url, config, **keyword_args):
+        message(2,"initializing textdocbuilder")
         self._doc = PluckerDocs.PluckerTextDocument (url)
         self._config = config
         self._attributes = AttributeStack ()
@@ -582,19 +607,12 @@
             # see if we can supply a default charset
             url = self._doc.get_url()
             if self._config:
-                userspec = self._config.get_int('default_charset', 0)
+                userspec = self._config.get_int('output_charset_mibenum', 0)
             else:
                 userspec = None
             locale_default = charset_name_to_mibenum(DEFAULT_LOCALE_CHARSET_ENCODING)
-            # the userspec will take precedence
-            if userspec:
+            if userspec is not None:
                 self._doc.set_charset(userspec)
-            # OK, so we have no idea.  Use the HTTP default of ISO-8859-1 (4) for
-            # http: URLs, and the environment default (if any) for others
-            elif (string.lower(url[:5]) == 'http:' or string.lower(url[:6]) == 
'https:'):
-                self._doc.set_charset(4)
-            elif locale_default:
-                self._doc.set_charset(locale_default)
 
     def add_name (self, name):
         """Give name to the current paragraph"""
@@ -875,7 +893,28 @@
         
 
     def add_text (self, text):
-        """Add some text, maybe even many lines."""
+        """Add some text, maybe even many lines.
+            Text can be either a string or a unicode string.
+        """
+
+        def add_unicode_text(paragraph, text):
+            if type(text)==type(""): # non-unicode string, shortcut
+                message(4, "Adding 8-bit text")
+                paragraph.add_text(text)
+            elif type(text)==type(u""):
+                message(4, "Adding Unicode text")
+                for c in text:
+                    if ord(c)<128:
+                        paragraph.add_text(str(c))
+                    else:
+                        try:
+                            outc = c.encode(self._config.get_string("output_charset"))
+                            paragraph.add_text(outc)
+                        except UnicodeError:
+                            paragraph.add_unicode_char(ord(c), text_alternative(c))
+            else:
+                raise "Unexpected text type"
+
         lines = string.split (text, "\n")
         for i in range (len (lines)):
             line = lines[i]
@@ -891,7 +930,7 @@
                 if rest_size < 0:
                     rest_size = 0
                 (first, rest) = self._find_text_split (line, rest_size)
-                self._paragraph.add_text (first)
+                add_unicode_text(self._paragraph, first)
                 self._approximate_size = self._approximate_size + len (first)
                 self._is_new_paragraph = 0
                 self._is_new_line = 0
@@ -901,7 +940,7 @@
                     break
             
             if line:
-                self._paragraph.add_text (line)
+                add_unicode_text(self._paragraph, line)
                 self._approximate_size = self._approximate_size + len (line)
                 self._is_new_paragraph = 0
                 self._is_new_line = 0
@@ -963,12 +1002,17 @@
 
     def __init__ (self, url, text, headers, config, attribs):
         text = _clean_newlines (text)
+        textcharset = config.get_string("input_charset")
         # This we use to build the document
         self._doc = TextDocBuilder (url, config)
         if headers.has_key("charset"):
-            self._doc.set_charset (headers["charset"])
+            textcharset = headers["charset"]
         elif attribs.has_key("charset"):
-            self._doc.set_charset (attribs["charset"])
+            textcharset = attribs["charset"]
+        if not textcharset: # we have no idea, so we use locale
+            textcharset = DEFAULT_LOCALE_CHARSET_ENCODING 
+        text = unicode(text, textcharset)
+        message(4, "PlainTextParser: converting into unicode from "+textcharset)
         self._url = url
         self._text = text
         # In these two lists we store tuples of (url, attributes) for encountered 
anchors
@@ -1060,9 +1104,11 @@
         # javascript:document.write("<div>") turns it back on, because
         # it only recognizes the div, not the javascript.
         self._visible = 1
-        self._charset = headers.has_key('charset') and 
charset_name_to_mibenum(headers['charset'])
-        if self._charset:
-            self._doc.set_charset(headers['charset'])
+        # charset (python name of it) of current document - first: default
+        self.html_charset = config.get_string("input_charset")
+        # second: from headers
+        if headers.has_key('charset'):
+            self.html_charset = headers['charset']
         # Since some users are really stupid and use HTML wrong, we need a
         # stack of these values
         self._visibility_stack = []
@@ -1153,8 +1199,8 @@
         # we can only check the charset specified in the attribs after parsing
         # the document for <META> tags.  Seems kind of backward, but that's the
         # HTML spec.
-        if not self._charset and self._attribs.has_key('charset'):
-            self._set_charset(self._attribs['charset'])
+        #if not self._charset and self._attribs.has_key('charset'):
+        #    self._set_charset(self._attribs['charset'])
         self._doc.close ()
 
     def get_plucker_doc (self):
@@ -1300,7 +1346,8 @@
         _add_vspace() to do that explicitly if you want to."""
         if self._visible:
             if self.atable is not None and self.in_cell:
-                self.atable.add_cell_text (text)
+                if type(text)==type(""):
+                    self.atable.add_cell_text (text)
             else:
                 self._doc.add_text (text)
                 self._element_beginning = 0
@@ -1392,9 +1439,8 @@
             self._visible = 1
 
     def _set_charset (self, charset):
-        if charset_name_to_mibenum(charset):
-            self._charset = charset
-            self._doc.set_charset(charset)
+        message(4, "Setting html charset to "+charset)
+        self.html_charset = charset
 
     ################################################################################
     ######## HTML specifics
@@ -1430,9 +1476,10 @@
 
 
     def do_meta (self, data):
-        # if the charset is not already assigned (from the HTTP headers, presumably)
-        # and it's available here, then use it
-        if not self._charset and string.lower(data[0][0]) == 'http-equiv' and 
string.lower(data[0][1]) == 'content-type':
+        # if the charset is specified here, use it
+        # this is against html specs (headers have precedence), but
+        # conforms to common usage and is easier to program :-)
+        if string.lower(data[0][0]) == 'http-equiv' and string.lower(data[0][1]) == 
'content-type':
             from PyPlucker.Retriever import parse_http_header_value
             ctype, parameters = parse_http_header_value(data[1][1])
             for parameter in parameters:
@@ -1446,10 +1493,7 @@
         except ValueError:
             self.unknown_entityref(name)
             return
-        if not 0 <= n <= 255:
-            self.unknown_charref(name)
-            return
-        self.handle_data(chr(n))
+        self.handle_data(unichr(n))
 
 
     def handle_special (self, name):
@@ -1478,7 +1522,8 @@
             data = string.translate (data, _CLEANUP_TRANSTABLE)
             data = string.replace (data, "\t", "  ")
 
-
+        if type(data)==type(""):
+            data = unicode(data, self.html_charset or 'iso8859_1')
         #stripped_data = string.strip(data)
         if data:
             # not just blank or empty text (e.g. from comments), so we
@@ -1522,8 +1567,8 @@
                         style_str = struct.pack (">BB", 0, 0x78)
                     self.atable.add_cell_text(style_str)
                     self.last_table_strike = new_strike
-
-            self._add_text (data)
+            self._add_text(data)
+            message(4, "handling data "+`data`)
 
 
     def start_body (self, attributes):
@@ -1886,7 +1931,8 @@
     def do_p (self, attributes):
         if self._needs_newpara ():
             if self._indent_paragraphs:
-                self._add_text('\xa0\xa0\xa0\xa0\xa0\xa0')
+                #self._add_text('\xa0\xa0\xa0\xa0\xa0\xa0')
+                self._add_text(6*NBSP)
             else:
                 self._add_vspace (2)
 
@@ -2049,7 +2095,7 @@
                 text = ((0x2022, "o"), " ")
                 indent = 7
             elif self._ul_list_depth == 2:
-                text = chr(0xbb) + " "
+                text = unichr(0xbb) + " "
                 indent = 6
             elif self._ul_list_depth == 3:
                 text = "+ "
@@ -2063,15 +2109,15 @@
 
         self._doc.set_style ("")  # make sure we render the 'bullet' marker in normal 
style
         if self.atable is not None and self.in_cell:
-            self._add_text('\xa0\xa0' * table_margin)
+            self._add_text((2*NBSP) * table_margin)
             style_str = struct.pack (">BBBBB", 0, 0x53, 0, 0, 0) # black
             self.atable.add_cell_text(style_str)
 
-        if type(text) == type(""):
+        if type(text) == type("") or type(text) == type(u""):
             self._add_text (text)
         elif type(text) == type(()):
             for element in text:
-                if type(element) == type(""):
+                if type(element) == type("") or type(element) == type(u""):
                     self._add_text(element)
                 elif type(element) == type(()) and len(element) == 2:
                     self._add_unicode_char(element[0], element[1])
@@ -2367,31 +2413,6 @@
             if not self._unhandled_tags.has_key (tag):
                 self._unknown["</%s>"%tag] = 1
 
-    def unknown_charref (self, ref):
-        if self._visible:
-            val = int(ref)
-            if val == 8211:
-                self._add_unicode_char (val, "-")
-            elif val == 8212:
-                self._add_unicode_char (val, "--")
-            elif val == 8216:
-                self._add_unicode_char (val, "`")
-            elif val == 8217:
-                self._add_unicode_char (val, "´")
-            elif val == 8220:
-                self._add_unicode_char (val, "\"")
-            elif val == 8230:
-                        self._add_unicode_char (val, "...")
-            elif val == 8221:
-                self._add_unicode_char (val, "\"")
-            elif val == 8226:
-                # what's this?  Unbreakable space?
-                self._add_unicode_char (val, " ")
-            elif val == 8482:
-                self._add_unicode_char (val, "(tm)")
-            else:
-                self._unknown["charref-%s" % ref] = 1
-                self._add_unicode_char (val, "&#%d;" % val)
 
     def unknown_entityref (self, ref):
         if self._visible:
@@ -2399,14 +2420,11 @@
                 s = htmlentitydefs.entitydefs[ref]
                 if len(s) == 1:
                     val = ord(s)
-                    if (val >= 0xa0 and val < 0x100) or (val >= 0x00 and val < 0xFF):
-                        self.handle_data (s)
-                    else:
-                        self._add_unicode_char(val, "&#%d;" % val)
+                    self.handle_data(unichr(val))
                 else:
                     m = _html_char_ref_pattern.match(s)
                     if m:
-                        self.unknown_charref(m.group(1))
+                        self.handle_data(unichr(int(m.group(1))))
             else:
                 self._unknown["entityref-%s"%ref] = 1
                 self.handle_data('?')
diff -urN plucker-original/viewer/config.h.in plucker-new/viewer/config.h.in
--- plucker-original/viewer/config.h.in 2004-03-10 18:01:26.000000000 +0100
+++ plucker-new/viewer/config.h.in      2004-03-10 18:51:34.000000000 +0100
@@ -116,3 +116,6 @@
 
 /* Define if supporting word lookup */
 #undef SUPPORT_WORD_LOOKUP
+
+/* Define if using unicode mode support */
+#undef UNICODE_MODE
diff -urN plucker-original/viewer/configure.in plucker-new/viewer/configure.in
--- plucker-original/viewer/configure.in        2004-02-28 16:28:21.000000000 +0100
+++ plucker-new/viewer/configure.in     2004-03-08 17:09:20.000000000 +0100
@@ -31,6 +31,7 @@
 DEFAULT_SKINS=no
 DEFAULT_ARMLET=no
 DEFAULT_IMODE=no
+DEFAULT_UNICODE=no
 DEFAULT_CATEGORY=""
 DEFAULT_WAIT_ICON=bubble
 DEFAULT_LANG="en de cs it fr ja fo da zh_CN pl ru es tr th ca no"
@@ -418,6 +419,17 @@
     AC_DEFINE(HAVE_IMODE,, [ Define if using i-mode support])
 fi
 
+AC_MSG_CHECKING(--enable-unicode argument)
+AC_ARG_ENABLE(unicode, [  --enable-unicode          to enable unicode grayfont 
support],
+    UNICODE=yes, UNICODE=$DEFAULT_UNICODE)
+AC_MSG_RESULT($UNICODE)
+
+if test "$UNICODE" != "no"; then
+    AC_DEFINE(UNICODE_MODE,, [ Define if using unicode mode support])
+fi
+
+
+
 AC_ARG_DISABLE(scroll_to_bottom, [  --disable-scroll-to-bottom
                           always scroll even pages instead of stopping when
                           the end of the page is reached (will add some extra
@@ -784,6 +796,11 @@
 else
     echo "  I-mode Support:             disabled"
 fi
+if test "$UNICODE" != "no" ; then
+    echo "  Unicode Support:            enabled"
+else
+    echo "  Unicode Support:            disabled"
+fi
 if test "$AXXPAC" != "no" ; then
     echo "  AxxPac Support:             enabled"
 else
diff -urN plucker-original/viewer/const.h plucker-new/viewer/const.h
--- plucker-original/viewer/const.h     2004-02-28 16:28:21.000000000 +0100
+++ plucker-new/viewer/const.h  2004-03-08 17:09:20.000000000 +0100
@@ -101,3 +101,5 @@
 /* 3B 22 is a single character in JIS and Kuten */
 #define testDoubleByteJISKuten            0x3B22
 
+/* 04 00 is a single character in UTF-8 */
+#define testDoubleByteUTF8            0x0400
diff -urN plucker-original/viewer/grayfont.c plucker-new/viewer/grayfont.c
--- plucker-original/viewer/grayfont.c  2004-03-05 15:48:27.000000000 +0100
+++ plucker-new/viewer/grayfont.c       2004-03-09 20:01:28.000000000 +0100
@@ -26,6 +26,7 @@
 #include "prefsdata.h"
 #include "palmbitmap.h"
 #include "font.h"
+#include "debug.h"
 #define NO_GRAY_FONT_SUBSTITUTION
 #include "grayfont.h"
 
@@ -141,7 +142,6 @@
 
 
 
-
 /***********************************************************************
  *
  *      Private variables
@@ -167,6 +167,11 @@
     0x632c, 0x52aa, 0x4228, 0x3186, 0x2104, 0x1082, 0x0000
 };
 
+Boolean UsingGrayFont() 
+{
+    return currentFontPtr != NULL;
+}
+
 
 
 /* Set a map for colorizing a bitmap */
@@ -519,6 +524,9 @@
         uses8BitChars = ( charEncoding <= charEncodingPalmLatin );
     else
         uses8BitChars = true;
+#ifdef UNICODE_MODE
+    uses8BitChars = false;
+#endif
     err = FtrGet( sysFtrCreator, sysFtrNumWinVersion, &version );
     havePalmHiRes = ( HIGH_DENSITY_FEATURE_SET_VERSION <= version );
     resource.string[ RESOURCE_NAME_IDLETTER ] = RESOURCE_NAME_ID;
@@ -810,7 +818,7 @@
         inOffset = 0;
         while ( inOffset < length ) {
             WChar  ch;
-            inOffset += TxtGlueGetNextChar( chars, inOffset, &ch );
+            inOffset += MyTxtGlueGetNextChar( chars, inOffset, &ch );
             if ( length < inOffset )
                 break;
             width += GetGlyph( ch )->advance;
@@ -910,6 +918,7 @@
     WinDrawOperation  oldOperation = winPaint;
     Boolean           doKern;
 
+
     if ( currentFontPtr == NULL ) {
         if ( invert )
             WinDrawInvertedChars( chars, length, x, y );
@@ -945,7 +954,7 @@
     bitmapTopLeftX = 0;
     bitmapTopLeftY = 0;
 
-    TxtGlueGetNextChar( chars, 0, &ch );
+    MyTxtGlueGetNextChar( chars, 0, &ch );
     firstKern = GetGlyph( ch )->leftKerning;
 
     switch ( resource.string[ RESOURCE_NAME_ORIENTATION ] )
@@ -1039,7 +1048,7 @@
         GrayFontGlyphInfo*     glyph;
         UInt16                 resourceIndex;
 
-        inOffset += TxtGlueGetNextChar( chars, inOffset, &ch );
+        inOffset += MyTxtGlueGetNextChar( chars, inOffset, &ch );
         if ( length < inOffset )
             break;
         glyph = GetGlyph( ch );
@@ -1174,7 +1183,7 @@
         WinDrawChar( ch, x, y );
         return;
     }
-    length = TxtGlueSetNextChar( line, 0, ch );
+    length = MyTxtGlueSetNextChar( line, 0, ch );
     GrayWinDrawChars( line, length, x, y );
 }
 
diff -urN plucker-original/viewer/grayfont.h plucker-new/viewer/grayfont.h
--- plucker-original/viewer/grayfont.h  2004-02-10 03:10:44.000000000 +0100
+++ plucker-new/viewer/grayfont.h       2004-03-09 20:00:23.000000000 +0100
@@ -29,6 +29,7 @@
 #include "config.h"
 #include "viewer.h"
 #include "hires.h"
+#include "unicode.h"
 
 #define GRAY_FONT_LEFT   'L'
 #define GRAY_FONT_RIGHT  'R'
@@ -45,6 +46,8 @@
 /* Stop them and clear memory */
 void GrayFntStop( void ) GRAYFONT_SECTION;
 
+Boolean UsingGrayFont() GRAYFONT_SECTION;
+
 Err GrayFntDefineFont ( FontID font, void*  fontP ) GRAYFONT_SECTION;
 
 FontID GrayFntGetFont( void ) GRAYFONT_SECTION;
diff -urN plucker-original/viewer/Makefile.in plucker-new/viewer/Makefile.in
--- plucker-original/viewer/Makefile.in 2004-02-28 22:09:01.000000000 +0100
+++ plucker-new/viewer/Makefile.in      2004-03-08 17:09:20.000000000 +0100
@@ -93,7 +93,7 @@
                     detailsform.c searchform.c categoryform.c fontform.c \
                     bookmark.c session.c document.c image.c history.c \
                     search8.c search.c prefsdata.c anchor.c \
-                    paragraph.c uncompress.c keyboard.c keyboardform.c \
+                    paragraph.c unicode.c uncompress.c keyboard.c keyboardform.c \
                     list.c link.c renamedocform.c hardcopyform.c font.c \
                     table.c fullscreenform.c @OS_EXTRA_SRC@
 
diff -urN plucker-original/viewer/os.c plucker-new/viewer/os.c
--- plucker-original/viewer/os.c        2004-01-04 13:02:09.000000000 +0100
+++ plucker-new/viewer/os.c     2004-03-08 17:09:20.000000000 +0100
@@ -38,6 +38,7 @@
 #include "image.h"
 #include "axxpacimp.h"
 #include "skins.h"
+#include "unicode.h"
 
 #include "os.h"
 
@@ -161,7 +162,7 @@
     MemSet( s, MAX_CHARACTER_LENGTH, 0 );
     s[ 0 ] = word >> 8;
     s[ 1 ] = word & 0xFF;
-    return 1 < TxtGlueGetNextChar( s, 0, NULL );
+    return 1 < MyTxtGlueGetNextChar( s, 0, NULL );
 }
 
 
@@ -371,16 +372,22 @@
     if ( charEncoding != charEncodingPalmLatin )
         return 0;
 
+
     entries = sizeof(Latin1Mapping)/sizeof(CharMapping);
 
     for ( i = 0 ;  i < entries;  i++ ) {
         if ( Latin1Mapping [ i ].unicodeValue == 0 )
             return 0;
-        else if ( charValue < Latin1Mapping [ i ].unicodeValue )
+/*        else if ( charValue < Latin1Mapping [ i ].unicodeValue )
             return 0;
+*/
         else if ( Latin1Mapping [ i ].unicodeValue == charValue )
             return Latin1Mapping[ i ].palmCharValue;
     }
+
+    if (charValue <= 255)
+        return charValue;
+
     return 0;
 }
 
@@ -432,6 +439,7 @@
 #endif
     if ( IsDoubleByteSingleChar( testDoubleByteBig5GB2312EUCJPKR ) ||
          IsDoubleByteSingleChar( testDoubleByteShiftJIS ) ||
+         IsDoubleByteSingleChar( testDoubleByteUTF8 ) ||
          IsDoubleByteSingleChar( testDoubleByteJISKuten ) ) {
         uses8BitChars              = false;
     }
diff -urN plucker-original/viewer/paragraph.c plucker-new/viewer/paragraph.c
--- plucker-original/viewer/paragraph.c 2004-02-20 17:19:19.000000000 +0100
+++ plucker-new/viewer/paragraph.c      2004-03-10 20:00:13.000000000 +0100
@@ -340,7 +340,7 @@
 static Int16 littleSpace;   /* Extra pixels in each */
 
 /* A one-character pushback for character tokens */
-static Char  pushedChar     = 0;
+static WChar  pushedChar     = 0;
 
 /* Used to see if the current font is the fixed with font */
 static Boolean fixedWidthFont = false;
@@ -396,8 +396,7 @@
            tapped position */
         while ( offset < len ) {
             WChar ch;
-
-            offset   += TxtGlueGetNextChar( chars, offset, &ch );
+            offset   += MyTxtGlueGetNextChar( chars, offset, &ch );
             charWidth = TxtGlueCharWidth( ch );
             if ( CharIsSpace( ch ) ) {
                 x += charWidth;
@@ -517,7 +516,9 @@
                                 selectedWordBounds[ i ].extent.y )
             bottomY = selectedWordBounds[ i ].topLeft.y +
                           selectedWordBounds[ i ].extent.y;
-        stringSize += TxtGlueSetNextChar( selectedWord, stringSize, ch );
+        UseLegacyEncoding(!UsingGrayFont());
+        stringSize += MyTxtGlueSetNextChar( selectedWord, stringSize, ch );
+
     }
     selectedWord[ stringSize ] = '\0';
     if ( bounds != NULL ) {
@@ -1118,6 +1119,8 @@
     }
     else {
         if ( tContext->writeMode == WRITEMODE_COPY_CHAR || ! goodTable ) {
+            UseLegacyEncoding(!UsingGrayFont());
+
             DrawText( name, length, tContext );
             *width = FntCharsWidth( name, length );
         }
@@ -1555,7 +1558,6 @@
     UInt8*  functionArgs;
     UInt32  charValue;
     UInt8   charsToSkip;
-    UInt16  palmChar;
 #ifdef HAVE_IMODE
     DmOpenRef  plkrImodeDB;
 #endif
@@ -1596,8 +1598,7 @@
     }
 #endif
 
-    palmChar = FindPalmCharForUnicodeChar( charValue );
-    if ( 0 < palmChar && PutNextToken( palmChar ) ) {
+    if ( PutNextToken( charValue ) ) {
         pContext->position += charsToSkip;
     }
     return UNICODE;
@@ -1694,15 +1695,15 @@
     Int16   offset;
 
     if ( pushedChar != 0 ) {
-        *nextToken = ( UInt8 )pushedChar;
+        *nextToken = ( WChar )pushedChar;
         pushedChar = 0;
         return TOKEN_CHARACTER;
     }
 
     if ( pContext->last <= pContext->position )
         return TOKEN_PARAGRAPH_END;
-
-    pContext->position += TxtGlueGetNextChar( pContext->position, 0,
+    UseLegacyEncoding(1);
+    pContext->position += MyTxtGlueGetNextChar( pContext->position, 0,
                             &nextChar );
 
     if ( nextChar != '\0' ) {
@@ -1726,11 +1727,11 @@
 {
     if ( pushedChar != 0 )
         return false;
-
+/*
     if ( 256 <= nextToken )
         return false;
-
-    pushedChar = (Char) nextToken;
+*/
+    pushedChar = (WChar) nextToken;
 
     return true;
 }
@@ -2227,8 +2228,9 @@
         Char*     prevPosition;
 
         prevPosition  = pContext->position;
-        nextTokenType = GetNextToken( pContext, &nextChar );
-
+        
+        nextTokenType = GetNextToken( pContext, &nextChar ); 
+        
         if ( nextTokenType == TOKEN_PARAGRAPH_END ) {
             break;
         }
@@ -2248,6 +2250,7 @@
             }
             continue;
         }
+
         addMarginToCurrent = false;
 
         if ( skipLeadingSpace && CharIsSpace( nextChar ) && ! fixedWidthFont ) {
@@ -2287,7 +2290,10 @@
                 tContext->cursorX += FntCharsWidth( chars, len );
                 len = 0;
             }
-            len          += TxtGlueSetNextChar( chars, len, nextChar );
+            UseLegacyEncoding(!UsingGrayFont());
+            len          += MyTxtGlueSetNextChar( chars, len, nextChar );
+
         }
 
         if ( pContext->type == ALIGNMENT_JUSTIFY && nextChar == ' ' ) {
@@ -2318,6 +2324,8 @@
 
     if ( 0 < len ) {
         DrawText( chars, len, tContext );
         tContext->cursorX += FntCharsWidth( chars, len );
     }
     if ( invertPattern && tContext->writeMode == WRITEMODE_DRAW_CHAR )
@@ -2610,8 +2618,10 @@
         yPos += currentHeight / 2;
     else if ( GetCurrentStyle() == SUPSTYLE )
         yPos += currentHeight / 2 - GetPrevFontHeight();
-
+    UseLegacyEncoding(0);
     RotDrawChars(chars, len, tContext->cursorX, (Coord)yPos);
+
 }
 
 
@@ -2623,6 +2633,7 @@
     const TextContext* tContext
     )
 {
+    UseLegacyEncoding(0);
     RotDrawInvertedChars( chars, len, tContext->cursorX,
         tContext->cursorY - FntCharHeight() );
 }
diff -urN plucker-original/viewer/paragraph.h plucker-new/viewer/paragraph.h
--- plucker-original/viewer/paragraph.h 2004-02-01 12:26:33.000000000 +0100
+++ plucker-new/viewer/paragraph.h      2004-03-08 17:09:20.000000000 +0100
@@ -26,6 +26,7 @@
 #include "viewer.h"
 #include "document.h"
 #include "util.h"
+#include "unicode.h"
 
 /*
     A paragraph as it appears in the input data stream. The height of the
diff -urN plucker-original/viewer/rotate.c plucker-new/viewer/rotate.c
--- plucker-original/viewer/rotate.c    2004-01-04 01:21:36.000000000 +0100
+++ plucker-new/viewer/rotate.c 2004-03-08 17:09:20.000000000 +0100
@@ -472,7 +472,7 @@
     while ( 0 < length ) {
         Boolean missing;
 
-        charWidth  = TxtGlueGetNextChar( string, 0, &ch );
+        charWidth  = MyTxtGlueGetNextChar( string, 0, &ch );
         string    += charWidth;
         length    -= charWidth;
 
diff -urN plucker-original/viewer/rotate.h plucker-new/viewer/rotate.h
--- plucker-original/viewer/rotate.h    2003-08-11 04:31:57.000000000 +0200
+++ plucker-new/viewer/rotate.h 2004-03-08 17:09:20.000000000 +0100
@@ -30,6 +30,7 @@
 #include "jogdial.h"
 #endif
 #include "grayfont.h"
+#include "unicode.h"
 
 #ifdef HAVE_ROTATE
 
diff -urN plucker-original/viewer/unicode.c plucker-new/viewer/unicode.c
--- plucker-original/viewer/unicode.c   1970-01-01 01:00:00.000000000 +0100
+++ plucker-new/viewer/unicode.c        2004-03-10 19:17:23.000000000 +0100
@@ -0,0 +1,136 @@
+#include "unicode.h"
+#include <TxtGlue.h>
+
+#ifdef UNICODE_MODE
+static Boolean usingLegacyEncoding = 1;
+/* Store x in the file-scope flag that makes the Utf8TxtGlue* wrappers defer to the system TxtGlue* calls. */
+void UseLegacyEncoding(Boolean x)
+{
+    usingLegacyEncoding = x;
+}
+/* Return the flag last stored by UseLegacyEncoding(); the flag is initialised to 1. */
+Boolean GetLegacyEncoding()
+{
+    return usingLegacyEncoding;
+}
+/* Write inChar into ioText at byte offset inOffset and return the number of bytes written. */
+/* In legacy mode this defers to TxtGlueSetNextChar(); otherwise it emits 1-3 UTF-8 bytes (4 with UCS4). */
+UInt16 Utf8TxtGlueSetNextChar(Char * ioText, UInt32 inOffset, WChar inChar)
+{
+/* No buffer-size check is made here: the caller must leave room for the bytes written. */
+    UInt8 count;               /* number of bytes the UTF-8 representation takes */
+    UInt8 *pos;
+
+    UInt32 chr;
+
+    if (usingLegacyEncoding)
+        return TxtGlueSetNextChar( ioText, inOffset, inChar );
+
+    pos = (UInt8 *)ioText + inOffset;
+    chr = inChar;
+
+/* Choose the encoded length from the magnitude of the code point. */
+    if (chr < 0x80)
+       count = 1;
+    else if (chr < 0x800) {
+       count = 2;
+}
+    else if (chr < 0x10000)
+       count = 3;
+#ifdef UCS4 /* reserved for eventual UCS4 support */
+    else if (chr < 0x110000)
+       count = 4;
+#endif /* UCS4 */
+    else {
+/* invalid character: store a single '?' byte instead */
+       ioText[inOffset] = (UInt8) '?';
+       return 1;
+    }
+/* Bytes are filled last-to-first; the bits ORed into chr in each case end up as the lead-byte prefix. */
+    switch (count) {           /* note: code falls through cases! */
+#ifdef UCS4
+    case 4:
+       pos[3] = 0x80 | (chr & 0x3f);
+       chr = chr >> 6;
+       chr |= 0x10000; /* fallthrough */
+#endif                         /* UCS4 */
+    case 3:
+       pos[2] = 0x80 | (chr & 0x3f);
+       chr = chr >> 6;
+       chr |= 0x800; /* fallthrough */
+    case 2:
+       pos[1] = 0x80 | (chr & 0x3f);
+       chr = chr >> 6;
+       chr |= 0xc0; /* fallthrough */
+    case 1:
+       pos[0] = (UInt8) chr;
+    }
+    return count;
+}
+/* Decode the character starting at inText + inOffset into *outChar (skipped when NULL); return bytes consumed. */
+/* In legacy mode this defers to TxtGlueGetNextChar(); malformed UTF-8 yields 0xfffd and consumes one byte. */
+UInt16 Utf8TxtGlueGetNextChar(const Char * inText,
+                             UInt32 inOffset, WChar * outChar)
+{
+/* No length is passed in: continuation bytes are read straight from the buffer. */
+    UInt8 *s;
+    UInt8 c;
+
+    if (usingLegacyEncoding)
+        return TxtGlueGetNextChar( inText, inOffset, outChar );
+
+    s = (UInt8 *) inText + inOffset;   /* source start */
+
+    c = s[0];
+
+/* A byte below 0x80 is the character itself. */
+    if (c<0x80) {
+        if (outChar)
+           *outChar = ((UInt16) c);
+        return 1;
+    }
+
+/* Lead bytes 0x80..0xc1 are not accepted and drop to the invalid case at the end. */
+    if (c >= 0xc2) {
+       if (c < 0xe0) { /* two-byte sequence */
+           if ((s[1] ^ 0x80) < 0x40) { /* true only for s[1] in 0x80..0xbf */
+               if (outChar)
+                   *outChar = ((UInt16) (c & 0x1f) << 6)
+                       | (UInt16) (s[1] ^ 0x80);
+               return 2;
+           }
+       } else if (c < 0xf0) { /* three-byte sequence */
+           if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
+               && (c >= 0xe1 || s[1] >= 0xa0)) { /* rejects 0xe0 forms whose value would be below 0x800 */
+               if (outChar)
+                   *outChar = ((UInt16) (c & 0x0f) << 12)
+                       | ((UInt16) (s[1] ^ 0x80) << 6)
+                       | ((UInt16) (s[2] ^ 0x80));
+               return 3;
+           }
+       }
+#ifdef UCS4
+       else if (c < 0xf8) { /* four-byte sequence */
+           if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
+               && (s[3] ^ 0x80) < 0x40 && (c >= 0xf1 || s[1] >= 0x90)
+               && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90)) /* caps the result at 0x10ffff */
+               ) {
+               if (outChar)
+                   *outChar = ((unsigned int) (c & 0x07) << 18)
+                       | ((unsigned int) (s[1] ^ 0x80) << 12)
+                       | ((unsigned int) (s[2] ^ 0x80) << 6)
+                       | (unsigned int) (s[3] ^ 0x80);
+               return 4;
+           }
+       }
+#endif /* UCS4 */
+    }
+    /* invalid multibyte character: report 0xfffd and consume one byte */
+    if (outChar)
+       *outChar = 0xfffd;
+    return 1;
+
+
+}
+
+#endif /* UNICODE_MODE */
diff -urN plucker-original/viewer/unicode.h plucker-new/viewer/unicode.h
--- plucker-original/viewer/unicode.h   1970-01-01 01:00:00.000000000 +0100
+++ plucker-new/viewer/unicode.h        2004-03-08 21:29:11.000000000 +0100
@@ -0,0 +1,32 @@
+#ifndef UNICODE_H
+#define UNICODE_H
+/* Chooses between the UTF-8 aware text helpers and the system TxtGlue calls via the My* names. */
+#include "config.h"
+#include "viewer.h"
+
+//#define UNICODE_MODE
+//#undef UNICODE_MODE
+/* NOTE(review): UNICODE_MODE is presumably supplied by the build or config.h - confirm. */
+#ifndef UNICODE_SECTION
+# define UNICODE_SECTION
+#endif
+/* UNICODE_SECTION defaults to empty when nothing else has defined it. */
+
+#ifdef UNICODE_MODE
+/* UTF-8 aware implementations; the My* names route callers to them. */
+UInt16 Utf8TxtGlueSetNextChar( Char* ioText, UInt32 inOffset, WChar inChar ) UNICODE_SECTION;
+#define MyTxtGlueSetNextChar Utf8TxtGlueSetNextChar
+UInt16 Utf8TxtGlueGetNextChar( const Char* inText, UInt32 inOffset, WChar* outChar ) UNICODE_SECTION;
+#define MyTxtGlueGetNextChar Utf8TxtGlueGetNextChar
+void UseLegacyEncoding(Boolean x) UNICODE_SECTION;
+Boolean GetLegacyEncoding() UNICODE_SECTION;
+
+#else /* UNICODE_MODE not defined: the My* names map straight to the system calls */
+#define MyTxtGlueSetNextChar TxtGlueSetNextChar
+#define MyTxtGlueGetNextChar TxtGlueGetNextChar
+#define UseLegacyEncoding(x) /* expands to nothing */
+#define GetLegacyEncoding() /* NOTE(review): expands to nothing, so it cannot be used as a value in this configuration */
+
+#endif /* UNICODE_MODE */
+
+#endif /* UNICODE_H */

Re: libraryform enhancements

Reply via email to