New version of the plucker unicode patch. This one is combined: it contains patches for both the python parser and the viewer. Highlights:
parser: requires python2.0. The --charset option is gone, superseded by --input-charset and --output-charset. You can select anything as --output-charset; characters from --input-charset that cannot be represented in --output-charset are included as unicode values - this is why the default --output-charset is ascii, rather than palmos. Conversion is done by looping through all the characters, which is kind of inefficient, but works with python2.0 (it can be made considerably faster with python2.3). The special value "unicode" for the output charset is gone too (using "unicode" will use the python built-in unicode encoding (=UCS-2), and the text will be unreadable). Use "ascii" (the default) instead. The --input-charset for text files is guessed from the current locale, if not explicitly given. The input charset for html files (pages) is taken either from the headers or from META tags. If it is present in neither the headers nor the META tags, --input-charset is used. viewer: if using documents in a legacy 8-bit encoding (e.g. an older plucked document), the viewer falls back to using the system TxtGlue* functions, even with gray fonts, so you can read your old cyrillic documents as before (if you have an encoding other than a palmos-compatible one, you need your old gray fonts too - no surprise here). If documents contain unicode values, you need proper unicode gray fonts (i.e. not your old ones with KOI8-R characters pretending to be on Latin1 positions). No surprise here either, since unicode values were not displayed properly with the previous plucker version at all. Documents containing unicode values not present in the gray fonts will result in mojibake. The same happens when using palm fonts to read them (unless they fall into the ISO-8859-1 range). -- ----------------------------------------------------------- | Radovan Garabík http://melkor.dnp.fmph.uniba.sk/~garabik/ | | __..--^^^--..__ garabik @ melkor.dnp.fmph.uniba.sk | ----------------------------------------------------------- Antivirus alert: file .signature infected by signature virus. Hi! 
I'm a signature virus! Copy me into your signature file to help me spread!
diff -urN plucker-original/configure.in plucker-new/configure.in --- plucker-original/configure.in 2004-03-07 13:00:33.000000000 +0100 +++ plucker-new/configure.in 2004-03-08 17:09:20.000000000 +0100 @@ -320,6 +320,7 @@ to get the function names included in POSE's profiling output ]) AC_ARG_ENABLE(imode, [ --enable-imode to enable i-mode support (also requires the imodeicons.pdb database)]) +AC_ARG_ENABLE(unicode, [ --enable-unicode to enable unicode support]) AC_ARG_ENABLE(scroll_to_bottom, [ --disable-scroll-to-bottom always scroll even pages instead of stopping when the end of the page is reached (will add some extra diff -urN plucker-original/parser/python/PyPlucker/Spider.py plucker-new/parser/python/PyPlucker/Spider.py --- plucker-original/parser/python/PyPlucker/Spider.py 2004-02-02 03:31:58.000000000 +0100 +++ plucker-new/parser/python/PyPlucker/Spider.py 2004-03-10 19:52:53.000000000 +0100 @@ -1276,8 +1276,10 @@ message(0, " Set or clear the backup bit in the output file.") message(0, " --beamable, --not-beamable:") message(0, " Set or clear the beamable bit in the output file.") - message(0, " --charset=<name>:") - message(0, " Set the default charset to that specified by <name>.") + message(0, " --output-charset=<name>:") + message(0, " Set the output charset of generated document to that specified by <name>.") + message(0, " --input-charset=<name>:") + message(0, " Assume input charset to that specified by <name>.") message(0, " --owner-id=<name>:") message(0, " Set owner-id of the output document to <name>.") message(0, " --url-pattern=<regexp-pattern>:") @@ -1350,7 +1352,8 @@ backup = None copy_protect = None iconfile = None - default_charset = None + output_charset = None + input_charset = None owner_id = None url_pattern = None referrer = None @@ -1376,7 +1379,7 @@ "maxheight=", "maxwidth=", "alt-maxheight=", "alt-maxwidth=", "compression=", "home-url=", "update-cache", "launchable", "not-launchable", "backup", "no-backup", "beamable", 
"not-beamable", - "icon=", "charset=", "owner-id=", "url-pattern=", "referrer=", + "icon=", "output-charset=", "input-charset=", "owner-id=", "url-pattern=", "referrer=", "user-agent=", "title=", "author=", "status-file=", "version", "tables", "depth-first", "http-proxy=", "http-proxy-user=", "http-proxy-pass=", "fragments=", "creator-id="]) @@ -1494,8 +1497,10 @@ copy_protect = 1 elif opt == "--icon": iconfile = arg - elif opt == "--charset": - default_charset = arg + elif opt == "--output-charset": + output_charset = arg + elif opt == "--input-charset": + input_charset = arg elif opt == "--owner-id": owner_id = arg elif opt == "--referrer": @@ -1602,21 +1607,19 @@ if zlib_compression == 'false': message('Specification of an owner-id forces use of zlib compression...') zlib_compression = 'true' - - mibenum = None - # if not specified on command line, look in .pluckerrc - if default_charset is None: - default_charset = config.get_string("default_charset") - # if we have one, validate it - if default_charset is not None: - from PyPlucker.helper.CharsetMapping import charset_name_to_mibenum, charset_known_names - import string, re - mibenum = charset_name_to_mibenum(default_charset) - if mibenum: - config.set('default_charset', mibenum) - else: - usage ("Error: Unsupported charset '" + default_charset + "' specified as default charset.\n" - " Charset must be either a decimal MIBenum value, or one of " + str(charset_known_names())) + + if output_charset is None: + output_charset = config.get_string("output_charset") + if output_charset is None: + output_charset = 'palmos' + config.set ('output_charset', output_charset) + + if input_charset is None: + input_charset = config.get_string("input_charset") + if output_charset is None: + input_charset = 'utf-8' + config.set ('input_charset', input_charset) + # update the config with the user options if use_file is not None: @@ -1696,8 +1699,6 @@ config.set ('author_md', author) if title is not None: config.set ('title_md', 
title) - if mibenum is not None: - config.set ('default_charset', mibenum) if statusfile is not None: config.set ('status_file', statusfile) if depthfirst is not None: diff -urN plucker-original/parser/python/PyPlucker/TextParser.py plucker-new/parser/python/PyPlucker/TextParser.py --- plucker-original/parser/python/PyPlucker/TextParser.py 2004-02-27 23:51:08.000000000 +0100 +++ plucker-new/parser/python/PyPlucker/TextParser.py 2004-03-08 23:45:38.000000000 +0100 @@ -31,6 +31,8 @@ ## Now PyPlucker things should generally be importable ## +NBSP = u'\u00a0' # non-breaking space + import string import re try: @@ -336,6 +338,32 @@ _entitycharref = re.compile('^(.*)&([#a-zA-Z][-.a-zA-Z0-9]*);(.*)$') _html_char_ref_pattern = re.compile('^&#([0-9]+);$') +# this needs to be rewritten +def text_alternative (uchar): + "get text alternative to unicode character uchar" + val = ord(uchar) + if val == 8211: + return "-" + elif val == 8212: + return "--" + elif val == 8216: + return "`" + elif val == 8217: + return "'" + elif val == 8220: + return "\"" + elif val == 8230: + return "..." + elif val == 8221: + return "\"" + elif val == 8226: + return "o" + elif val == 8482: + return "(tm)" + else: + return "&#%d;" % val + + # These junk "alt" attribute values are not worth showing. junk_alt_attributes = ("img", "[img]", "spacer", "") @@ -374,8 +402,6 @@ return text - - class AttributeStack: """A data structure to maintain information about the current text attributes. 
@@ -525,12 +551,11 @@ return self._tags[self._stack[-1]] - - class TextDocBuilder: """Encapsulate the knowledge of when to change styles, add paragraphs, etc.""" def __init__ (self, url, config, **keyword_args): + message(2,"initializing textdocbuilder") self._doc = PluckerDocs.PluckerTextDocument (url) self._config = config self._attributes = AttributeStack () @@ -582,19 +607,12 @@ # see if we can supply a default charset url = self._doc.get_url() if self._config: - userspec = self._config.get_int('default_charset', 0) + userspec = self._config.get_int('output_charset_mibenum', 0) else: userspec = None locale_default = charset_name_to_mibenum(DEFAULT_LOCALE_CHARSET_ENCODING) - # the userspec will take precedence - if userspec: + if userspec is not None: self._doc.set_charset(userspec) - # OK, so we have no idea. Use the HTTP default of ISO-8859-1 (4) for - # http: URLs, and the environment default (if any) for others - elif (string.lower(url[:5]) == 'http:' or string.lower(url[:6]) == 'https:'): - self._doc.set_charset(4) - elif locale_default: - self._doc.set_charset(locale_default) def add_name (self, name): """Give name to the current paragraph""" @@ -875,7 +893,28 @@ def add_text (self, text): - """Add some text, maybe even many lines.""" + """Add some text, maybe even many lines. + Text can be either a string or a unicode string. 
+ """ + + def add_unicode_text(paragraph, text): + if type(text)==type(""): # non-unicode string, shortcut + message(4, "Adding 8-bit text") + paragraph.add_text(text) + elif type(text)==type(u""): + message(4, "Adding Unicode text") + for c in text: + if ord(c)<128: + paragraph.add_text(str(c)) + else: + try: + outc = c.encode(self._config.get_string("output_charset")) + paragraph.add_text(outc) + except UnicodeError: + paragraph.add_unicode_char(ord(c), text_alternative(c)) + else: + raise "Unexpected text type" + lines = string.split (text, "\n") for i in range (len (lines)): line = lines[i] @@ -891,7 +930,7 @@ if rest_size < 0: rest_size = 0 (first, rest) = self._find_text_split (line, rest_size) - self._paragraph.add_text (first) + add_unicode_text(self._paragraph, first) self._approximate_size = self._approximate_size + len (first) self._is_new_paragraph = 0 self._is_new_line = 0 @@ -901,7 +940,7 @@ break if line: - self._paragraph.add_text (line) + add_unicode_text(self._paragraph, line) self._approximate_size = self._approximate_size + len (line) self._is_new_paragraph = 0 self._is_new_line = 0 @@ -963,12 +1002,17 @@ def __init__ (self, url, text, headers, config, attribs): text = _clean_newlines (text) + textcharset = config.get_string("input_charset") # This we use to build the document self._doc = TextDocBuilder (url, config) if headers.has_key("charset"): - self._doc.set_charset (headers["charset"]) + textcharset = headers["charset"] elif attribs.has_key("charset"): - self._doc.set_charset (attribs["charset"]) + textcharset = attribs["charset"] + if not textcharset: # we have no idea, so we use locale + textcharset = DEFAULT_LOCALE_CHARSET_ENCODING + text = unicode(text, textcharset) + message(4, "PlainTextParser: converting into unicode from "+textcharset) self._url = url self._text = text # In these two lists we store tuples of (url, attributes) for encountered anchors @@ -1060,9 +1104,11 @@ # javascript:document.write("<div>") turns it back on, 
because # it only recognizes the div, not the javascript. self._visible = 1 - self._charset = headers.has_key('charset') and charset_name_to_mibenum(headers['charset']) - if self._charset: - self._doc.set_charset(headers['charset']) + # charset (python name of it) of current document - first: default + self.html_charset = config.get_string("input_charset") + # second: from headers + if headers.has_key('charset'): + self.html_charset = headers['charset'] # Since some users are really stupid and use HTML wrong, we need a # stack of these values self._visibility_stack = [] @@ -1153,8 +1199,8 @@ # we can only check the charset specified in the attribs after parsing # the document for <META> tags. Seems kind of backward, but that's the # HTML spec. - if not self._charset and self._attribs.has_key('charset'): - self._set_charset(self._attribs['charset']) + #if not self._charset and self._attribs.has_key('charset'): + # self._set_charset(self._attribs['charset']) self._doc.close () def get_plucker_doc (self): @@ -1300,7 +1346,8 @@ _add_vspace() to do that explicitly if you want to.""" if self._visible: if self.atable is not None and self.in_cell: - self.atable.add_cell_text (text) + if type(text)==type(""): + self.atable.add_cell_text (text) else: self._doc.add_text (text) self._element_beginning = 0 @@ -1392,9 +1439,8 @@ self._visible = 1 def _set_charset (self, charset): - if charset_name_to_mibenum(charset): - self._charset = charset - self._doc.set_charset(charset) + message(4, "Setting html charset to "+charset) + self.html_charset = charset ################################################################################ ######## HTML specifics @@ -1430,9 +1476,10 @@ def do_meta (self, data): - # if the charset is not already assigned (from the HTTP headers, presumably) - # and it's available here, then use it - if not self._charset and string.lower(data[0][0]) == 'http-equiv' and string.lower(data[0][1]) == 'content-type': + # if the charset is specified here, use 
it + # this is against html specs (headers have precedence), but + # conforms to common usage and is easier to program :-) + if string.lower(data[0][0]) == 'http-equiv' and string.lower(data[0][1]) == 'content-type': from PyPlucker.Retriever import parse_http_header_value ctype, parameters = parse_http_header_value(data[1][1]) for parameter in parameters: @@ -1446,10 +1493,7 @@ except ValueError: self.unknown_entityref(name) return - if not 0 <= n <= 255: - self.unknown_charref(name) - return - self.handle_data(chr(n)) + self.handle_data(unichr(n)) def handle_special (self, name): @@ -1478,7 +1522,8 @@ data = string.translate (data, _CLEANUP_TRANSTABLE) data = string.replace (data, "\t", " ") - + if type(data)==type(""): + data = unicode(data, self.html_charset or 'iso8859_1') #stripped_data = string.strip(data) if data: # not just blank or empty text (e.g. from comments), so we @@ -1522,8 +1567,8 @@ style_str = struct.pack (">BB", 0, 0x78) self.atable.add_cell_text(style_str) self.last_table_strike = new_strike - - self._add_text (data) + self._add_text(data) + message(4, "handling data "+`data`) def start_body (self, attributes): @@ -1886,7 +1931,8 @@ def do_p (self, attributes): if self._needs_newpara (): if self._indent_paragraphs: - self._add_text('\xa0\xa0\xa0\xa0\xa0\xa0') + #self._add_text('\xa0\xa0\xa0\xa0\xa0\xa0') + self._add_text(6*NBSP) else: self._add_vspace (2) @@ -2049,7 +2095,7 @@ text = ((0x2022, "o"), " ") indent = 7 elif self._ul_list_depth == 2: - text = chr(0xbb) + " " + text = unichr(0xbb) + " " indent = 6 elif self._ul_list_depth == 3: text = "+ " @@ -2063,15 +2109,15 @@ self._doc.set_style ("") # make sure we render the 'bullet' marker in normal style if self.atable is not None and self.in_cell: - self._add_text('\xa0\xa0' * table_margin) + self._add_text((2*NBSP) * table_margin) style_str = struct.pack (">BBBBB", 0, 0x53, 0, 0, 0) # black self.atable.add_cell_text(style_str) - if type(text) == type(""): + if type(text) == type("") or 
type(text) == type(u""): self._add_text (text) elif type(text) == type(()): for element in text: - if type(element) == type(""): + if type(element) == type("") or type(element) == type(u""): self._add_text(element) elif type(element) == type(()) and len(element) == 2: self._add_unicode_char(element[0], element[1]) @@ -2367,31 +2413,6 @@ if not self._unhandled_tags.has_key (tag): self._unknown["</%s>"%tag] = 1 - def unknown_charref (self, ref): - if self._visible: - val = int(ref) - if val == 8211: - self._add_unicode_char (val, "-") - elif val == 8212: - self._add_unicode_char (val, "--") - elif val == 8216: - self._add_unicode_char (val, "`") - elif val == 8217: - self._add_unicode_char (val, "´") - elif val == 8220: - self._add_unicode_char (val, "\"") - elif val == 8230: - self._add_unicode_char (val, "...") - elif val == 8221: - self._add_unicode_char (val, "\"") - elif val == 8226: - # what's this? Unbreakable space? - self._add_unicode_char (val, " ") - elif val == 8482: - self._add_unicode_char (val, "(tm)") - else: - self._unknown["charref-%s" % ref] = 1 - self._add_unicode_char (val, "&#%d;" % val) def unknown_entityref (self, ref): if self._visible: @@ -2399,14 +2420,11 @@ s = htmlentitydefs.entitydefs[ref] if len(s) == 1: val = ord(s) - if (val >= 0xa0 and val < 0x100) or (val >= 0x00 and val < 0xFF): - self.handle_data (s) - else: - self._add_unicode_char(val, "&#%d;" % val) + self.handle_data(unichr(val)) else: m = _html_char_ref_pattern.match(s) if m: - self.unknown_charref(m.group(1)) + self.handle_data(unichr(int(m.group(1)))) else: self._unknown["entityref-%s"%ref] = 1 self.handle_data('?') diff -urN plucker-original/viewer/config.h.in plucker-new/viewer/config.h.in --- plucker-original/viewer/config.h.in 2004-03-10 18:01:26.000000000 +0100 +++ plucker-new/viewer/config.h.in 2004-03-10 18:51:34.000000000 +0100 @@ -116,3 +116,6 @@ /* Define if supporting word lookup */ #undef SUPPORT_WORD_LOOKUP + +/* Define if using unicode mode support */ +#undef 
UNICODE_MODE diff -urN plucker-original/viewer/configure.in plucker-new/viewer/configure.in --- plucker-original/viewer/configure.in 2004-02-28 16:28:21.000000000 +0100 +++ plucker-new/viewer/configure.in 2004-03-08 17:09:20.000000000 +0100 @@ -31,6 +31,7 @@ DEFAULT_SKINS=no DEFAULT_ARMLET=no DEFAULT_IMODE=no +DEFAULT_UNICODE=no DEFAULT_CATEGORY="" DEFAULT_WAIT_ICON=bubble DEFAULT_LANG="en de cs it fr ja fo da zh_CN pl ru es tr th ca no" @@ -418,6 +419,17 @@ AC_DEFINE(HAVE_IMODE,, [ Define if using i-mode support]) fi +AC_MSG_CHECKING(--enable-unicode argument) +AC_ARG_ENABLE(unicode, [ --enable-unicode to enable unicode grayfont support], + UNICODE=yes, UNICODE=$DEFAULT_UNICODE) +AC_MSG_RESULT($UNICODE) + +if test "$UNICODE" != "no"; then + AC_DEFINE(UNICODE_MODE,, [ Define if using unicode mode support]) +fi + + + AC_ARG_DISABLE(scroll_to_bottom, [ --disable-scroll-to-bottom always scroll even pages instead of stopping when the end of the page is reached (will add some extra @@ -784,6 +796,11 @@ else echo " I-mode Support: disabled" fi +if test "$UNICODE" != "no" ; then + echo " Unicode Support: enabled" +else + echo " Unicode Support: disabled" +fi if test "$AXXPAC" != "no" ; then echo " AxxPac Support: enabled" else diff -urN plucker-original/viewer/const.h plucker-new/viewer/const.h --- plucker-original/viewer/const.h 2004-02-28 16:28:21.000000000 +0100 +++ plucker-new/viewer/const.h 2004-03-08 17:09:20.000000000 +0100 @@ -101,3 +101,5 @@ /* 3B 22 is a single character in JIS and Kuten */ #define testDoubleByteJISKuten 0x3B22 +/* 04 00 is a single character in UTF-8 */ +#define testDoubleByteUTF8 0x0400 diff -urN plucker-original/viewer/grayfont.c plucker-new/viewer/grayfont.c --- plucker-original/viewer/grayfont.c 2004-03-05 15:48:27.000000000 +0100 +++ plucker-new/viewer/grayfont.c 2004-03-09 20:01:28.000000000 +0100 @@ -26,6 +26,7 @@ #include "prefsdata.h" #include "palmbitmap.h" #include "font.h" +#include "debug.h" #define NO_GRAY_FONT_SUBSTITUTION 
#include "grayfont.h" @@ -141,7 +142,6 @@ - /*********************************************************************** * * Private variables @@ -167,6 +167,11 @@ 0x632c, 0x52aa, 0x4228, 0x3186, 0x2104, 0x1082, 0x0000 }; +Boolean UsingGrayFont() +{ + return currentFontPtr != NULL; +} + /* Set a map for colorizing a bitmap */ @@ -519,6 +524,9 @@ uses8BitChars = ( charEncoding <= charEncodingPalmLatin ); else uses8BitChars = true; +#ifdef UNICODE_MODE + uses8BitChars = false; +#endif err = FtrGet( sysFtrCreator, sysFtrNumWinVersion, &version ); havePalmHiRes = ( HIGH_DENSITY_FEATURE_SET_VERSION <= version ); resource.string[ RESOURCE_NAME_IDLETTER ] = RESOURCE_NAME_ID; @@ -810,7 +818,7 @@ inOffset = 0; while ( inOffset < length ) { WChar ch; - inOffset += TxtGlueGetNextChar( chars, inOffset, &ch ); + inOffset += MyTxtGlueGetNextChar( chars, inOffset, &ch ); if ( length < inOffset ) break; width += GetGlyph( ch )->advance; @@ -910,6 +918,7 @@ WinDrawOperation oldOperation = winPaint; Boolean doKern; + if ( currentFontPtr == NULL ) { if ( invert ) WinDrawInvertedChars( chars, length, x, y ); @@ -945,7 +954,7 @@ bitmapTopLeftX = 0; bitmapTopLeftY = 0; - TxtGlueGetNextChar( chars, 0, &ch ); + MyTxtGlueGetNextChar( chars, 0, &ch ); firstKern = GetGlyph( ch )->leftKerning; switch ( resource.string[ RESOURCE_NAME_ORIENTATION ] ) @@ -1039,7 +1048,7 @@ GrayFontGlyphInfo* glyph; UInt16 resourceIndex; - inOffset += TxtGlueGetNextChar( chars, inOffset, &ch ); + inOffset += MyTxtGlueGetNextChar( chars, inOffset, &ch ); if ( length < inOffset ) break; glyph = GetGlyph( ch ); @@ -1174,7 +1183,7 @@ WinDrawChar( ch, x, y ); return; } - length = TxtGlueSetNextChar( line, 0, ch ); + length = MyTxtGlueSetNextChar( line, 0, ch ); GrayWinDrawChars( line, length, x, y ); } diff -urN plucker-original/viewer/grayfont.h plucker-new/viewer/grayfont.h --- plucker-original/viewer/grayfont.h 2004-02-10 03:10:44.000000000 +0100 +++ plucker-new/viewer/grayfont.h 2004-03-09 20:00:23.000000000 +0100 @@ 
-29,6 +29,7 @@ #include "config.h" #include "viewer.h" #include "hires.h" +#include "unicode.h" #define GRAY_FONT_LEFT 'L' #define GRAY_FONT_RIGHT 'R' @@ -45,6 +46,8 @@ /* Stop them and clear memory */ void GrayFntStop( void ) GRAYFONT_SECTION; +Boolean UsingGrayFont() GRAYFONT_SECTION; + Err GrayFntDefineFont ( FontID font, void* fontP ) GRAYFONT_SECTION; FontID GrayFntGetFont( void ) GRAYFONT_SECTION; diff -urN plucker-original/viewer/Makefile.in plucker-new/viewer/Makefile.in --- plucker-original/viewer/Makefile.in 2004-02-28 22:09:01.000000000 +0100 +++ plucker-new/viewer/Makefile.in 2004-03-08 17:09:20.000000000 +0100 @@ -93,7 +93,7 @@ detailsform.c searchform.c categoryform.c fontform.c \ bookmark.c session.c document.c image.c history.c \ search8.c search.c prefsdata.c anchor.c \ - paragraph.c uncompress.c keyboard.c keyboardform.c \ + paragraph.c unicode.c uncompress.c keyboard.c keyboardform.c \ list.c link.c renamedocform.c hardcopyform.c font.c \ table.c fullscreenform.c @OS_EXTRA_SRC@ diff -urN plucker-original/viewer/os.c plucker-new/viewer/os.c --- plucker-original/viewer/os.c 2004-01-04 13:02:09.000000000 +0100 +++ plucker-new/viewer/os.c 2004-03-08 17:09:20.000000000 +0100 @@ -38,6 +38,7 @@ #include "image.h" #include "axxpacimp.h" #include "skins.h" +#include "unicode.h" #include "os.h" @@ -161,7 +162,7 @@ MemSet( s, MAX_CHARACTER_LENGTH, 0 ); s[ 0 ] = word >> 8; s[ 1 ] = word & 0xFF; - return 1 < TxtGlueGetNextChar( s, 0, NULL ); + return 1 < MyTxtGlueGetNextChar( s, 0, NULL ); } @@ -371,16 +372,22 @@ if ( charEncoding != charEncodingPalmLatin ) return 0; + entries = sizeof(Latin1Mapping)/sizeof(CharMapping); for ( i = 0 ; i < entries; i++ ) { if ( Latin1Mapping [ i ].unicodeValue == 0 ) return 0; - else if ( charValue < Latin1Mapping [ i ].unicodeValue ) +/* else if ( charValue < Latin1Mapping [ i ].unicodeValue ) return 0; +*/ else if ( Latin1Mapping [ i ].unicodeValue == charValue ) return Latin1Mapping[ i ].palmCharValue; } + + if (charValue 
<= 255) + return charValue; + return 0; } @@ -432,6 +439,7 @@ #endif if ( IsDoubleByteSingleChar( testDoubleByteBig5GB2312EUCJPKR ) || IsDoubleByteSingleChar( testDoubleByteShiftJIS ) || + IsDoubleByteSingleChar( testDoubleByteUTF8 ) || IsDoubleByteSingleChar( testDoubleByteJISKuten ) ) { uses8BitChars = false; } diff -urN plucker-original/viewer/paragraph.c plucker-new/viewer/paragraph.c --- plucker-original/viewer/paragraph.c 2004-02-20 17:19:19.000000000 +0100 +++ plucker-new/viewer/paragraph.c 2004-03-10 20:00:13.000000000 +0100 @@ -340,7 +340,7 @@ static Int16 littleSpace; /* Extra pixels in each */ /* A one-character pushback for character tokens */ -static Char pushedChar = 0; +static WChar pushedChar = 0; /* Used to see if the current font is the fixed with font */ static Boolean fixedWidthFont = false; @@ -396,8 +396,7 @@ tapped position */ while ( offset < len ) { WChar ch; - - offset += TxtGlueGetNextChar( chars, offset, &ch ); + offset += MyTxtGlueGetNextChar( chars, offset, &ch ); charWidth = TxtGlueCharWidth( ch ); if ( CharIsSpace( ch ) ) { x += charWidth; @@ -517,7 +516,9 @@ selectedWordBounds[ i ].extent.y ) bottomY = selectedWordBounds[ i ].topLeft.y + selectedWordBounds[ i ].extent.y; - stringSize += TxtGlueSetNextChar( selectedWord, stringSize, ch ); + UseLegacyEncoding(!UsingGrayFont()); + stringSize += MyTxtGlueSetNextChar( selectedWord, stringSize, ch ); + } selectedWord[ stringSize ] = '\0'; if ( bounds != NULL ) { @@ -1118,6 +1119,8 @@ } else { if ( tContext->writeMode == WRITEMODE_COPY_CHAR || ! 
goodTable ) { + UseLegacyEncoding(!UsingGrayFont()); + DrawText( name, length, tContext ); *width = FntCharsWidth( name, length ); } @@ -1555,7 +1558,6 @@ UInt8* functionArgs; UInt32 charValue; UInt8 charsToSkip; - UInt16 palmChar; #ifdef HAVE_IMODE DmOpenRef plkrImodeDB; #endif @@ -1596,8 +1598,7 @@ } #endif - palmChar = FindPalmCharForUnicodeChar( charValue ); - if ( 0 < palmChar && PutNextToken( palmChar ) ) { + if ( PutNextToken( charValue ) ) { pContext->position += charsToSkip; } return UNICODE; @@ -1694,15 +1695,15 @@ Int16 offset; if ( pushedChar != 0 ) { - *nextToken = ( UInt8 )pushedChar; + *nextToken = ( WChar )pushedChar; pushedChar = 0; return TOKEN_CHARACTER; } if ( pContext->last <= pContext->position ) return TOKEN_PARAGRAPH_END; - - pContext->position += TxtGlueGetNextChar( pContext->position, 0, + UseLegacyEncoding(1); + pContext->position += MyTxtGlueGetNextChar( pContext->position, 0, &nextChar ); if ( nextChar != '\0' ) { @@ -1726,11 +1727,11 @@ { if ( pushedChar != 0 ) return false; - +/* if ( 256 <= nextToken ) return false; - - pushedChar = (Char) nextToken; +*/ + pushedChar = (WChar) nextToken; return true; } @@ -2227,8 +2228,9 @@ Char* prevPosition; prevPosition = pContext->position; - nextTokenType = GetNextToken( pContext, &nextChar ); - + + nextTokenType = GetNextToken( pContext, &nextChar ); + if ( nextTokenType == TOKEN_PARAGRAPH_END ) { break; } @@ -2248,6 +2250,7 @@ } continue; } + addMarginToCurrent = false; if ( skipLeadingSpace && CharIsSpace( nextChar ) && ! 
fixedWidthFont ) { @@ -2287,7 +2290,10 @@ tContext->cursorX += FntCharsWidth( chars, len ); len = 0; } - len += TxtGlueSetNextChar( chars, len, nextChar ); + UseLegacyEncoding(!UsingGrayFont()); + len += MyTxtGlueSetNextChar( chars, len, nextChar ); + } if ( pContext->type == ALIGNMENT_JUSTIFY && nextChar == ' ' ) { @@ -2318,6 +2324,8 @@ if ( 0 < len ) { DrawText( chars, len, tContext ); tContext->cursorX += FntCharsWidth( chars, len ); } if ( invertPattern && tContext->writeMode == WRITEMODE_DRAW_CHAR ) @@ -2610,8 +2618,10 @@ yPos += currentHeight / 2; else if ( GetCurrentStyle() == SUPSTYLE ) yPos += currentHeight / 2 - GetPrevFontHeight(); - + UseLegacyEncoding(0); RotDrawChars(chars, len, tContext->cursorX, (Coord)yPos); + } @@ -2623,6 +2633,7 @@ const TextContext* tContext ) { + UseLegacyEncoding(0); RotDrawInvertedChars( chars, len, tContext->cursorX, tContext->cursorY - FntCharHeight() ); } diff -urN plucker-original/viewer/paragraph.h plucker-new/viewer/paragraph.h --- plucker-original/viewer/paragraph.h 2004-02-01 12:26:33.000000000 +0100 +++ plucker-new/viewer/paragraph.h 2004-03-08 17:09:20.000000000 +0100 @@ -26,6 +26,7 @@ #include "viewer.h" #include "document.h" #include "util.h" +#include "unicode.h" /* A paragraph as it appears in the input data stream. 
The height of the diff -urN plucker-original/viewer/rotate.c plucker-new/viewer/rotate.c --- plucker-original/viewer/rotate.c 2004-01-04 01:21:36.000000000 +0100 +++ plucker-new/viewer/rotate.c 2004-03-08 17:09:20.000000000 +0100 @@ -472,7 +472,7 @@ while ( 0 < length ) { Boolean missing; - charWidth = TxtGlueGetNextChar( string, 0, &ch ); + charWidth = MyTxtGlueGetNextChar( string, 0, &ch ); string += charWidth; length -= charWidth; diff -urN plucker-original/viewer/rotate.h plucker-new/viewer/rotate.h --- plucker-original/viewer/rotate.h 2003-08-11 04:31:57.000000000 +0200 +++ plucker-new/viewer/rotate.h 2004-03-08 17:09:20.000000000 +0100 @@ -30,6 +30,7 @@ #include "jogdial.h" #endif #include "grayfont.h" +#include "unicode.h" #ifdef HAVE_ROTATE diff -urN plucker-original/viewer/unicode.c plucker-new/viewer/unicode.c --- plucker-original/viewer/unicode.c 1970-01-01 01:00:00.000000000 +0100 +++ plucker-new/viewer/unicode.c 2004-03-10 19:17:23.000000000 +0100 @@ -0,0 +1,136 @@ +#include "unicode.h" +#include <TxtGlue.h> + +#ifdef UNICODE_MODE +static Boolean usingLegacyEncoding = 1; + +void UseLegacyEncoding(Boolean x) +{ + usingLegacyEncoding = x; +} + +Boolean GetLegacyEncoding() +{ + return usingLegacyEncoding; +} + + +UInt16 Utf8TxtGlueSetNextChar(Char * ioText, UInt32 inOffset, WChar inChar) +{ + + UInt8 count; /* counts how many bytes takes the UTF8 reprezentation */ + UInt8 *pos; + + UInt32 chr; + + if (usingLegacyEncoding) + return TxtGlueSetNextChar( ioText, inOffset, inChar ); + + pos = (UInt8 *)ioText + inOffset; + chr = inChar; + + + if (chr < 0x80) + count = 1; + else if (chr < 0x800) { + count = 2; +} + else if (chr < 0x10000) + count = 3; +#ifdef UCS4 /* reserved for eventual UCS4 support */ + else if (chr < 0x110000) + count = 4; +#endif /* UCS4 */ + else { +/* invalid character */ + ioText[inOffset] = (UInt8) '?'; + return 1; + } + + switch (count) { /* note: code falls through cases! 
*/ +#ifdef UCS4 + case 4: + pos[3] = 0x80 | (chr & 0x3f); + chr = chr >> 6; + chr |= 0x10000; +#endif /* UCS4 */ + case 3: + pos[2] = 0x80 | (chr & 0x3f); + chr = chr >> 6; + chr |= 0x800; + case 2: + pos[1] = 0x80 | (chr & 0x3f); + chr = chr >> 6; + chr |= 0xc0; + case 1: + pos[0] = (UInt8) chr; + } + return count; +} + + +UInt16 Utf8TxtGlueGetNextChar(const Char * inText, + UInt32 inOffset, WChar * outChar) +{ + + UInt8 *s; + UInt8 c; + + if (usingLegacyEncoding) + return TxtGlueGetNextChar( inText, inOffset, outChar ); + + s = (UInt8 *) inText + inOffset; /* source start */ + + c = s[0]; + + + if (c<0x80) { + if (outChar) + *outChar = ((UInt16) c); + return 1; + } + + + if (c >= 0xc2) { + if (c < 0xe0) { + if ((s[1] ^ 0x80) < 0x40) { + if (outChar) + *outChar = ((UInt16) (c & 0x1f) << 6) + | (UInt16) (s[1] ^ 0x80); + return 2; + } + } else if (c < 0xf0) { + if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 + && (c >= 0xe1 || s[1] >= 0xa0)) { + if (outChar) + *outChar = ((UInt16) (c & 0x0f) << 12) + | ((UInt16) (s[1] ^ 0x80) << 6) + | ((UInt16) (s[2] ^ 0x80)); + return 3; + } + } +#ifdef UCS4 + else if (c < 0xf8) { + if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 + && (s[3] ^ 0x80) < 0x40 && (c >= 0xf1 || s[1] >= 0x90) + && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90)) + ) { + if (outChar) + *outChar = ((unsigned int) (c & 0x07) << 18) + | ((unsigned int) (s[1] ^ 0x80) << 12) + | ((unsigned int) (s[2] ^ 0x80) << 6) + | (unsigned int) (s[3] ^ 0x80); + return 4; + } + } +#endif /* UCS4 */ + } + /* invalid multibyte character */ + if (outChar) + *outChar = 0xfffd; + return 1; + + +} + +#endif /* UNICODE_MODE */ diff -urN plucker-original/viewer/unicode.h plucker-new/viewer/unicode.h --- plucker-original/viewer/unicode.h 1970-01-01 01:00:00.000000000 +0100 +++ plucker-new/viewer/unicode.h 2004-03-08 21:29:11.000000000 +0100 @@ -0,0 +1,32 @@ +#ifndef UNICODE_H +#define UNICODE_H + +#include "config.h" +#include "viewer.h" + +//#define UNICODE_MODE +//#undef UNICODE_MODE + 
+#ifndef UNICODE_SECTION +# define UNICODE_SECTION +#endif + + +#ifdef UNICODE_MODE + +UInt16 Utf8TxtGlueSetNextChar( Char* ioText, UInt32 inOffset, WChar inChar ) UNICODE_SECTION; +#define MyTxtGlueSetNextChar Utf8TxtGlueSetNextChar +UInt16 Utf8TxtGlueGetNextChar( const Char* inText, UInt32 inOffset, WChar* outChar ) UNICODE_SECTION; +#define MyTxtGlueGetNextChar Utf8TxtGlueGetNextChar +void UseLegacyEncoding(Boolean x) UNICODE_SECTION; +Boolean GetLegacyEncoding() UNICODE_SECTION; + +#else +#define MyTxtGlueSetNextChar TxtGlueSetNextChar +#define MyTxtGlueGetNextChar TxtGlueGetNextChar +#define UseLegacyEncoding(x) +#define GetLegacyEncoding() + +#endif /* UNICODE_MODE */ + +#endif /* UNICODE_H */