Package: tunnelx Version: 20160713-3 Severity: normal Tags: patch Characters outside US-ASCII get corrupted when saving and loading tunnel sketches. The reason is that each char is simply cast to byte when saving, and sign-extended to char when loading. I've made a patch that writes characters outside US-ASCII (the encoding declared in the XML declaration) as XML numeric character references, and reads such references back when loading. I tested it with my current survey, which contains Gaelic names, and hand-edited the XML to ensure that reading hexadecimal references works (we always write decimal). The patch is probably suitable for forwarding upstream.
Perhaps an alternative approach might be considered - open the output file with UTF-8 encoding, and change the XML declaration to reflect that.
Index: tunnelx-20160713/src/TNXML.java =================================================================== --- tunnelx-20160713.orig/src/TNXML.java +++ tunnelx-20160713/src/TNXML.java @@ -726,7 +726,7 @@ class TNXML ///////////////////////////////////////////// static char[] chconvCH = { (char)176, (char)246, (char)252, '<', '>', '"', '&', '\\', '\'', '\n', '\t', ' ' }; static char[] chconv = chconvCH; // allow for hacks (which vary chconvleng) - static String[] chconvName = {"&deg;", "&ouml;", "&uuml;", "&lt;", "&gt;", "&quot;", "&amp;", "&backslash;", "&apostrophe;", "&newline;", "&tab;", "&space;" }; + static String[] chconvName = {"deg", "ouml", "uuml", "lt", "gt", "quot", "amp", "backslash", "apostrophe", "newline", "tab", "space" }; static int chconvleng = chconvCH.length; // used for hacking out the space ones (this hack needs to be killed, or replaced with a flag) static int chconvlengWSP = chconvCH.length - 4; // used for hacking out the space ones (this hack needs to be killed, or replaced with a flag) ///////////////////////////////////////////// @@ -739,16 +739,23 @@ class TNXML int j; // there might be a regexp that would do this substitution directly, or use indexOf in a concatenated string of chconvCH - for (j = 0; j < chconvleng; j++) + for (j = 3; j < chconvleng; j++) // start at '<' to allow deg, ouml, and uuml to use the general substitution below { if ((ch == chconvCH[j]) && (bAlsoSpace || (ch != ' '))) { - sb.append(chconvName[j]); + sb.append('&').append(chconvName[j]).append(';'); break; } } - if (j == chconvleng) - sb.append(ch); + if (j == chconvleng) { + // not found in table + if (' ' <= ch && ch <= 127) + // printable ASCII + sb.append(ch); + else + // general Unicode character + sb.append("&#").append((int)ch).append(";"); + } } } @@ -771,31 +778,34 @@ class TNXML char ch = s.charAt(i); if (ch == '&') { - int j; - for (j = 0; j < chconvleng; j++) - { - if (s.regionMatches(i, chconvName[j], 0, chconvName[j].length())) - { - sb.append(chconvCH[j]); - i += 
chconvName[j].length() - 1; - //if (j < 2) - // System.out.println(chconv[j] + " -- " + (int)chconv[j].toCharArray()[0]); - break; - } - } - if (j == chconvleng) - { - if (s.regionMatches(i, "&space;", 0, 7)) // back-compatible - { - sb.append(" "); - i += 6; - } + int refc = s.indexOf(';', i); + if (refc < 0) + TN.emitError("Missing reference close at " + s.substring(i, Math.max(i+15, s.length()))); + + if (s.charAt(++i) == '#') { + // A malformed numeric character reference will result in NumberFormatException + if (s.charAt(++i) == 'x') + // hexadecimal + sb.append((char)Integer.parseInt(s.substring(++i, refc), 16)); else + // decimal + sb.append((char)Integer.parseInt(s.substring(i, refc), 10)); + } else { + String name = s.substring(i, refc); + int j; + for (j = 0; j < chconvleng; j++) { - System.out.println(s.substring(i)); - TN.emitError("unable to resolve & from pos " + i + " in string:" + s); + if (name.equals(chconvName[j])) + { + sb.append(chconvCH[j]); + break; + } } + if (j == chconvleng) + TN.emitError("unable to resolve entity " + name); } + // advance to the reference-close character (loop increment will skip it) + i = refc; } else sb.append(ch);
-- System Information: Debian Release: 9.0 APT prefers testing APT policy: (900, 'testing'), (900, 'stable'), (400, 'unstable') Architecture: amd64 (x86_64) Foreign Architectures: i386, armel Kernel: Linux 3.16.7-ckt2-balti (SMP w/8 CPU cores; PREEMPT) Locale: LANG=en_GB.UTF-8, LC_CTYPE=en_GB.UTF-8 (charmap=UTF-8) Shell: /bin/sh linked to /bin/dash Init: sysvinit (via /sbin/init) Versions of packages tunnelx depends on: ii default-jre [java8-runtime] 2:1.8-58 ii gcj-4.8-jre [java5-runtime] 4.8.5-4 ii jarwrapper 0.59 ii openjdk-8-jre [java8-runtime] 8u111-b14-3 tunnelx recommends no packages. tunnelx suggests no packages. -- no debconf information