Revision: 18945
http://sourceforge.net/p/gate/code/18945
Author: ian_roberts
Date: 2015-10-11 20:40:05 +0000 (Sun, 11 Oct 2015)
Log Message:
-----------
Twitter JSON "entities" count their offsets in terms of Unicode characters, but
GATE annotations count their offsets in terms of Java char values (UTF-16 code
units). Re-implemented the offset adjustment logic to account for this, to fix
odd off-by-one errors decoding entities in tweets with supplementary characters
like emoji.
Modified Paths:
--------------
gate/trunk/plugins/Twitter/src/gate/corpora/export/GATEJsonExporter.java
gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Tweet.java
gate/trunk/src/main/gate/corpora/DocumentJsonUtils.java
Modified:
gate/trunk/plugins/Twitter/src/gate/corpora/export/GATEJsonExporter.java
===================================================================
--- gate/trunk/plugins/Twitter/src/gate/corpora/export/GATEJsonExporter.java
2015-10-11 01:19:50 UTC (rev 18944)
+++ gate/trunk/plugins/Twitter/src/gate/corpora/export/GATEJsonExporter.java
2015-10-11 20:40:05 UTC (rev 18945)
@@ -33,6 +33,7 @@
import java.io.IOException;
import java.io.OutputStream;
+import java.io.OutputStreamWriter;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
@@ -158,7 +159,7 @@
*/
protected JsonGenerator openGenerator(OutputStream out, FeatureMap options)
throws IOException {
- JsonGenerator generator = MAPPER.getFactory().createGenerator(out);
+ JsonGenerator generator = MAPPER.getFactory().createGenerator(new
OutputStreamWriter(out, "UTF-8"));
generator.disable(JsonGenerator.Feature.AUTO_CLOSE_TARGET);
generator.enable(JsonGenerator.Feature.AUTO_CLOSE_JSON_CONTENT);
if(options.containsKey("exportAsArray") &&
((Boolean)options.get("exportAsArray")).booleanValue()) {
Modified: gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Tweet.java
===================================================================
--- gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Tweet.java
2015-10-11 01:19:50 UTC (rev 18944)
+++ gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Tweet.java
2015-10-11 20:40:05 UTC (rev 18945)
@@ -147,42 +147,67 @@
this.string = content.toString();
}
- private static Pattern XML_ENTITY_PATTERN = Pattern.compile("&(amp|lt|gt);");
+ /**
+ * Characters to account for in unescaping - HTML-encoded ampersand and angle
+ * brackets, and supplementary characters (which don't need "unescaping" but do
+ * need to be accounted for in the repos info).
+ */
+ private static Pattern UNESCAPE_PATTERN =
Pattern.compile("&(?:amp|lt|gt);|[\\x{" +
+ Integer.toHexString(Character.MIN_SUPPLEMENTARY_CODE_POINT)+ "}-\\x{" +
+ Integer.toHexString(Character.MAX_CODE_POINT) + "}]");
/**
* Un-escape &amp;, &gt; and &lt; in the given string, populating
- * the supplied {@link RepositioningInfo} to describe the offset changes.
+ * the supplied {@link RepositioningInfo} to describe the offset changes. Also
+ * record the position of any Unicode supplementary characters, as Twitter's
+ * entities format counts in characters (so a supplementary is 1) whereas GATE
+ * annotations count in Java <code>char</code> values (UTF-16 code units, so
+ * a supplementary counts as two).
* @param str string, possibly including escaped ampersands or angle brackets
* @param repos {@link RepositioningInfo} to hold offset changes
* @return the unescaped string
*/
private String unescape(String str, RepositioningInfo repos) {
StringBuffer buf = new StringBuffer();
- int correction = 0;
- int lastMatchEnd = 0;
- Matcher mat = XML_ENTITY_PATTERN.matcher(str);
+ int origOffset = 0;
+ int extractedOffset = 0;
+ Matcher mat = UNESCAPE_PATTERN.matcher(str);
while(mat.find()) {
- if(mat.start() != lastMatchEnd) {
+ if(mat.start() != origOffset) {
// repositioning record for the span from end of previous match to
start of this one
- int nonMatchLen = mat.start() - lastMatchEnd;
- repos.addPositionInfo(lastMatchEnd, nonMatchLen, lastMatchEnd -
correction, nonMatchLen);
- }
- // repositioning record covering this match
- repos.addPositionInfo(mat.start(), mat.end() - mat.start(), mat.start()
- correction, 1);
- correction += mat.end() - mat.start() - 1;
+ int nonMatchLen = mat.start() - origOffset;
+ repos.addPositionInfo(origOffset, nonMatchLen, extractedOffset,
nonMatchLen);
+ origOffset += nonMatchLen;
+ extractedOffset += nonMatchLen;
+ }
+
+ // in most cases the original length is the number of code units the
pattern matched
+ int origLen = mat.end() - mat.start();
+ // and the extracted result is one code unit
+ int extractedLen = 1;
String replace = "?";
- switch(mat.group(1)) {
- case "amp": replace = "&"; break;
- case "gt": replace = ">"; break;
- case "lt": replace = "<"; break;
+ switch(mat.group()) {
+ case "&amp;": replace = "&"; break;
+ case "&gt;": replace = ">"; break;
+ case "&lt;": replace = "<"; break;
+ default:
+ // but in the case of supplementary characters, the original length
+ // (in *characters*) is 1 but the extracted length (in code units)
is 2
+ replace = mat.group();
+ origLen = 1;
+ extractedLen = 2;
}
mat.appendReplacement(buf, replace);
- lastMatchEnd = mat.end();
+ // repositioning record covering this match
+ repos.addPositionInfo(origOffset, origLen, extractedOffset,
extractedLen);
+
+ origOffset += origLen;
+ extractedOffset += extractedLen;
}
- int tailLen = str.length() - lastMatchEnd;
+ int tailLen = str.length() - origOffset;
if(tailLen > 0) {
// repositioning record covering everything after the last match
- repos.addPositionInfo(lastMatchEnd, tailLen, lastMatchEnd - correction,
tailLen);
+ repos.addPositionInfo(origOffset, tailLen + 1, extractedOffset, tailLen
+ 1);
}
mat.appendTail(buf);
return buf.toString();
Modified: gate/trunk/src/main/gate/corpora/DocumentJsonUtils.java
===================================================================
--- gate/trunk/src/main/gate/corpora/DocumentJsonUtils.java 2015-10-11
01:19:50 UTC (rev 18944)
+++ gate/trunk/src/main/gate/corpora/DocumentJsonUtils.java 2015-10-11
20:40:05 UTC (rev 18945)
@@ -383,40 +383,69 @@
json.flush();
}
- private static final Pattern CHARS_TO_ESCAPE = Pattern.compile("[<>&]");
+ /**
+ * Characters to account for when escaping - ampersand, angle brackets, and
supplementaries
+ */
+ private static final Pattern CHARS_TO_ESCAPE = Pattern.compile("[<>&\\x{" +
+ Integer.toHexString(Character.MIN_SUPPLEMENTARY_CODE_POINT)+
"}-\\x{" +
+ Integer.toHexString(Character.MAX_CODE_POINT) + "}]");
/**
* Escape all angle brackets and ampersands in the given string,
* recording the adjustments to character offsets within the
- * given {@link RepositioningInfo}.
+ * given {@link RepositioningInfo}. Also record supplementary
+ * characters (above U+FFFF), which count as two in terms of
+ * GATE annotation offsets (which count in Java chars) but one
+ * in terms of JSON (counting in Unicode characters).
*/
private static String escape(String str, RepositioningInfo repos) {
StringBuffer buf = new StringBuffer();
- int correction = 0;
- int lastMatchEnd = 0;
+ int origOffset = 0;
+ int extractedOffset = 0;
Matcher mat = CHARS_TO_ESCAPE.matcher(str);
while(mat.find()) {
- if(mat.start() != lastMatchEnd) {
+ if(mat.start() != extractedOffset) {
// repositioning record for the span from end of previous match to
start of this one
- int nonMatchLen = mat.start() - lastMatchEnd;
- repos.addPositionInfo(lastMatchEnd + correction, nonMatchLen,
lastMatchEnd, nonMatchLen);
+ int nonMatchLen = mat.start() - extractedOffset;
+ repos.addPositionInfo(origOffset, nonMatchLen, extractedOffset,
nonMatchLen);
+ origOffset += nonMatchLen;
+ extractedOffset += nonMatchLen;
}
+
+ // the extracted length is the number of code units matched by the
pattern
+ int extractedLen = mat.end() - mat.start();
+ int origLen = 0;
String replace = "?";
switch(mat.group()) {
- case "&": replace = "&amp;"; break;
- case ">": replace = "&gt;"; break;
- case "<": replace = "&lt;"; break;
+ case "&":
+ replace = "&amp;";
+ origLen = 5;
+ break;
+ case ">":
+ replace = "&gt;";
+ origLen = 4;
+ break;
+ case "<":
+ replace = "&lt;";
+ origLen = 4;
+ break;
+ default:
+ // supplementary character, so no escaping but need to account for
+ // it in repositioning info
+ replace = mat.group();
+ origLen = 1;
}
// repositioning record covering this match
- repos.addPositionInfo(mat.start() + correction, replace.length(),
mat.start(), 1);
- correction += replace.length() - 1;
+ repos.addPositionInfo(origOffset, origLen, extractedOffset,
extractedLen);
mat.appendReplacement(buf, replace);
- lastMatchEnd = mat.end();
+ origOffset += origLen;
+ extractedOffset += extractedLen;
+
}
- int tailLen = str.length() - lastMatchEnd;
+ int tailLen = str.length() - extractedOffset;
if(tailLen > 0) {
// repositioning record covering everything after the last match
- repos.addPositionInfo(lastMatchEnd + correction, tailLen + 1,
lastMatchEnd, tailLen + 1);
+ repos.addPositionInfo(origOffset, tailLen + 1, extractedOffset, tailLen
+ 1);
}
mat.appendTail(buf);
return buf.toString();
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs