Revision: 18418
http://sourceforge.net/p/gate/code/18418
Author: ian_roberts
Date: 2014-10-30 18:38:33 +0000 (Thu, 30 Oct 2014)
Log Message:
-----------
Made the JSON DocumentFormat use the new TweetStreamIterator approach rather
than assuming the original JSON is one tweet per line.
Modified Paths:
--------------
gate/trunk/plugins/Twitter/src/gate/corpora/JSONTweetFormat.java
gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterator.java
Modified: gate/trunk/plugins/Twitter/src/gate/corpora/JSONTweetFormat.java
===================================================================
--- gate/trunk/plugins/Twitter/src/gate/corpora/JSONTweetFormat.java 2014-10-30 18:37:26 UTC (rev 18417)
+++ gate/trunk/plugins/Twitter/src/gate/corpora/JSONTweetFormat.java 2014-10-30 18:38:33 UTC (rev 18418)
@@ -17,6 +17,7 @@
import gate.Resource;
import gate.corpora.twitter.PreAnnotation;
import gate.corpora.twitter.Tweet;
+import gate.corpora.twitter.TweetStreamIterator;
import gate.corpora.twitter.TweetUtils;
import gate.creole.ResourceInstantiationException;
import gate.creole.metadata.AutoInstance;
@@ -25,6 +26,8 @@
import gate.util.InvalidOffsetException;
import java.io.IOException;
import java.util.HashMap;
+import java.util.Iterator;
+import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang.StringUtils;
@@ -87,12 +90,13 @@
String jsonString = StringUtils.trimToEmpty(doc.getContent().toString());
try {
// Parse the String
- List<Tweet> tweets = TweetUtils.readTweets(jsonString);
- Map<Tweet, Long> tweetStarts = new HashMap<Tweet, Long>();
+ Iterator<Tweet> tweetSource = new TweetStreamIterator(jsonString, null, null);
+ Map<Tweet, Long> tweetStarts = new LinkedHashMap<Tweet, Long>();
// Put them all together to make the unpacked document content
StringBuilder concatenation = new StringBuilder();
- for (Tweet tweet : tweets) {
+ while(tweetSource.hasNext()) {
+ Tweet tweet = tweetSource.next();
tweetStarts.put(tweet, (long) concatenation.length());
concatenation.append(tweet.getString()).append("\n\n");
}
@@ -103,7 +107,7 @@
AnnotationSet originalMarkups =
doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
// Create Original markups annotations for each tweet
- for (Tweet tweet : tweets) {
+ for (Tweet tweet : tweetStarts.keySet()) {
for (PreAnnotation preAnn : tweet.getAnnotations()) {
preAnn.toAnnotation(originalMarkups, tweetStarts.get(tweet));
}
Modified: gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterator.java
===================================================================
--- gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterator.java 2014-10-30 18:37:26 UTC (rev 18417)
+++ gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterator.java 2014-10-30 18:38:33 UTC (rev 18418)
@@ -41,9 +41,17 @@
private List<String> contentKeys, featureKeys;
private boolean nested;
private Iterator<JsonNode> nestedStatuses;
- private JsonNode nextNode;
-
-
+ private JsonNode nextNode;
+
+ public TweetStreamIterator(String json, List<String> contentKeys,
+ List<String> featureKeys) throws JsonParseException, IOException {
+ this.contentKeys = contentKeys;
+ this.featureKeys = featureKeys;
+ objectMapper = new ObjectMapper();
+ jsonParser = objectMapper.getFactory().createParser(json);
+ init();
+ }
+
public TweetStreamIterator(InputStream input, List<String> contentKeys,
List<String> featureKeys, boolean gzip) throws JsonParseException,
IOException {
this.contentKeys = contentKeys;
@@ -61,6 +69,10 @@
}
jsonParser =
objectMapper.getFactory().createParser(workingInput).enable(Feature.AUTO_CLOSE_SOURCE);
+ init();
+ }
+
+ private void init() throws JsonParseException, IOException {
// If the first token in the stream is the start of an array ("[")
// then assume the stream as a whole is an array of objects
// To handle this, simply clear the token - The MappingIterator
@@ -72,7 +84,6 @@
this.nested = false;
this.nestedStatuses = null;
}
-
@Override
public boolean hasNext() {
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs