Revision: 18341
http://sourceforge.net/p/gate/code/18341
Author: adamfunk
Date: 2014-09-16 12:19:23 +0000 (Tue, 16 Sep 2014)
Log Message:
-----------
Changed the iteration through the twitter data to handle edge cases of
odd data formatting.
Modified Paths:
--------------
gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Population.java
gate/trunk/plugins/Twitter/src/gate/corpora/twitter/PopulationConfig.java
gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterator.java
Removed Paths:
-------------
gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterable.java
Modified: gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Population.java
===================================================================
--- gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Population.java
2014-09-16 01:19:57 UTC (rev 18340)
+++ gate/trunk/plugins/Twitter/src/gate/corpora/twitter/Population.java
2014-09-16 12:19:23 UTC (rev 18341)
@@ -75,39 +75,46 @@
input = inputUrl.openStream();
// TODO Detect & handle gzipped input.
- TweetStreamIterable tweetSource = new TweetStreamIterable(input,
contentKeys, featureKeys, false);
+ TweetStreamIterator tweetSource = new TweetStreamIterator(input,
contentKeys, featureKeys, false);
int tweetCounter = 0;
+ int tweetDocCounter = 0;
Document document = newDocument(inputUrl, tweetCounter, COUNTER_DIGITS);
StringBuilder content = new StringBuilder();
Map<PreAnnotation, Integer> annotandaOffsets = new
HashMap<PreAnnotation, Integer>();
- // TODO Suppress empty documents (generated by 0-tweet files).
+ /* TweetStreamIterator.hasNext() returns true if there might be more
+ * tweets in the file; a concatenated set of search results might
+ * have an object with an empty statuses array followed by one
+ * with some tweet in the array; in that case, we ignore the first null
+ * and keep looking. */
- for (Tweet tweet : tweetSource) {
- if ( (tweetsPerDoc > 0) && (tweetCounter > 0) && ((tweetCounter %
tweetsPerDoc) == 0) ) {
- closeDocument(document, content, annotandaOffsets, corpus);
- document = newDocument(inputUrl, tweetCounter, COUNTER_DIGITS);
- content = new StringBuilder();
- annotandaOffsets = new HashMap<PreAnnotation, Integer>();
+ while (tweetSource.hasNext()) {
+ Tweet tweet = tweetSource.next();
+ // next() == null means there wasn't anything ready in the stream,
+ // but there might be next time.
+ if (tweet != null) {
+ tweetDocCounter++;
+ if ( (tweetsPerDoc > 0) && (tweetDocCounter >= tweetsPerDoc) ) {
+ closeDocument(document, content, annotandaOffsets, corpus);
+ tweetDocCounter = 0;
+ document = newDocument(inputUrl, tweetCounter, COUNTER_DIGITS);
+ content = new StringBuilder();
+ annotandaOffsets = new HashMap<PreAnnotation, Integer>();
+ }
+
+ int startOffset = content.length();
+ content.append(tweet.getString());
+ for (PreAnnotation preAnn : tweet.getAnnotations()) {
+ annotandaOffsets.put(preAnn, startOffset);
+ }
+
+ content.append('\n');
+ tweetCounter++;
}
-
- int startOffset = content.length();
- content.append(tweet.getString());
- for (PreAnnotation preAnn : tweet.getAnnotations()) {
- annotandaOffsets.put(preAnn, startOffset);
- }
-
- content.append('\n');
- tweetCounter++;
} // end of Tweet loop
- if (content.length() > 0) {
- closeDocument(document, content, annotandaOffsets, corpus);
- }
- else {
- Factory.deleteResource(document);
- }
+ closeDocument(document, content, annotandaOffsets, corpus);
if(corpus.getDataStore() != null) {
corpus.getDataStore().sync(corpus);
@@ -145,18 +152,23 @@
private static void closeDocument(Document document, StringBuilder content,
Map<PreAnnotation, Integer> annotandaOffsets, Corpus corpus) throws
InvalidOffsetException {
- DocumentContent contentImpl = new DocumentContentImpl(content.toString());
- document.setContent(contentImpl);
- AnnotationSet originalMarkups =
document.getAnnotations(Gate.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
- for (PreAnnotation preAnn : annotandaOffsets.keySet()) {
- preAnn.toAnnotation(originalMarkups, annotandaOffsets.get(preAnn));
- }
- corpus.add(document);
-
- if (corpus.getLRPersistenceId() != null) {
- corpus.unloadDocument(document);
+ if (content.length() == 0) {
Factory.deleteResource(document);
}
+ else {
+ DocumentContent contentImpl = new
DocumentContentImpl(content.toString());
+ document.setContent(contentImpl);
+ AnnotationSet originalMarkups =
document.getAnnotations(Gate.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
+ for (PreAnnotation preAnn : annotandaOffsets.keySet()) {
+ preAnn.toAnnotation(originalMarkups, annotandaOffsets.get(preAnn));
+ }
+ corpus.add(document);
+
+ if (corpus.getLRPersistenceId() != null) {
+ corpus.unloadDocument(document);
+ Factory.deleteResource(document);
+ }
+ }
}
Modified:
gate/trunk/plugins/Twitter/src/gate/corpora/twitter/PopulationConfig.java
===================================================================
--- gate/trunk/plugins/Twitter/src/gate/corpora/twitter/PopulationConfig.java
2014-09-16 01:19:57 UTC (rev 18340)
+++ gate/trunk/plugins/Twitter/src/gate/corpora/twitter/PopulationConfig.java
2014-09-16 12:19:23 UTC (rev 18341)
@@ -13,7 +13,6 @@
import gate.Gate;
-import gate.gui.MainFrame;
import gate.swing.XJFileChooser;
import java.awt.event.ActionEvent;
import java.awt.event.ActionListener;
Deleted:
gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterable.java
===================================================================
---
gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterable.java
2014-09-16 01:19:57 UTC (rev 18340)
+++
gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterable.java
2014-09-16 12:19:23 UTC (rev 18341)
@@ -1,72 +0,0 @@
-/*
- * Copyright (c) 1995-2014, The University of Sheffield. See the file
- * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
- *
- * This file is part of GATE (see http://gate.ac.uk/), and is free
- * software, licenced under the GNU Library General Public License,
- * Version 2, June 1991 (in the distribution as file licence.html,
- * and also available at http://gate.ac.uk/gate/licence.html).
- *
- * $Id$
- */
-package gate.corpora.twitter;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Collections;
-import java.util.Iterator;
-import java.util.List;
-import org.apache.log4j.Logger;
-
-
-/**
- * Iterable version, just to make loops easier.
- * @author adam
- *
- */
-public class TweetStreamIterable implements Iterable<Tweet> {
-
- private InputStream input;
- private List<String> contentKeys, featureKeys;
- private boolean gzip;
- private TweetStreamIterator iterator;
-
- private static final Logger logger =
Logger.getLogger(TweetStreamIterable.class.getName());
-
- public TweetStreamIterable(InputStream input, List<String> contentKeys,
- List<String> featureKeys, boolean gzip) {
-
- this.input = input;
- this.contentKeys = contentKeys;
- this.featureKeys = featureKeys;
- this.gzip = gzip;
- this.iterator = null;
- }
-
-
- @Override
- public Iterator<Tweet> iterator() {
- try {
- this.iterator = new TweetStreamIterator(input, contentKeys, featureKeys,
gzip);
- return this.iterator;
- }
- catch(IOException e) {
- logger.warn("Internal error in TweetStreamIterator", e);
- // The Override won't let us throw an exception up.
- return Collections.<Tweet>emptyList().iterator();
- }
- }
-
-
- public void close() {
- if (this.iterator != null) {
- try {
- this.iterator.close();
- }
- catch(IOException e) {
- logger.warn("Internal error in TweetStreamIterator", e);
- }
- }
- }
-
-}
Modified:
gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterator.java
===================================================================
---
gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterator.java
2014-09-16 01:19:57 UTC (rev 18340)
+++
gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetStreamIterator.java
2014-09-16 12:19:23 UTC (rev 18341)
@@ -39,7 +39,7 @@
private JsonParser jsonParser;
private MappingIterator<JsonNode> iterator;
private List<String> contentKeys, featureKeys;
- private boolean nested, hasNextNode;
+ private boolean nested;
private Iterator<JsonNode> nestedStatuses;
private JsonNode nextNode;
@@ -71,20 +71,17 @@
iterator = objectMapper.readValues(jsonParser, JsonNode.class);
this.nested = false;
this.nestedStatuses = null;
- this.hasNextNode = this.iterator.hasNext();
- if (this.hasNextNode) {
- this.nextNode = this.iterator.next();
- }
}
@Override
public boolean hasNext() {
- /* Using this.iterator.hasNext() did not work for search result format,
because
- * it returns true if there is a JSON node with an empty statuses array.
So we
- * have to read ahead a bit in order to let the loop in Population *not*
run in
- * that case (so we can suppress the empty document). */
- return (this.hasNextNode && nonEmpty(this.nextNode)) ||
+ /* Suppressing empty documents in Population.populateCorpus is tricky.
+ * So hasNext() returns true if there *could* be more tweets in the
+ * file, and next() returns null if there are none in the current
+ * main JsonNode; populateCorpus has to test for null.
+ */
+ return this.iterator.hasNext() ||
(this.nested && (this.nestedStatuses != null) &&
this.nestedStatuses.hasNext());
// Belt & braces: this.nested should suffice.
}
@@ -102,30 +99,21 @@
this.nested = this.nestedStatuses.hasNext();
}
- else if (this.hasNext()) {
+ else if (this.iterator.hasNext()) {
+ this.nextNode = this.iterator.next();
+
if (isSearchResultList(this.nextNode)) {
this.nestedStatuses = getStatuses(this.nextNode).iterator();
this.nested = this.nestedStatuses.hasNext();
// Set the nested flag according as there is anything left
- // in thee statuses value array (which could be empty).
+ // in the statuses value array (which could be empty).
}
-
- // Now let's test nested: true IFF we are in a search result thingy AND
- // the thingy's statuses array is non-empty.
- if (this.nested) {
- result = Tweet.readTweet(this.nestedStatuses.next(), contentKeys,
featureKeys);
- // Set the nested flag again for the next call to next()
- this.nested = this.nestedStatuses.hasNext();
- }
else {
- result = Tweet.readTweet(this.nextNode, contentKeys, featureKeys);
+ this.nested = false;
+ this.nestedStatuses = null;
+ result = Tweet.readTweet(nextNode, contentKeys, featureKeys);
}
}
-
- if (! this.nested) {
- hasNextNode = this.iterator.hasNext();
- nextNode = hasNextNode ? this.iterator.next() : null;
- }
}
catch (IOException e) {
logger.warn("Internal error in TweetStreamIterator", e);
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
Want excitement?
Manually upgrade your production database.
When you want reliability, choose Perforce.
Perforce version control. Predictably reliable.
http://pubads.g.doubleclick.net/gampad/clk?id=157508191&iu=/4140/ostg.clktrk
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs