Revision: 18483
http://sourceforge.net/p/gate/code/18483
Author: ian_roberts
Date: 2014-12-04 18:28:52 +0000 (Thu, 04 Dec 2014)
Log Message:
-----------
Added a date format parser static to TweetUtils that can parse the "created_at"
timestamp format of Tweet JSON, and a couple of example JAPE grammars that use
the parser to parse timestamps into a more useful form.
Given the performance and thread-(un)safety of SimpleDateFormat I've done this
using Joda Time.
Modified Paths:
--------------
gate/trunk/build/deploy/maven/gate-core.pom.template
gate/trunk/ivy.xml
gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetUtils.java
Added Paths:
-----------
gate/trunk/plugins/Twitter/resources/timestamps/
gate/trunk/plugins/Twitter/resources/timestamps/add_timestamp_date.jape
gate/trunk/plugins/Twitter/resources/timestamps/add_timestamp_hour.jape
Modified: gate/trunk/build/deploy/maven/gate-core.pom.template
===================================================================
--- gate/trunk/build/deploy/maven/gate-core.pom.template 2014-12-03 02:24:19 UTC (rev 18482)
+++ gate/trunk/build/deploy/maven/gate-core.pom.template 2014-12-04 18:28:52 UTC (rev 18483)
@@ -305,6 +305,13 @@
<scope>compile</scope>
</dependency>
+ <!-- Used for date parsing and formatting -->
+ <dependency>
+ <groupId>joda-time</groupId>
+ <artifactId>joda-time</artifactId>
+ <version>2.6</version>
+ </dependency>
+
<!-- Used for testing GATE -->
<dependency>
<groupId>junit</groupId>
Modified: gate/trunk/ivy.xml
===================================================================
--- gate/trunk/ivy.xml 2014-12-03 02:24:19 UTC (rev 18482)
+++ gate/trunk/ivy.xml 2014-12-04 18:28:52 UTC (rev 18483)
@@ -115,6 +115,9 @@
<!-- Used for comparing XML files during some of the GATE tests -->
<dependency org="xmlunit" name="xmlunit" rev="1.5" conf="required->master" />
+ <!-- Used for date and time formatting and parsing -->
+ <dependency org="joda-time" name="joda-time" rev="2.6" />
+
<!-- Used for testing GATE -->
<dependency org="junit" name="junit" rev="4.11"
conf="compile->master,compile" />
Added: gate/trunk/plugins/Twitter/resources/timestamps/add_timestamp_date.jape
===================================================================
--- gate/trunk/plugins/Twitter/resources/timestamps/add_timestamp_date.jape (rev 0)
+++ gate/trunk/plugins/Twitter/resources/timestamps/add_timestamp_date.jape 2014-12-04 18:28:52 UTC (rev 18483)
@@ -0,0 +1,35 @@
+/*
+ * Parse the "created_at" timestamp string from a Tweet annotation and store
+ * the equivalent Java timestamp as a Long.
+ */
+Imports: {
+ import gate.corpora.twitter.TweetUtils;
+ import org.joda.time.DateTime;
+}
+
+Phase: ParseTimestamp
+Input: Tweet
+Options: control = all
+
+Rule: TweetTimestamp
+(
+ {Tweet}
+):twt
+-->
+:twt {
+ for(Annotation twt : twtAnnots) {
+ String createdAt = (String)twt.getFeatures().get("created_at");
+ if(createdAt != null) {
+ try {
+ DateTime dt = TweetUtils.CREATED_AT_FORMAT.parseDateTime(createdAt);
+
+ // store timestamp as a Long on the annotation
+ twt.getFeatures().put("timestamp", dt.getMillis());
+
+ } catch(IllegalArgumentException e) {
+ // could not parse as a string
+ throw new NonFatalJapeException("Could not parse " + createdAt + " as a date", e);
+ }
+ }
+ }
+}
Added: gate/trunk/plugins/Twitter/resources/timestamps/add_timestamp_hour.jape
===================================================================
--- gate/trunk/plugins/Twitter/resources/timestamps/add_timestamp_hour.jape (rev 0)
+++ gate/trunk/plugins/Twitter/resources/timestamps/add_timestamp_hour.jape 2014-12-04 18:28:52 UTC (rev 18483)
@@ -0,0 +1,42 @@
+/*
+ * Parse the "created_at" timestamp string from a Tweet annotation and store
+ * the timestamp to the nearest hour as a document feature. The format used is
+ * YYYYMMDDHH as a single number, to support range-based searches.
+ *
+ * Note this only makes sense when there is just one Tweet per document.
+ */
+Imports: {
+ import gate.corpora.twitter.TweetUtils;
+ import org.joda.time.DateTime;
+}
+
+Phase: ParseTimestamp
+Input: Tweet
+Options: control = once
+
+Rule: TweetTimestamp
+(
+ {Tweet}
+):twt
+-->
+:twt {
+ for(Annotation twt : twtAnnots) {
+ String createdAt = (String)twt.getFeatures().get("created_at");
+ if(createdAt != null) {
+ try {
+ DateTime dt = TweetUtils.CREATED_AT_FORMAT.parseDateTime(createdAt);
+
+ // store timestamp as a document feature YYYYMMDDHH
+ doc.getFeatures().put("hour_timestamp", Long.valueOf(
+ (long)dt.getHourOfDay()
+ + 100L * (long)dt.getDayOfMonth()
+ + 100L * 100L * (long)dt.getMonthOfYear()
+ + 100L * 100L * 100L * (long)dt.getYear()));
+
+ } catch(IllegalArgumentException e) {
+ // could not parse as a string
+ throw new NonFatalJapeException("Could not parse " + createdAt + " as a date", e);
+ }
+ }
+ }
+}
Modified: gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetUtils.java
===================================================================
--- gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetUtils.java 2014-12-03 02:24:19 UTC (rev 18482)
+++ gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetUtils.java 2014-12-04 18:28:52 UTC (rev 18483)
@@ -18,6 +18,9 @@
import java.util.Iterator;
import java.util.List;
import org.apache.commons.lang.StringUtils;
+import org.joda.time.format.DateTimeFormat;
+import org.joda.time.format.DateTimeFormatter;
+
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ArrayNode;
@@ -47,6 +50,14 @@
* The JSON property representing entities (e.g. hashtags).
*/
public static final String ENTITIES_ATTRIBUTE = "entities";
+
+ /**
+ * Date parser that understands the "created_at" timestamp format.
+ * The parser can cope with dates in any timezone but the returned
+ * DateTime objects will always be anchored in UTC.
+ */
+ public static final DateTimeFormatter CREATED_AT_FORMAT = DateTimeFormat.forPattern(
+ "EEE MMM dd HH:mm:ss ZZZZZ yyyy").withZoneUTC();
public static List<Tweet> readTweets(String string) throws IOException {
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
Download BIRT iHub F-Type - The Free Enterprise-Grade BIRT Server
from Actuate! Instantly Supercharge Your Business Reports and Dashboards
with Interactivity, Sharing, Native Excel Exports, App Integration & more
Get technology previously reserved for billion-dollar corporations, FREE
http://pubads.g.doubleclick.net/gampad/clk?id=164703151&iu=/4140/ostg.clktrk
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs