Revision: 18483
          http://sourceforge.net/p/gate/code/18483
Author:   ian_roberts
Date:     2014-12-04 18:28:52 +0000 (Thu, 04 Dec 2014)
Log Message:
-----------
Added a static date format parser to TweetUtils that can parse the "created_at"
timestamp format of Tweet JSON, plus a couple of example JAPE grammars that use
it to convert timestamps into a more useful form.

Given the performance and thread-(un)safety of SimpleDateFormat I've done this
using Joda Time.

Modified Paths:
--------------
    gate/trunk/build/deploy/maven/gate-core.pom.template
    gate/trunk/ivy.xml
    gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetUtils.java

Added Paths:
-----------
    gate/trunk/plugins/Twitter/resources/timestamps/
    gate/trunk/plugins/Twitter/resources/timestamps/add_timestamp_date.jape
    gate/trunk/plugins/Twitter/resources/timestamps/add_timestamp_hour.jape

Modified: gate/trunk/build/deploy/maven/gate-core.pom.template
===================================================================
--- gate/trunk/build/deploy/maven/gate-core.pom.template        2014-12-03 02:24:19 UTC (rev 18482)
+++ gate/trunk/build/deploy/maven/gate-core.pom.template        2014-12-04 18:28:52 UTC (rev 18483)
@@ -305,6 +305,13 @@
             <scope>compile</scope>
         </dependency>
 
+        <!-- Used for date parsing and formatting -->
+        <dependency>
+            <groupId>joda-time</groupId>
+            <artifactId>joda-time</artifactId>
+            <version>2.6</version>
+        </dependency>
+
         <!-- Used for testing GATE -->
         <dependency>
             <groupId>junit</groupId>

Modified: gate/trunk/ivy.xml
===================================================================
--- gate/trunk/ivy.xml  2014-12-03 02:24:19 UTC (rev 18482)
+++ gate/trunk/ivy.xml  2014-12-04 18:28:52 UTC (rev 18483)
@@ -115,6 +115,9 @@
     <!-- Used for comparing XML files during some of the GATE tests -->
     <dependency org="xmlunit" name="xmlunit" rev="1.5" conf="required->master" />
 
+    <!-- Used for date and time formatting and parsing -->
+    <dependency org="joda-time" name="joda-time" rev="2.6" />
+
     <!-- Used for testing GATE -->
     <dependency org="junit" name="junit" rev="4.11" conf="compile->master,compile" />
 

Added: gate/trunk/plugins/Twitter/resources/timestamps/add_timestamp_date.jape
===================================================================
--- gate/trunk/plugins/Twitter/resources/timestamps/add_timestamp_date.jape                             (rev 0)
+++ gate/trunk/plugins/Twitter/resources/timestamps/add_timestamp_date.jape     2014-12-04 18:28:52 UTC (rev 18483)
@@ -0,0 +1,35 @@
+/*
+ * Parse the "created_at" timestamp string from each Tweet annotation and store
+ * the equivalent Java timestamp (milliseconds since the epoch) as a Long.
+ */
+Imports: {
+  import gate.corpora.twitter.TweetUtils;
+  import org.joda.time.DateTime;
+}
+
+Phase: ParseTimestamp
+Input: Tweet
+Options: control = all
+
+Rule: TweetTimestamp
+(
+  {Tweet}
+):twt
+-->
+:twt {
+  for(Annotation twt : twtAnnots) {
+    String createdAt = (String)twt.getFeatures().get("created_at");
+    if(createdAt != null) {
+      try {
+        DateTime dt = TweetUtils.CREATED_AT_FORMAT.parseDateTime(createdAt);
+
+        // store the parsed instant as a Long "timestamp" feature on the annotation
+        twt.getFeatures().put("timestamp", dt.getMillis());
+
+      } catch(IllegalArgumentException e) {
+        // the created_at string could not be parsed as a date
+        throw new NonFatalJapeException("Could not parse " + createdAt + " as a date", e);
+      }
+    }
+  }
+}

Added: gate/trunk/plugins/Twitter/resources/timestamps/add_timestamp_hour.jape
===================================================================
--- gate/trunk/plugins/Twitter/resources/timestamps/add_timestamp_hour.jape                             (rev 0)
+++ gate/trunk/plugins/Twitter/resources/timestamps/add_timestamp_hour.jape     2014-12-04 18:28:52 UTC (rev 18483)
@@ -0,0 +1,42 @@
+/*
+ * Parse the "created_at" timestamp string from a Tweet annotation and store
+ * the timestamp, truncated to the hour, as a document feature.  The format
+ * is YYYYMMDDHH as a single number, to support range-based searches.
+ *
+ * Note this only makes sense when there is just one Tweet per document.
+ */
+Imports: {
+  import gate.corpora.twitter.TweetUtils;
+  import org.joda.time.DateTime;
+}
+
+Phase: ParseTimestamp
+Input: Tweet
+Options: control = once
+
+Rule: TweetTimestamp
+(
+  {Tweet}
+):twt
+-->
+:twt {
+  for(Annotation twt : twtAnnots) {
+    String createdAt = (String)twt.getFeatures().get("created_at");
+    if(createdAt != null) {
+      try {
+        DateTime dt = TweetUtils.CREATED_AT_FORMAT.parseDateTime(createdAt);
+
+        // pack year, month, day and hour into one decimal number: YYYYMMDDHH
+        doc.getFeatures().put("hour_timestamp", Long.valueOf(
+              (long)dt.getHourOfDay()
+              + 100L * (long)dt.getDayOfMonth()
+              + 100L * 100L * (long)dt.getMonthOfYear()
+              + 100L * 100L * 100L * (long)dt.getYear()));
+
+      } catch(IllegalArgumentException e) {
+        // the created_at string could not be parsed as a date
+        throw new NonFatalJapeException("Could not parse " + createdAt + " as a date", e);
+      }
+    }
+  }
+}

Modified: gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetUtils.java
===================================================================
--- gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetUtils.java 2014-12-03 02:24:19 UTC (rev 18482)
+++ gate/trunk/plugins/Twitter/src/gate/corpora/twitter/TweetUtils.java 2014-12-04 18:28:52 UTC (rev 18483)
@@ -18,6 +18,9 @@
 import java.util.Iterator;
 import java.util.List;
 import org.apache.commons.lang.StringUtils;
+import org.joda.time.format.DateTimeFormat;
+import org.joda.time.format.DateTimeFormatter;
+
 import com.fasterxml.jackson.databind.JsonNode;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import com.fasterxml.jackson.databind.node.ArrayNode;
@@ -47,6 +50,14 @@
    * The JSON property representing entities (e.g. hashtags).
    */
   public static final String ENTITIES_ATTRIBUTE = "entities";
+  
+  /**
+   * Date parser that understands the "created_at" timestamp format, e.g.
+   * "Thu Dec 04 18:28:52 +0000 2014".  It accepts any numeric UTC offset, but
+   * the returned DateTime objects will always be anchored in UTC.
+   */
+  public static final DateTimeFormatter CREATED_AT_FORMAT = DateTimeFormat.forPattern(
+          "EEE MMM dd HH:mm:ss Z yyyy").withZoneUTC();
 
   
   public static List<Tweet> readTweets(String string) throws IOException {

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
Download BIRT iHub F-Type - The Free Enterprise-Grade BIRT Server
from Actuate! Instantly Supercharge Your Business Reports and Dashboards
with Interactivity, Sharing, Native Excel Exports, App Integration & more
Get technology previously reserved for billion-dollar corporations, FREE
http://pubads.g.doubleclick.net/gampad/clk?id=164703151&iu=/4140/ostg.clktrk
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs

Reply via email to