Revision: 18117
http://sourceforge.net/p/gate/code/18117
Author: markagreenwood
Date: 2014-06-23 12:40:36 +0000 (Mon, 23 Jun 2014)
Log Message:
-----------
the new DataSift format plugin
Added Paths:
-----------
gate/trunk/plugins/Format_DataSift/
gate/trunk/plugins/Format_DataSift/.classpath
gate/trunk/plugins/Format_DataSift/.project
gate/trunk/plugins/Format_DataSift/build.xml
gate/trunk/plugins/Format_DataSift/creole.xml
gate/trunk/plugins/Format_DataSift/src/
gate/trunk/plugins/Format_DataSift/src/gate/
gate/trunk/plugins/Format_DataSift/src/gate/corpora/
gate/trunk/plugins/Format_DataSift/src/gate/corpora/DataSiftFormat.java
gate/trunk/plugins/Format_DataSift/src/gate/corpora/datasift/
gate/trunk/plugins/Format_DataSift/src/gate/corpora/datasift/DataSift.java
gate/trunk/plugins/Format_DataSift/src/gate/corpora/datasift/Interaction.java
Index: gate/trunk/plugins/Format_DataSift
===================================================================
--- gate/trunk/plugins/Format_DataSift 2014-06-23 11:35:16 UTC (rev 18116)
+++ gate/trunk/plugins/Format_DataSift 2014-06-23 12:40:36 UTC (rev 18117)
Property changes on: gate/trunk/plugins/Format_DataSift
___________________________________________________________________
Added: svn:ignore
## -0,0 +1,2 ##
+classes
+DataSift.jar
Added: gate/trunk/plugins/Format_DataSift/.classpath
===================================================================
--- gate/trunk/plugins/Format_DataSift/.classpath
(rev 0)
+++ gate/trunk/plugins/Format_DataSift/.classpath 2014-06-23 12:40:36 UTC
(rev 18117)
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<classpath>
+ <classpathentry kind="src" path="src"/>
+ <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
+ <classpathentry combineaccessrules="false" kind="src" path="/GATE"/>
+ <classpathentry kind="output" path="classes"/>
+</classpath>
Added: gate/trunk/plugins/Format_DataSift/.project
===================================================================
--- gate/trunk/plugins/Format_DataSift/.project (rev 0)
+++ gate/trunk/plugins/Format_DataSift/.project 2014-06-23 12:40:36 UTC (rev
18117)
@@ -0,0 +1,17 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+ <name>GATE-plugin-Format_DataSift</name>
+ <comment></comment>
+ <projects>
+ </projects>
+ <buildSpec>
+ <buildCommand>
+ <name>org.eclipse.jdt.core.javabuilder</name>
+ <arguments>
+ </arguments>
+ </buildCommand>
+ </buildSpec>
+ <natures>
+ <nature>org.eclipse.jdt.core.javanature</nature>
+ </natures>
+</projectDescription>
Added: gate/trunk/plugins/Format_DataSift/build.xml
===================================================================
--- gate/trunk/plugins/Format_DataSift/build.xml
(rev 0)
+++ gate/trunk/plugins/Format_DataSift/build.xml 2014-06-23 12:40:36 UTC
(rev 18117)
@@ -0,0 +1,85 @@
+<project name="Format_DataSift" basedir=".">
+ <!-- Prevent Ant from warning about includeantruntime not being set -->
+ <property name="build.sysclasspath" value="ignore" />
+
+ <property file="build.properties" />
+
+ <property name="gate.home" location="../.." />
+ <property name="gate.lib" location="${gate.home}/lib" />
+ <property name="gate.jar" location="${gate.home}/bin/gate.jar" />
+ <property name="src.dir" location="src" />
+ <property name="classes.dir" location="classes" />
+ <property name="jar.location" location="DataSift.jar" />
+ <property name="doc.dir" location="doc" />
+ <property name="javadoc.dir" location="${doc.dir}/javadoc" />
+
+ <!-- Path to compile - includes gate.jar and GATE/lib/*.jar -->
+ <path id="compile.classpath">
+ <pathelement location="${gate.jar}" />
+ <fileset dir="${gate.lib}">
+ <include name="**/*.jar" />
+ <include name="**/*.zip" />
+ </fileset>
+ </path>
+
+ <!-- create build directory structure -->
+ <target name="prepare">
+ <mkdir dir="${classes.dir}" />
+ </target>
+
+ <target name="resources" depends="prepare">
+ <!-- <copy todir="${classes.dir}/gate/resources" includeEmptyDirs="true">
+ <fileset dir="${src.dir}/gate/resources" />
+ </copy> -->
+ </target>
+
+ <!-- compile the source -->
+ <target name="compile" depends="prepare">
+ <javac classpathref="compile.classpath" srcdir="${src.dir}" destdir="${classes.dir}" debug="true" debuglevel="lines,source" source="1.7" target="1.7"/>
+ </target>
+
+ <!-- create the JAR file -->
+ <target name="jar" depends="compile, resources">
+ <jar destfile="${jar.location}" update="false" basedir="${classes.dir}" />
+ </target>
+
+ <!-- remove the generated .class files -->
+ <target name="clean.classes">
+ <delete dir="${classes.dir}" />
+ </target>
+
+ <!-- Clean up - remove .class and .jar files -->
+ <target name="clean" depends="clean.classes">
+ <delete file="${jar.location}" />
+ </target>
+
+ <!-- Targets used by the main GATE build file:
+ build: build the plugin - just calls "jar" target
+ test : run the unit tests - there aren't any
+ distro.prepare: remove intermediate files that shouldn't be in the
+ distribution
+ -->
+ <target name="build" depends="jar" />
+ <target name="test" />
+ <target name="distro.prepare" depends="clean.classes" />
+
+
+ <!-- Build JavaDoc documentation (source level must match the javac source level, 1.7) -->
+ <target name="doc.prepare">
+   <mkdir dir="${javadoc.dir}" />
+ </target>
+
+ <target name="javadoc" depends="doc.prepare">
+   <javadoc destdir="${javadoc.dir}" packagenames="*"
+            classpathref="compile.classpath"
+            encoding="UTF-8"
+            windowtitle="Format_DataSift JavaDoc"
+            source="1.7">
+     <sourcepath>
+       <pathelement location="${src.dir}" />
+     </sourcepath>
+     <link href="http://docs.oracle.com/javase/7/docs/api/" />
+     <link href="http://gate.ac.uk/gate/doc/javadoc/" />
+   </javadoc>
+ </target>
+</project>
Added: gate/trunk/plugins/Format_DataSift/creole.xml
===================================================================
--- gate/trunk/plugins/Format_DataSift/creole.xml
(rev 0)
+++ gate/trunk/plugins/Format_DataSift/creole.xml 2014-06-23 12:40:36 UTC
(rev 18117)
@@ -0,0 +1,5 @@
+<CREOLE-DIRECTORY>
+ <CREOLE>
+ <JAR scan="true">DataSift.jar</JAR>
+ </CREOLE>
+</CREOLE-DIRECTORY>
Added: gate/trunk/plugins/Format_DataSift/src/gate/corpora/DataSiftFormat.java
===================================================================
--- gate/trunk/plugins/Format_DataSift/src/gate/corpora/DataSiftFormat.java
(rev 0)
+++ gate/trunk/plugins/Format_DataSift/src/gate/corpora/DataSiftFormat.java
2014-06-23 12:40:36 UTC (rev 18117)
@@ -0,0 +1,111 @@
+package gate.corpora;
+
+import gate.AnnotationSet;
+import gate.DocumentContent;
+import gate.Factory;
+import gate.FeatureMap;
+import gate.Resource;
+import gate.corpora.datasift.DataSift;
+import gate.corpora.datasift.Interaction;
+import gate.creole.ResourceInstantiationException;
+import gate.creole.metadata.AutoInstance;
+import gate.creole.metadata.CreoleResource;
+import gate.util.DocumentFormatException;
+import gate.util.InvalidOffsetException;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.LinkedHashMap;
+import java.util.Map;
+
+import org.apache.commons.lang.StringUtils;
+
+import com.fasterxml.jackson.core.JsonFactory;
+import com.fasterxml.jackson.core.JsonParser;
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+@CreoleResource(name = "GATE DataSift JSON Document Format", isPrivate = true,
autoinstances = {@AutoInstance(hidden = true)}, comment = "Format parser for
DataSift JSON files")
+public class DataSiftFormat extends TextualDocumentFormat {
+
+ /** Initialise this resource, and return it. */
+ public Resource init() throws ResourceInstantiationException {
+ // Register ad hoc MIME-type
+ // There is an application/json mime type, but I don't think
+ // we want everything to be handled this way?
+ MimeType mime = new MimeType("text", "x-json-datasift");
+ // Register the class handler for this MIME-type
+ mimeString2ClassHandlerMap.put(mime.getType() + "/" + mime.getSubtype(),
+ this);
+ // Register the mime type with string
+ mimeString2mimeTypeMap.put(mime.getType() + "/" + mime.getSubtype(), mime);
+ // Register file suffixes for this mime type
+ suffixes2mimeTypeMap.put("datasift.json", mime);
+ // Register magic numbers for this mime type
+ // magic2mimeTypeMap.put("Subject:",mime);
+ // Set the mimeType for this language resource
+ setMimeType(mime);
+ return this;
+ }
+
+ @Override
+ public void cleanup() {
+ super.cleanup();
+
+ MimeType mime = getMimeType();
+
+ mimeString2ClassHandlerMap.remove(mime.getType() + "/" +
mime.getSubtype());
+ mimeString2mimeTypeMap.remove(mime.getType() + "/" + mime.getSubtype());
+ suffixes2mimeTypeMap.remove("datasift.json");
+ }
+
+ @Override
+ public void unpackMarkup(gate.Document doc) throws DocumentFormatException {
+ if((doc == null)
+ || (doc.getSourceUrl() == null && doc.getContent() == null)) { throw
new DocumentFormatException(
+ "GATE document is null or no content found. Nothing to parse!"); }
+
+ setNewLineProperty(doc);
+ String jsonString = StringUtils.trimToEmpty(doc.getContent().toString());
+
+ // TODO build the new content
+ StringBuilder concatenation = new StringBuilder();
+
+ try {
+ ObjectMapper om = new ObjectMapper();
+
+ /*List<Interaction> twits = om.readValue(jsonString, new
TypeReference<List<Interaction>>() {
+ });*/
+
+ JsonFactory factory = new JsonFactory(om);
+ JsonParser parser = factory.createParser(jsonString);
+
+ Map<DataSift,Long> offsets = new HashMap<DataSift,Long>();
+
+ Iterator<DataSift> it = parser.readValuesAs(DataSift.class);
+ while(it.hasNext()) {
+ DataSift ds = it.next();
+ offsets.put(ds,(long)concatenation.length());
+ concatenation.append(ds.getInteraction().getContent()).append("\n\n");
+ }
+
+ // Set new document content
+ DocumentContent newContent =
+ new DocumentContentImpl(concatenation.toString());
+
+ doc.edit(0L, doc.getContent().size(), newContent);
+
+ AnnotationSet originalMarkups = doc.getAnnotations("Original markups");
+ for (Map.Entry<DataSift, Long> item : offsets.entrySet()) {
+ DataSift ds = item.getKey();
+ Interaction interaction = ds.getInteraction();
+ Long start = item.getValue();
+
+
originalMarkups.add(start,start+interaction.getContent().length(),"Interaction",interaction.asFeatureMap());
+ }
+
+ //TODO add annotations and features
+ } catch(InvalidOffsetException | IOException e) {
+ throw new DocumentFormatException(e);
+ }
+ }
+}
Added:
gate/trunk/plugins/Format_DataSift/src/gate/corpora/datasift/DataSift.java
===================================================================
--- gate/trunk/plugins/Format_DataSift/src/gate/corpora/datasift/DataSift.java
(rev 0)
+++ gate/trunk/plugins/Format_DataSift/src/gate/corpora/datasift/DataSift.java
2014-06-23 12:40:36 UTC (rev 18117)
@@ -0,0 +1,19 @@
+package gate.corpora.datasift;
+
+import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
+import com.fasterxml.jackson.databind.annotation.JsonSerialize;
+import com.fasterxml.jackson.databind.annotation.JsonSerialize.Inclusion;
+
+/**
+ * Bean for a single DataSift record. Only the "interaction" property is
+ * mapped; unknown JSON properties are ignored.
+ */
+@JsonSerialize(include = Inclusion.NON_NULL)
+@JsonIgnoreProperties(ignoreUnknown = true)
+public class DataSift {
+
+  /** The interaction carried by this record; null until it has been set. */
+  private Interaction interaction;
+
+  /**
+   * @return the interaction of this record, or null if none was set
+   */
+  public Interaction getInteraction() {
+    return this.interaction;
+  }
+
+  /**
+   * @param interaction the interaction to store in this record
+   */
+  public void setInteraction(final Interaction interaction) {
+    this.interaction = interaction;
+  }
+}
Added:
gate/trunk/plugins/Format_DataSift/src/gate/corpora/datasift/Interaction.java
===================================================================
---
gate/trunk/plugins/Format_DataSift/src/gate/corpora/datasift/Interaction.java
(rev 0)
+++
gate/trunk/plugins/Format_DataSift/src/gate/corpora/datasift/Interaction.java
2014-06-23 12:40:36 UTC (rev 18117)
@@ -0,0 +1,87 @@
+package gate.corpora.datasift;
+
+import gate.Factory;
+import gate.FeatureMap;
+
+import java.util.Map;
+
+import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.databind.annotation.JsonSerialize;
+import com.fasterxml.jackson.databind.annotation.JsonSerialize.Inclusion;
+
+/**
+ * Bean for the "interaction" object of a DataSift record. Unknown JSON
+ * properties are ignored; the mapped ones can be exported as a GATE feature
+ * map through {@link #asFeatureMap()}.
+ */
+@JsonSerialize(include = Inclusion.NON_NULL)
+@JsonIgnoreProperties(ignoreUnknown = true)
+public class Interaction {
+
+  private String content;
+
+  private String type;
+
+  private String link;
+
+  private String id;
+
+  private String created;
+
+  private Map<String,Object> author;
+
+  /** @return the textual content of the interaction */
+  public String getContent() {
+    return this.content;
+  }
+
+  /** @param content the textual content of the interaction */
+  public void setContent(final String content) {
+    this.content = content;
+  }
+
+  /** @return the value bound to the "id" JSON property */
+  @JsonProperty("id")
+  public String getID() {
+    return this.id;
+  }
+
+  /** @param id the interaction identifier */
+  public void setID(final String id) {
+    this.id = id;
+  }
+
+  /** @return the value bound to the "created_at" JSON property */
+  @JsonProperty("created_at")
+  public String getCreatedAt() {
+    return this.created;
+  }
+
+  /** @param created the creation time, kept as the raw string */
+  public void setCreatedAt(final String created) {
+    this.created = created;
+  }
+
+  /** @return the interaction type */
+  public String getType() {
+    return this.type;
+  }
+
+  /** @param type the interaction type */
+  public void setType(final String type) {
+    this.type = type;
+  }
+
+  /** @return the author properties, or null if none were set */
+  public Map<String,Object> getAuthor() {
+    return this.author;
+  }
+
+  /** @param author the author properties */
+  public void setAuthor(final Map<String,Object> author) {
+    this.author = author;
+  }
+
+  /** @return the link of the interaction */
+  public String getLink() {
+    return this.link;
+  }
+
+  /** @param link the link of the interaction */
+  public void setLink(final String link) {
+    this.link = link;
+  }
+
+  /**
+   * Exports this interaction as a feature map. The features "type", "link",
+   * "id" and "created_at" are present only when the corresponding value is
+   * neither null nor blank; every author entry is added under its key
+   * prefixed with "author_". The content itself is not included.
+   *
+   * @return a newly created feature map
+   */
+  public FeatureMap asFeatureMap() {
+    FeatureMap result = Factory.newFeatureMap();
+
+    putUnlessBlank(result, "type", this.type);
+    putUnlessBlank(result, "link", this.link);
+    putUnlessBlank(result, "id", this.id);
+    putUnlessBlank(result, "created_at", this.created);
+
+    if(this.author == null) {
+      return result;
+    }
+
+    for(Map.Entry<String,Object> entry : this.author.entrySet()) {
+      result.put("author_" + entry.getKey(), entry.getValue());
+    }
+    return result;
+  }
+
+  /** Stores value under key unless it is null or whitespace-only. */
+  private static void putUnlessBlank(FeatureMap target, String key,
+      String value) {
+    if(value == null || value.trim().isEmpty()) {
+      return;
+    }
+    target.put(key, value);
+  }
+}
+
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
HPCC Systems Open Source Big Data Platform from LexisNexis Risk Solutions
Find What Matters Most in Your Big Data with HPCC Systems
Open Source. Fast. Scalable. Simple. Ideal for Dirty Data.
Leverages Graph Analysis for Fast Processing & Easy Data Exploration
http://p.sf.net/sfu/hpccsystems
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs