Revision: 17888
          http://sourceforge.net/p/gate/code/17888
Author:   markagreenwood
Date:     2014-04-21 09:58:53 +0000 (Mon, 21 Apr 2014)
Log Message:
-----------
Applied patch #40 from SourceForge to give us a wrapper for the OpenNLP parser.
I had to modify the patch slightly due to changes in OpenNLP since it was
submitted in 2010, but the output it produces looks reasonable. I haven't
included the parser in the default OpenNLP application, though.

Added Paths:
-----------
    gate/trunk/plugins/OpenNLP/src/gate/opennlp/OpenNlpParser.java

Added: gate/trunk/plugins/OpenNLP/src/gate/opennlp/OpenNlpParser.java
===================================================================
--- gate/trunk/plugins/OpenNLP/src/gate/opennlp/OpenNlpParser.java	(rev 0)
+++ gate/trunk/plugins/OpenNLP/src/gate/opennlp/OpenNlpParser.java	2014-04-21 09:58:53 UTC (rev 17888)
@@ -0,0 +1,178 @@
+package gate.opennlp;
+
+import gate.Annotation;
+import gate.AnnotationSet;
+import gate.FeatureMap;
+import gate.Resource;
+import gate.creole.AbstractLanguageAnalyser;
+import gate.creole.ExecutionException;
+import gate.creole.ResourceInstantiationException;
+import gate.creole.metadata.CreoleParameter;
+import gate.creole.metadata.CreoleResource;
+import gate.creole.metadata.Optional;
+import gate.creole.metadata.RunTime;
+
+import java.io.InputStream;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+import opennlp.tools.parser.Parse;
+import opennlp.tools.parser.Parser;
+import opennlp.tools.parser.ParserFactory;
+import opennlp.tools.parser.ParserModel;
+import opennlp.tools.util.Span;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.log4j.Logger;
+
+/**
+ * Wrapper for the opennlp parser
+ */
+@CreoleResource(name="OpenNLP Parser")
+public class OpenNlpParser extends AbstractLanguageAnalyser {
+
+  public static final long serialVersionUID = 1L;
+
+  private static final Logger logger = Logger.getLogger(OpenNlpChunker.class);
+
+  String inputASName;
+
+  public String getInputASName() {
+    return inputASName;
+  }
+
+  @CreoleParameter
+  @RunTime
+  @Optional
+  public void setInputASName(String inputASName) {
+    this.inputASName = inputASName;
+  }
+
+  Parser parser = null;
+
+  private URL model;
+
+  private AnnotationSet annotations;
+
+  @Override
+  public void execute() throws ExecutionException {
+    // text doc annotations
+    if(inputASName != null && inputASName.length() > 0) {
+      annotations = document.getAnnotations(inputASName);
+    } else {
+      annotations = document.getAnnotations();
+    }
+
+    // get token and sentence annotations
+    AnnotationSet sentences = annotations.get("Sentence");
+    AnnotationSet tokensAS = annotations.get("Token");
+
+    if(sentences != null && sentences.size() > 0 && tokensAS != null &&
+      tokensAS.size() > 0) {
+
+      List<Annotation> sentList = new ArrayList<Annotation>(sentences);
+      java.util.Collections.sort(sentList, new gate.util.OffsetComparator());
+
+      try {
+        for(Annotation annotation : sentList) {
+          AnnotationSet sentenceTokens =
+            annotations.get("Token", annotation.getStartNode().getOffset(),
+              annotation.getEndNode().getOffset());
+
+          List<Annotation> annList = new ArrayList<Annotation>(sentenceTokens);
+          Collections.sort(annList, new gate.util.OffsetComparator());
+
+          Long sentStart = annotation.getStartNode().getOffset();
+          Long sentEnd = annotation.getEndNode().getOffset();
+          String text =
+            document.getContent().getContent(sentStart, sentEnd).toString();
+          Parse parse =
+            new Parse(text, new Span(0, text.length()), "INC", 1, null);
+
+          for(Annotation ann : annList) {
+            Long start = ann.getStartNode().getOffset() - sentStart;
+            Long end = ann.getEndNode().getOffset() - sentStart;
+            parse.insert(new Parse(text, new Span(start.intValue(), end
+              .intValue()), "TK", 0, 0));
+          }
+
+          Parse result = parser.parse(parse);
+
+          annotate(result, sentStart);
+        }
+      } catch(gate.util.InvalidOffsetException e) {
+        e.printStackTrace();
+        throw new ExecutionException(e);
+      }
+    } else {
+      throw new ExecutionException("No sentences or tokens to process!\n"
+        + "Please run a sentence splitter " + "and tokeniser first!");
+    }
+  }
+
+  private Integer annotate(Parse p, Long sentStart)
+    throws gate.util.InvalidOffsetException {
+
+    List<Integer> childIDs = new ArrayList<Integer>();
+    Parse[] children = p.getChildren();
+    for(Parse cp : children) {
+      Integer childID = annotate(cp, sentStart);
+      if(childID >= 0) childIDs.add(childID);
+    }
+
+    String type = p.getType();
+    if(type.equals("TK")) return -1;
+
+    Span span = p.getSpan();
+    Long start = sentStart + span.getStart();
+    Long end = sentStart + span.getEnd();
+
+    FeatureMap fm = gate.Factory.newFeatureMap();
+    String text = document.getContent().getContent(start, end).toString();
+    fm.put("text", text);
+    fm.put("cat", p.getType());
+    if(!childIDs.isEmpty()) fm.put("consists", childIDs);
+
+    return annotations.add(start, end, "SyntaxTreeNode", fm);
+  }
+
+  public URL getModel() {
+    return model;
+  }
+
+  /* getters and setters for the PR */
+  /* public members */
+
+  @Override
+  public Resource init() throws ResourceInstantiationException {
+    InputStream modelIn = null;
+    try {
+      modelIn = model.openStream();
+
+      ParserModel model = new ParserModel(modelIn);
+
+      parser = ParserFactory.create(model);
+    } catch(Exception e) {
+      e.printStackTrace();
+      logger.error("Parser can not be initialized!");
+      throw new RuntimeException("Parser cannot be initialized!", e);
+    } finally {
+      IOUtils.closeQuietly(modelIn);
+    }
+    return this;
+  }
+
+  @Override
+  public void reInit() throws ResourceInstantiationException {
+    init();
+  }
+
+  @CreoleParameter(defaultValue = "models/english/en-parser-chunking.bin",
+      comment = "location of the parser model")
+  public void setModel(URL model) {
+    this.model = model;
+  }
+
+}
\ No newline at end of file

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
Start Your Social Network Today - Download eXo Platform
Build your Enterprise Intranet with eXo Platform Software
Java Based Open Source Intranet - Social, Extensible, Cloud Ready
Get Started Now And Turn Your Intranet Into A Collaboration Platform
http://p.sf.net/sfu/ExoPlatform
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs

Reply via email to