Revision: 19526
http://sourceforge.net/p/gate/code/19526
Author: markagreenwood
Date: 2016-08-19 17:01:15 +0000 (Fri, 19 Aug 2016)
Log Message:
-----------
formatting and removed some unused stuff
Modified Paths:
--------------
gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/DependencyMode.java
gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/DependencyRelation.java
gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/NER.java
gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/Parser.java
gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/StanfordSentence.java
gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/Tagger.java
gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/Tokenizer.java
gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/apps/EnglishDependencies.java
gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/apps/EnglishPOSDependencies.java
Modified:
gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/DependencyMode.java
===================================================================
--- gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/DependencyMode.java	2016-08-19 16:50:05 UTC (rev 19525)
+++ gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/DependencyMode.java	2016-08-19 17:01:15 UTC (rev 19526)
@@ -26,35 +26,23 @@
import edu.stanford.nlp.trees.TypedDependency;
public enum DependencyMode {
- Typed,
- AllTyped,
- TypedCollapsed,
- TypedCCprocessed;
-
-
- protected static Collection<TypedDependency> getDependencies(GrammaticalStructure gs,
- DependencyMode mode, boolean includeExtras) {
+ Typed, AllTyped, TypedCollapsed, TypedCCprocessed;
+ protected static Collection<TypedDependency> getDependencies(
+ GrammaticalStructure gs, DependencyMode mode, boolean includeExtras) {
Collection<TypedDependency> result = null;
-
Extras incl = Extras.NONE;
if(includeExtras) {
incl = Extras.MAXIMAL;
}
-
- if (mode.equals(Typed)) {
+ if(mode.equals(Typed)) {
result = gs.typedDependencies(incl);
- }
- else if (mode.equals(AllTyped)) {
+ } else if(mode.equals(AllTyped)) {
result = gs.allTypedDependencies();
- }
- else if (mode.equals(TypedCollapsed)) {
+ } else if(mode.equals(TypedCollapsed)) {
result = gs.typedDependenciesCollapsed(incl);
- }
- else if (mode.equals(TypedCCprocessed)) {
+ } else if(mode.equals(TypedCCprocessed)) {
result = gs.typedDependenciesCCprocessed(incl);
}
-
return result;
}
-
}
Modified:
gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/DependencyRelation.java
===================================================================
--- gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/DependencyRelation.java	2016-08-19 16:50:05 UTC (rev 19525)
+++ gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/DependencyRelation.java	2016-08-19 17:01:15 UTC (rev 19526)
@@ -14,34 +14,33 @@
*
* You should have received a copy of the GNU General Public License along with
* this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * $Id: DependencyRelation.java 15600 2012-03-19 15:40:56Z adamfunk $
+ *
+ * $Id: DependencyRelation.java 15600 2012-03-19 15:40:56Z adamfunk $
*/
package gate.stanford;
import java.io.Serializable;
/**
- * Simple data structure representing a single dependency relation. The "target"
- * is the Annotation ID of the dependent; the "type" is the dependency
- * tag (<a href="http://nlp.stanford.edu/software/parser-faq.shtml#c">the
- * Stanford Parser documentation</a> contains links to the tagset</a>; for example,
- * nsubj = "nominal subject", dobj = "direct object).
+ * Simple data structure representing a single dependency relation. The "target"
+ * is the Annotation ID of the dependent; the "type" is the dependency tag (<a
+ * href="http://nlp.stanford.edu/software/parser-faq.shtml#c">the Stanford
+ * Parser documentation</a> contains links to the tagset</a>; for example, nsubj
+ * = "nominal subject", dobj = "direct object).
*/
public class DependencyRelation implements Serializable {
-
private static final long serialVersionUID = -7842607116149222052L;
/**
* The type of the dependency relation (det, amod, etc.).
*/
private String type;
-
+
/**
* The ID of the token that is the target of this relation.
*/
private Integer targetId;
-
+
public DependencyRelation(String type, Integer targetId) {
this.type = type;
this.targetId = targetId;
@@ -49,6 +48,7 @@
/**
* Return the dependency tag (type).
+ *
* @return the dependency tag
*/
public String getType() {
@@ -57,7 +57,9 @@
/**
* Set the dependency tag.
- * @param type dependency tag
+ *
+ * @param type
+ * dependency tag
*/
public void setType(String type) {
this.type = type;
@@ -65,6 +67,7 @@
/**
* Return the GATE Annotation ID of the dependent.
+ *
* @return the Annotation ID
*/
public Integer getTargetId() {
@@ -73,16 +76,17 @@
/**
* Set the Annotation ID of the dependent.
- * @param targetId the Annotation ID
+ *
+ * @param targetId
+ * the Annotation ID
*/
public void setTargetId(Integer targetId) {
this.targetId = targetId;
}
-
+
/**
- * Format the data structure for display.
- * For example, if type is "dobj" and the dependent has Annotation ID 37,
- * return the String "dobj(37)".
+ * Format the data structure for display. For example, if type is "dobj" and
+ * the dependent has Annotation ID 37, return the String "dobj(37)".
*/
public String toString() {
return type + "(" + targetId + ")";
Modified:
gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/NER.java
===================================================================
--- gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/NER.java	2016-08-19 16:50:05 UTC (rev 19525)
+++ gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/NER.java	2016-08-19 17:01:15 UTC (rev 19526)
@@ -19,7 +19,6 @@
*
* $Id: NER.java 15468 2013-10-22 21:13:15Z $
*/
-
package gate.stanford;
import edu.stanford.nlp.ie.AbstractSequenceClassifier;
@@ -56,9 +55,8 @@
/**
* This class is a wrapper for the Stanford NER tool v3.2.0.
*/
-@CreoleResource(name = "Stanford NER", comment = "Stanford Named Entity Recogniser", icon = "ne-transducer", helpURL="http://gate.ac.uk/userguide/sec:misc:creole:stanford")
+@CreoleResource(name = "Stanford NER", comment = "Stanford Named Entity Recogniser", icon = "ne-transducer", helpURL = "http://gate.ac.uk/userguide/sec:misc:creole:stanford")
public class NER extends AbstractLanguageAnalyser {
-
private static final long serialVersionUID = -6001372186847970080L;
public static final String TAG_DOCUMENT_PARAMETER_NAME = "document";
@@ -68,10 +66,10 @@
public static final String TAG_ENCODING_PARAMETER_NAME = "encoding";
public static final String BASE_TOKEN_ANNOTATION_TYPE_PARAMETER_NAME =
- "baseTokenAnnotationType";
+ "baseTokenAnnotationType";
public static final String BASE_SENTENCE_ANNOTATION_TYPE_PARAMETER_NAME =
- "baseSentenceAnnotationType";
+ "baseSentenceAnnotationType";
public static final String TAG_OUTPUT_AS_PARAMETER_NAME = "outputASName";
@@ -97,7 +95,8 @@
if(tagger == null) {
fireStatusChanged("Loading Stanford NER model");
try {
- // nasty workaround for stanford NER's path format inconsistency - tagger is content with uris beginning file:, ner labeller is not
+ // nasty workaround for stanford NER's path format inconsistency -
+ // tagger is content with uris beginning file:, ner labeller is not
tagger =
CRFClassifier.getClassifier(modelFile.toString().substring(5));
} catch(Exception e) {
throw new ResourceInstantiationException(e);
@@ -117,172 +116,143 @@
// check the parameters
if(document == null)
throw new ExecutionException("No document to process!");
-
AnnotationSet inputAS = document.getAnnotations(inputASName);
AnnotationSet outputAS = document.getAnnotations(outputASName);
-
- if(baseTokenAnnotationType == null ||
- baseTokenAnnotationType.trim().length() == 0) { throw new
ExecutionException(
- "No base Token Annotation Type provided!"); }
-
- if(baseSentenceAnnotationType == null ||
- baseSentenceAnnotationType.trim().length() == 0) { throw new
ExecutionException(
- "No base Sentence Annotation Type provided!"); }
-
+ if(baseTokenAnnotationType == null
+ || baseTokenAnnotationType.trim().length() == 0) { throw new
ExecutionException(
+ "No base Token Annotation Type provided!"); }
+ if(baseSentenceAnnotationType == null
+ || baseSentenceAnnotationType.trim().length() == 0) { throw new
ExecutionException(
+ "No base Sentence Annotation Type provided!"); }
AnnotationSet sentencesAS = inputAS.get(baseSentenceAnnotationType);
AnnotationSet tokensAS = inputAS.get(baseTokenAnnotationType);
- if(sentencesAS != null && sentencesAS.size() > 0 && tokensAS != null &&
- tokensAS.size() > 0) {
+ if(sentencesAS != null && sentencesAS.size() > 0 && tokensAS != null
+ && tokensAS.size() > 0) {
long startTime = System.currentTimeMillis();
fireStatusChanged("NER searching " + document.getName());
fireProgressChanged(0);
-
// prepare the input for CRFClassifier
List<CoreLabel> sentenceForTagger = new ArrayList<CoreLabel>();
-
// define a comparator for annotations by start offset
OffsetComparator offsetComparator = new OffsetComparator();
-
// read all the tokens and all the sentences
List<Annotation> sentencesList = new ArrayList<Annotation>(sentencesAS);
Collections.sort(sentencesList, offsetComparator);
List<Annotation> tokensList = new ArrayList<Annotation>(tokensAS);
Collections.sort(tokensList, offsetComparator);
-
Iterator<Annotation> sentencesIter = sentencesList.iterator();
ListIterator<Annotation> tokensIter = tokensList.listIterator();
-
List<Annotation> tokensInCurrentSentence = new ArrayList<Annotation>();
Annotation currentToken = tokensIter.next();
int sentIndex = 0;
int sentCnt = sentencesAS.size();
-
// go through sentence annotations in the document
while(sentencesIter.hasNext()) {
Annotation currentSentence = sentencesIter.next();
-
// reset sentence-level processing variables
tokensInCurrentSentence.clear();
sentenceForTagger.clear();
-
// while we have sane tokens
- while(currentToken != null &&
- currentToken.getEndNode().getOffset()
- .compareTo(currentSentence.getEndNode().getOffset()) <= 0) {
-
+ while(currentToken != null
+ && currentToken.getEndNode().getOffset()
+ .compareTo(currentSentence.getEndNode().getOffset()) <= 0) {
// If we're only labelling Tokens within baseSentenceAnnotationType,
// don't add the sentence if the Tokens aren't within the span of
// baseSentenceAnnotationType
if(currentToken.withinSpanOf(currentSentence)) {
tokensInCurrentSentence.add(currentToken);
-
- // build a stanford nlp representation of the token and add it to
the sequence
+ // build a stanford nlp representation of the token and add it to
+ // the sequence
CoreLabel currentLabel = new CoreLabel();
-
currentLabel.setWord((String)currentToken.getFeatures().get(TOKEN_STRING_FEATURE_NAME));
-
+ currentLabel.setWord((String)currentToken.getFeatures().get(
+ TOKEN_STRING_FEATURE_NAME));
sentenceForTagger.add(currentLabel);
}
currentToken = (tokensIter.hasNext() ? tokensIter.next() : null);
}
-
// if the sentence doesn't contain any tokens (which is a bit weird but
// is possible) then don't try running the labeller
if(sentenceForTagger.isEmpty()) continue;
-
// run the labeller
List<CoreLabel> taggerResults =
- tagger.classifySentence(sentenceForTagger);
-
+ tagger.classifySentence(sentenceForTagger);
// add the results
// make sure no malfunction occurred
if(taggerResults.size() != tokensInCurrentSentence.size())
throw new ExecutionException(
- "NER labeller malfunction: the output size (" +
- taggerResults.size() + ") is different from the input size (" +
- tokensInCurrentSentence.size() + ")!");
-
+ "NER labeller malfunction: the output size ("
+ + taggerResults.size()
+ + ") is different from the input size ("
+ + tokensInCurrentSentence.size() + ")!");
// proceed through the annotated sequence
Iterator<CoreLabel> resIter = taggerResults.iterator();
Iterator<Annotation> tokIter = tokensInCurrentSentence.iterator();
-
String previousLabel = outsideLabel;
Long previousEnd = new Long(-1);
Long entityStart = new Long(-1);
-
- //No idea why this was there so lets comment it out
- //Long entityEnd = new Long(-1);
-
+ // No idea why this was there so lets comment it out
+ // Long entityEnd = new Long(-1);
Annotation annot;
String nerLabel = "";
-
while(resIter.hasNext()) {
-
// for each labelled token..
annot = tokIter.next();
CoreLabel word = resIter.next();
nerLabel = word.get(CoreAnnotations.AnswerAnnotation.class);
-
// falling edge transition: entity ends
// guard against this triggering at document start
- if (!nerLabel.equals(previousLabel) &&
!previousLabel.equals(outsideLabel) && entityStart != -1) {
-
-// System.out.println("falling edge");
+ if(!nerLabel.equals(previousLabel)
+ && !previousLabel.equals(outsideLabel) && entityStart != -1) {
+ // System.out.println("falling edge");
// get final bound; add new annotation in output AS
try {
- outputAS.add(entityStart, previousEnd, previousLabel, new
SimpleFeatureMapImpl());
- } catch (InvalidOffsetException e) {
+ outputAS.add(entityStart, previousEnd, previousLabel,
+ new SimpleFeatureMapImpl());
+ } catch(InvalidOffsetException e) {
System.out.println("Token alignment problem:" + e);
}
-
}
-
// rising edge transition: entity starts
- if (!nerLabel.equals(previousLabel) &&
!nerLabel.equals(outsideLabel)) {
-// System.out.println("rising edge");
+ if(!nerLabel.equals(previousLabel) &&
!nerLabel.equals(outsideLabel)) {
+ // System.out.println("rising edge");
entityStart = annot.getStartNode().getOffset();
}
-// System.out.println(word.word() + "/" + nerLabel);
-
+ // System.out.println(word.word() + "/" + nerLabel);
previousLabel = nerLabel;
previousEnd = annot.getEndNode().getOffset();
-
}
-
// clean up, in case last token in sentence was in an entity
- if (!nerLabel.equals(outsideLabel)) {
+ if(!nerLabel.equals(outsideLabel)) {
try {
- outputAS.add(entityStart, previousEnd, previousLabel, new
SimpleFeatureMapImpl());
- } catch (InvalidOffsetException e) {
+ outputAS.add(entityStart, previousEnd, previousLabel,
+ new SimpleFeatureMapImpl());
+ } catch(InvalidOffsetException e) {
System.out.println("Token alignment problem:" + e);
}
}
-
fireProgressChanged(sentIndex++ * 100 / sentCnt);
-
}
-
fireProcessFinished();
- fireStatusChanged(document.getName() +
- " tagged in " +
- NumberFormat.getInstance().format(
- (double)(System.currentTimeMillis() - startTime) / 1000) +
- " seconds!");
+ fireStatusChanged(document.getName()
+ + " tagged in "
+ + NumberFormat.getInstance().format(
+ (double)(System.currentTimeMillis() - startTime) / 1000)
+ + " seconds!");
} else {
if(failOnMissingInputAnnotations) {
throw new ExecutionException(
- "No sentences or tokens to process in document " +
- document.getName() + "\n" + "Please run a sentence splitter " +
- "and tokeniser first!");
+ "No sentences or tokens to process in document "
+ + document.getName() + "\n" + "Please run a sentence splitter "
+ + "and tokeniser first!");
} else {
Utils
- .logOnce(
- logger,
- Level.INFO,
- "NE labeller: no sentence or token annotations in input document - see debug log for details.");
+ .logOnce(
+ logger,
+ Level.INFO,
+ "NE labeller: no sentence or token annotations in input document - see debug log for details.");
logger.debug("No input annotations in document " + document.getName());
}
}
-
}
public void setEncoding(String encoding) {
@@ -335,7 +305,6 @@
this.outputASName = outputASName;
}
-
@RunTime
@CreoleParameter(comment = "Label used by model for tokens outside entities", defaultValue = "O")
public void setOutsideLabel(String outsideLabel) {
@@ -346,14 +315,11 @@
return this.outsideLabel;
}
-
- @CreoleParameter(comment = "Path to the NER model file", defaultValue = "resources/english.all.3class.distsim.crf.ser.gz", suffixes="tagger;model;gz")
+ @CreoleParameter(comment = "Path to the NER model file", defaultValue = "resources/english.all.3class.distsim.crf.ser.gz", suffixes = "tagger;model;gz")
public void setModelFile(URL modelFile) {
this.modelFile = modelFile;
}
-
-
public URL getModelFile() {
return this.modelFile;
}
Modified:
gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/Parser.java
===================================================================
--- gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/Parser.java	2016-08-19 16:50:05 UTC (rev 19525)
+++ gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/Parser.java	2016-08-19 17:01:15 UTC (rev 19526)
@@ -14,8 +14,8 @@
*
* You should have received a copy of the GNU General Public License along with
* this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * $Id: Parser.java 17831 2014-04-15 09:37:23Z ian_roberts $
+ *
+ * $Id: Parser.java 17831 2014-04-15 09:37:23Z ian_roberts $
*/
package gate.stanford;
@@ -43,7 +43,6 @@
import gate.creole.metadata.Optional;
import gate.creole.metadata.RunTime;
import gate.creole.metadata.Sharable;
-import gate.util.Files;
import gate.util.InvalidOffsetException;
import java.io.BufferedReader;
@@ -67,197 +66,190 @@
* be stored in the outputAS in various ways, controlled by CREOLE run-time
* parameters.
*/
-@CreoleResource(name = "StanfordParser", comment = "Stanford parser wrapper",
- helpURL = "http://gate.ac.uk/userguide/sec:parsers:stanford")
-public class Parser extends AbstractLanguageAnalyser
-implements ProcessingResource {
-
+@CreoleResource(name = "StanfordParser", comment = "Stanford parser wrapper",
helpURL = "http://gate.ac.uk/userguide/sec:parsers:stanford")
+public class Parser extends AbstractLanguageAnalyser implements
+ ProcessingResource {
private static final long serialVersionUID = -3062171258011850283L;
protected LexicalizedParser stanfordParser;
- /* Type "SyntaxTreeNode" with feature "cat" is compatible with the
- * classic SyntaxTreeViewer. */
- public static final String PHRASE_ANNOTATION_TYPE = "SyntaxTreeNode" ;
- public static final String PHRASE_CAT_FEATURE = "cat" ;
-
- /* But "category" feature is compatible with the ANNIE POS tagger. */
- private static final String POS_TAG_FEATURE =
ANNIEConstants.TOKEN_CATEGORY_FEATURE_NAME;
+ /*
+ * Type "SyntaxTreeNode" with feature "cat" is compatible with the classic
+ * SyntaxTreeViewer.
+ */
+ public static final String PHRASE_ANNOTATION_TYPE = "SyntaxTreeNode";
- public static final String DEPENDENCY_ANNOTATION_TYPE = "Dependency";
- public static final String DEPENDENCY_ARG_FEATURE = "args";
- public static final String DEPENDENCY_LABEL_FEATURE = "kind";
+ public static final String PHRASE_CAT_FEATURE = "cat";
- protected String annotationSetName;
- private URL parserFile;
- protected boolean debugMode;
- private boolean reusePosTags;
+ /* But "category" feature is compatible with the ANNIE POS tagger. */
+ private static final String POS_TAG_FEATURE =
+ ANNIEConstants.TOKEN_CATEGORY_FEATURE_NAME;
- private Map<String, String> tagMap;
- protected GrammaticalStructureFactory gsf;
-
+ public static final String DEPENDENCY_ANNOTATION_TYPE = "Dependency";
- /* CREOLE parameters for optional mapping */
- private boolean useMapping = false;
- private URL mappingFileURL;
-
- /* internal variables for mapping */
- private File mappingFile;
- private boolean mappingLoaded = false;
-
- /* CREOLE parameters: what are we going to annotate, and how? */
- private String inputSentenceType;
- private String inputTokenType;
- private boolean addConstituentAnnotations;
- private boolean addDependencyFeatures;
- private boolean addDependencyAnnotations;
- private boolean addPosTags;
- private boolean includeExtraDependencies;
+ public static final String DEPENDENCY_ARG_FEATURE = "args";
+
+ public static final String DEPENDENCY_LABEL_FEATURE = "kind";
+
+ protected String annotationSetName;
+
+ private URL parserFile;
+
+ protected boolean debugMode;
+
+ private boolean reusePosTags;
+
+ private Map<String, String> tagMap;
+
+ protected GrammaticalStructureFactory gsf;
+
+ /* CREOLE parameters for optional mapping */
+ private boolean useMapping = false;
+
+ private URL mappingFileURL;
+
+ /* internal variables for mapping */
+ private File mappingFile;
+
+ private boolean mappingLoaded = false;
+
+ /* CREOLE parameters: what are we going to annotate, and how? */
+ private String inputSentenceType;
+
+ private String inputTokenType;
+
+ private boolean addConstituentAnnotations;
+
+ private boolean addDependencyFeatures;
+
+ private boolean addDependencyAnnotations;
+
+ private boolean addPosTags;
+
+ private boolean includeExtraDependencies;
+
private DependencyMode dependencyMode;
-
/**
- * The {@link TreebankLangParserParams} implementation to use. This is
- * where we get the language pack, and then the
- * {@link GrammaticalStructureFactory} used to extract the
- * dependencies from the parse. In most cases you should leave this at
- * the default value, which is suitable for English text.
+ * The {@link TreebankLangParserParams} implementation to use. This is where
+ * we get the language pack, and then the {@link GrammaticalStructureFactory}
+ * used to extract the dependencies from the parse. In most cases you should
+ * leave this at the default value, which is suitable for English text.
*/
private String tlppClass;
-
/**
* The name of the feature to add to tokens. The feature value is a
- * {@link List} of {@link DependencyRelation} objects giving the
- * dependencies from this token to other tokens.
+ * {@link List} of {@link DependencyRelation} objects giving the dependencies
+ * from this token to other tokens.
*/
protected String dependenciesFeature = "dependencies";
-
-
/**
- * Parse the current document. (This is the principal
- * method called by a CorpusController.)
+ * Parse the current document. (This is the principal method called by a
+ * CorpusController.)
*/
public void execute() throws ExecutionException {
interrupted = false;
long startTime = System.currentTimeMillis();
- if(document == null) {
- throw new ExecutionException("No document to process!");
- }
+ if(document == null) { throw new ExecutionException(
+ "No document to process!"); }
fireStatusChanged("Running " + this.getName() + " on " +
document.getName());
fireProgressChanged(0);
-
- if (debugMode) {
+ if(debugMode) {
System.out.println("Parsing document: " + document.getName());
}
-
- if (useMapping && (! mappingLoaded) ) {
+ if(useMapping && (!mappingLoaded)) {
System.err.println("Warning: no mapping loaded!");
}
-
checkInterruption();
- if (addConstituentAnnotations || addDependencyFeatures ||
addDependencyAnnotations || addPosTags) {
+ if(addConstituentAnnotations || addDependencyFeatures
+ || addDependencyAnnotations || addPosTags) {
parseSentences(document.getAnnotations(annotationSetName));
- }
- else {
+ } else {
System.err.println("There is nothing for the parser to do.");
- System.err.println("Please enable at least one of the \"add...\"
options.");
+ System.err
+ .println("Please enable at least one of the \"add...\" options.");
}
-
fireProcessFinished();
- fireStatusChanged("Finished " + this.getName() + " on " +
document.getName()
- + " in " + NumberFormat.getInstance().format(
+ fireStatusChanged("Finished "
+ + this.getName()
+ + " on "
+ + document.getName()
+ + " in "
+ + NumberFormat.getInstance().format(
(double)(System.currentTimeMillis() - startTime) / 1000)
- + " seconds!");
+ + " seconds!");
}
-
/**
- * Initialize the Parser resource. In particular, load the trained data
- * file.
+ * Initialize the Parser resource. In particular, load the trained data file.
*/
public Resource init() throws ResourceInstantiationException {
instantiateStanfordParser();
- if (mappingFile != null) {
+ if(mappingFile != null) {
loadTagMapping(mappingFile);
}
-
super.init();
-
- if(tlppClass == null || tlppClass.equals("")) {
- throw new ResourceInstantiationException(
- "TLPP class name must be specified");
- }
+ if(tlppClass == null || tlppClass.equals("")) { throw new
ResourceInstantiationException(
+ "TLPP class name must be specified"); }
try {
- Class<?> tlppClassObj =
- Class.forName(tlppClass);
- if(!TreebankLangParserParams.class.isAssignableFrom(tlppClassObj)) {
- throw new ResourceInstantiationException(tlppClassObj
- + " does not implement "
- + TreebankLangParserParams.class.getName());
- }
+ Class<?> tlppClassObj = Class.forName(tlppClass);
+ if(!TreebankLangParserParams.class.isAssignableFrom(tlppClassObj)) {
throw new ResourceInstantiationException(
+ tlppClassObj + " does not implement "
+ + TreebankLangParserParams.class.getName()); }
TreebankLangParserParams tlpp =
- TreebankLangParserParams.class.cast(tlppClassObj.newInstance());
+ TreebankLangParserParams.class.cast(tlppClassObj.newInstance());
gsf = tlpp.treebankLanguagePack().grammaticalStructureFactory();
- }
- catch(UnsupportedOperationException e) {
+ } catch(UnsupportedOperationException e) {
throw new ResourceInstantiationException(e);
- }
- catch(ClassNotFoundException e) {
+ } catch(ClassNotFoundException e) {
throw new ResourceInstantiationException("Class " + tlppClass
- + " not found", e);
- }
- catch(InstantiationException e) {
+ + " not found", e);
+ } catch(InstantiationException e) {
throw new ResourceInstantiationException("Error creating TLPP object",
e);
- }
- catch(IllegalAccessException e) {
+ } catch(IllegalAccessException e) {
throw new ResourceInstantiationException("Error creating TLPP object",
e);
}
return this;
}
-
/**
- * Re-initialize the Parser resource. In particular, reload the trained
- * data file.
+ * Re-initialize the Parser resource. In particular, reload the trained data
+ * file.
*/
- @Override
+ @Override
public void reInit() throws ResourceInstantiationException {
stanfordParser = null;
init();
- }
+ }
-
-
/**
* Find all the Sentence annotations and iterate through them, parsing one
* sentence at a time and storing the result in the output AS. (Sentences are
* scanned for Tokens. You have to run the ANNIE tokenizer and splitter
before
* this PR.)
- * @throws ExecutionInterruptedException
+ *
+ * @throws ExecutionInterruptedException
*/
- private void parseSentences(AnnotationSet annotationSet) throws
ExecutionInterruptedException {
- List<Annotation> sentences =
gate.Utils.inDocumentOrder(annotationSet.get(inputSentenceType));
+ private void parseSentences(AnnotationSet annotationSet)
+ throws ExecutionInterruptedException {
+ List<Annotation> sentences =
+ gate.Utils.inDocumentOrder(annotationSet.get(inputSentenceType));
int sentencesDone = 0;
int nbrSentences = sentences.size();
-
- for (Annotation sentence : sentences) {
+ for(Annotation sentence : sentences) {
parseOneSentence(annotationSet, sentence, sentencesDone, nbrSentences);
sentencesDone++;
checkInterruption();
}
-
sentencesDone++;
fireProgressChanged(100 * sentencesDone / nbrSentences);
+ }
- }
-
-
-
/**
* Generate the special data structure for one sentence and pass the List of
- * Word to the parser. Apply the annotations back to the document.
+ * Word to the parser. Apply the annotations back to the document.
*
* @param sentence
* the Sentence annotation
@@ -265,106 +257,100 @@
* sentence number of debugging output
* @param ofS
* total number of sentences for debugging output
- * @return null if the sentence is empty
- * @throws ExecutionInterruptedException
+ * @return null if the sentence is empty
+ * @throws ExecutionInterruptedException
*/
- private void parseOneSentence(AnnotationSet annotationSet, Annotation
sentence, int sentCtr, int sentCount) throws ExecutionInterruptedException {
+ private void parseOneSentence(AnnotationSet annotationSet,
+ Annotation sentence, int sentCtr, int sentCount)
+ throws ExecutionInterruptedException {
Tree tree;
-
- StanfordSentence stanfordSentence = new StanfordSentence(sentence,
inputTokenType, annotationSet, reusePosTags);
- if (debugMode) {
+ StanfordSentence stanfordSentence =
+ new StanfordSentence(sentence, inputTokenType, annotationSet,
+ reusePosTags);
+ if(debugMode) {
System.out.println(stanfordSentence.toString());
}
-
- /* Ignore an empty Sentence (sometimes the regex splitter can create one
+ /*
+ * Ignore an empty Sentence (sometimes the regex splitter can create one
* with no Token annotations in it).
*/
- if ( stanfordSentence.isNotEmpty() ) {
+ if(stanfordSentence.isNotEmpty()) {
List<Word> wordList = stanfordSentence.getWordList();
-
- if (reusePosTags) {
+ if(reusePosTags) {
int nbrMissingTags = stanfordSentence.numberOfMissingPosTags();
- if (nbrMissingTags > 0) {
- double percentMissing = Math.ceil(100.0 * (nbrMissingTags) /
- (stanfordSentence.numberOfTokens()) );
- System.err.println("Warning (sentence " + sentCtr + "): " + (int)
percentMissing
- + "% of the Tokens are missing POS tags." );
+ if(nbrMissingTags > 0) {
+ double percentMissing =
+ Math.ceil(100.0 * (nbrMissingTags)
+ / (stanfordSentence.numberOfTokens()));
+ System.err.println("Warning (sentence " + sentCtr + "): "
+ + (int)percentMissing + "% of the Tokens are missing POS tags.");
}
}
-
- tree = stanfordParser.parse(wordList);
+ tree = stanfordParser.parse(wordList);
checkInterruption();
-
- if (addConstituentAnnotations || addPosTags) {
- annotatePhraseStructureRecursively(annotationSet, stanfordSentence,
tree, tree);
+ if(addConstituentAnnotations || addPosTags) {
+ annotatePhraseStructureRecursively(annotationSet, stanfordSentence,
+ tree, tree);
}
-
checkInterruption();
- if (addDependencyFeatures || addDependencyAnnotations) {
+ if(addDependencyFeatures || addDependencyAnnotations) {
annotateDependencies(annotationSet, stanfordSentence, tree);
}
-
- if (debugMode) {
+ if(debugMode) {
System.out.println("Parsed sentence " + sentCtr + " of " + sentCount);
}
+ } else if(debugMode) {
+ System.out.println("Ignored empty sentence " + sentCtr + " of "
+ + sentCount);
}
-
- else if (debugMode) {
- System.out.println("Ignored empty sentence " + sentCtr + " of " +
sentCount);
- }
}
-
/**
- * Generate a SyntaxTreeNode Annotation corresponding to this Tree. Work
- * recursively so that the annotations are actually generated from the
- * bottom up, in order to build the consists list of annotation IDs.
+ * Generate a SyntaxTreeNode Annotation corresponding to this Tree. Work
+ * recursively so that the annotations are actually generated from the bottom
+ * up, in order to build the consists list of annotation IDs.
*
- * @param tree the current subtree
- * @param rootTree the whole sentence, used to find the span of the current
subtree
+ * @param tree
+ * the current subtree
+ * @param rootTree
+ * the whole sentence, used to find the span of the current subtree
* @return a GATE Annotation of type "SyntaxTreeNode"
*/
- protected Annotation annotatePhraseStructureRecursively(AnnotationSet
annotationSet, StanfordSentence stanfordSentence, Tree tree, Tree rootTree) {
+ protected Annotation annotatePhraseStructureRecursively(
+ AnnotationSet annotationSet, StanfordSentence stanfordSentence,
+ Tree tree, Tree rootTree) {
Annotation annotation = null;
Annotation child;
- String label = tree.value();
-
+ String label = tree.value();
List<Tree> children = tree.getChildrenAsList();
-
- if (children.size() == 0) {
- return null;
- }
+ if(children.size() == 0) { return null; }
/* implied else */
-
- /* following line generates ClassCastException
- * IntPair span = tree.getSpan();
- * edu.stanford.nlp.ling.CategoryWordTag
- * at edu.stanford.nlp.trees.Tree.getSpan(Tree.java:393)
- * but I think it's a bug in the parser, so I'm hacking
- * around it as follows. */
+ /*
+ * following line generates ClassCastException IntPair span =
+ * tree.getSpan(); edu.stanford.nlp.ling.CategoryWordTag at
+ * edu.stanford.nlp.trees.Tree.getSpan(Tree.java:393) but I think it's a
bug
+ * in the parser, so I'm hacking around it as follows.
+ */
int startPos = Trees.leftEdge(tree, rootTree);
- int endPos = Trees.rightEdge(tree, rootTree);
-
+ int endPos = Trees.rightEdge(tree, rootTree);
Long startNode = stanfordSentence.startPos2offset(startPos);
- Long endNode = stanfordSentence.endPos2offset(endPos);
-
+ Long endNode = stanfordSentence.endPos2offset(endPos);
List<Integer> consists = new ArrayList<Integer>();
-
Iterator<Tree> childIter = children.iterator();
- while (childIter.hasNext()) {
- child = annotatePhraseStructureRecursively(annotationSet,
stanfordSentence, childIter.next(), rootTree);
- if ( (child != null) &&
- (! child.getType().equals(inputTokenType) )) {
+ while(childIter.hasNext()) {
+ child =
+ annotatePhraseStructureRecursively(annotationSet, stanfordSentence,
+ childIter.next(), rootTree);
+ if((child != null) && (!child.getType().equals(inputTokenType))) {
consists.add(child.getId());
}
}
- annotation = annotatePhraseStructureConstituent(annotationSet, startNode,
endNode, label, consists, tree.depth());
-
+ annotation =
+ annotatePhraseStructureConstituent(annotationSet, startNode, endNode,
+ label, consists, tree.depth());
return annotation;
}
-
-
/**
* Record one constituent as an annotation.
*
@@ -375,169 +361,148 @@
* @param depth
* @return
*/
- private Annotation annotatePhraseStructureConstituent(AnnotationSet
annotationSet, Long startOffset, Long endOffset, String label,
- List<Integer> consists, int depth) {
+ private Annotation annotatePhraseStructureConstituent(
+ AnnotationSet annotationSet, Long startOffset, Long endOffset,
+ String label, List<Integer> consists, int depth) {
Annotation phrAnnotation = null;
Integer phrID;
-
try {
String cat;
- if (useMapping && mappingLoaded) {
- cat = translateTag(label);
+ if(useMapping && mappingLoaded) {
+ cat = translateTag(label);
+ } else {
+ cat = label;
}
- else {
- cat = label;
- }
-
- if (addConstituentAnnotations) {
- String text = document.getContent().getContent(startOffset,
endOffset).toString();
+ if(addConstituentAnnotations) {
+ String text =
+ document.getContent().getContent(startOffset,
endOffset).toString();
FeatureMap fm = gate.Factory.newFeatureMap();
fm.put(PHRASE_CAT_FEATURE, cat);
fm.put("text", text);
-
/* Ignore empty list features on the token-equivalent annotations. */
- if (consists.size() > 0) {
+ if(consists.size() > 0) {
fm.put("consists", consists);
}
-
- phrID = annotationSet.add(startOffset, endOffset,
PHRASE_ANNOTATION_TYPE, fm);
+ phrID =
+ annotationSet.add(startOffset, endOffset, PHRASE_ANNOTATION_TYPE,
+ fm);
phrAnnotation = annotationSet.get(phrID);
recordID(annotationSet, phrID);
}
-
- if ( addPosTags && (depth == 1) ) {
+ if(addPosTags && (depth == 1)) {
/* Expected to be a singleton set! */
- AnnotationSet tokenSet = annotationSet.get(inputTokenType,
startOffset, endOffset);
- if (tokenSet.size() == 1) {
+ AnnotationSet tokenSet =
+ annotationSet.get(inputTokenType, startOffset, endOffset);
+ if(tokenSet.size() == 1) {
Annotation token = tokenSet.iterator().next();
-
- /* Add POS tag to token.
- * (Note: GATE/Hepple uses "(" and ")" for Penn/Stanford's
- * "-LRB-" and "-RRB-". */
+ /*
+ * Add POS tag to token. (Note: GATE/Hepple uses "(" and ")" for
+ * Penn/Stanford's "-LRB-" and "-RRB-".
+ */
String hepCat = StanfordSentence.unescapePosTag(cat);
token.getFeatures().put(POS_TAG_FEATURE, hepCat);
-
+ } else {
+ System.err.println("Found a tokenSet with " + tokenSet.size()
+ + " members!");
}
- else {
- System.err.println("Found a tokenSet with " + tokenSet.size() + "
members!");
- }
}
- }
- catch (InvalidOffsetException e) {
+ } catch(InvalidOffsetException e) {
e.printStackTrace();
}
-
return phrAnnotation;
}
-
-
@SuppressWarnings("unchecked")
- private void annotateDependencies(AnnotationSet annotationSet,
StanfordSentence stanfordSentence, Tree tree) {
+ private void annotateDependencies(AnnotationSet annotationSet,
+ StanfordSentence stanfordSentence, Tree tree) {
GrammaticalStructure gs = gsf.newGrammaticalStructure(tree);
- Collection<TypedDependency> dependencies =
DependencyMode.getDependencies(gs, dependencyMode, includeExtraDependencies);
-
- if (dependencies == null) {
- if (debugMode) {
+ Collection<TypedDependency> dependencies =
+ DependencyMode.getDependencies(gs, dependencyMode,
+ includeExtraDependencies);
+ if(dependencies == null) {
+ if(debugMode) {
System.out.println("dependencies == null");
}
return;
}
-
String dependencyKind;
FeatureMap depFeatures;
Integer dependentTokenID, governorTokenID;
List<Integer> argList;
Long offsetLH0, offsetRH0, offsetLH1, offsetRH1, depLH, depRH;
Annotation governor, dependent;
-
for(TypedDependency dependency : dependencies) {
if(debugMode) {
System.out.println(dependency);
}
-
// Does not work in version 3.5.2 any more
- //int governorIndex = dependency.gov().label().index() - 1;
- int governorIndex = dependency.gov().index()-1;
- governor = stanfordSentence.startPos2token(governorIndex);
-
- //int dependentIndex = dependency.dep().label().index() - 1;
- int dependentIndex = dependency.dep().index()-1;
+ // int governorIndex = dependency.gov().label().index() - 1;
+ int governorIndex = dependency.gov().index() - 1;
+ governor = stanfordSentence.startPos2token(governorIndex);
+ // int dependentIndex = dependency.dep().label().index() - 1;
+ int dependentIndex = dependency.dep().index() - 1;
dependent = stanfordSentence.startPos2token(dependentIndex);
-
dependencyKind = dependency.reln().toString();
governorTokenID = governor.getId();
dependentTokenID = dependent.getId();
-
- if (addDependencyFeatures) {
+ if(addDependencyFeatures) {
List<DependencyRelation> depsForTok =
- (List<DependencyRelation>)
governor.getFeatures().get(dependenciesFeature);
-
+ (List<DependencyRelation>)governor.getFeatures().get(
+ dependenciesFeature);
if(depsForTok == null) {
depsForTok = new ArrayList<DependencyRelation>();
governor.getFeatures().put(dependenciesFeature, depsForTok);
}
-
- depsForTok.add(new DependencyRelation(dependencyKind,
dependentTokenID));
+ depsForTok
+ .add(new DependencyRelation(dependencyKind, dependentTokenID));
}
-
- if (addDependencyAnnotations) {
+ if(addDependencyAnnotations) {
depFeatures = gate.Factory.newFeatureMap();
argList = new ArrayList<Integer>();
argList.add(governorTokenID);
argList.add(dependentTokenID);
depFeatures.put(DEPENDENCY_ARG_FEATURE, argList);
depFeatures.put(DEPENDENCY_LABEL_FEATURE, dependencyKind);
-
offsetLH0 = governor.getStartNode().getOffset();
offsetRH0 = governor.getEndNode().getOffset();
offsetLH1 = dependent.getStartNode().getOffset();
offsetRH1 = dependent.getEndNode().getOffset();
-
depLH = Math.min(offsetLH0, offsetLH1);
depRH = Math.max(offsetRH0, offsetRH1);
-
try {
- annotationSet.add(depLH, depRH, DEPENDENCY_ANNOTATION_TYPE,
depFeatures);
+ annotationSet.add(depLH, depRH, DEPENDENCY_ANNOTATION_TYPE,
+ depFeatures);
+ } catch(InvalidOffsetException e) {
+ e.printStackTrace();
}
- catch(InvalidOffsetException e) {
- e.printStackTrace();
- }
}
}
}
-
-
private void instantiateStanfordParser()
- throws ResourceInstantiationException {
+ throws ResourceInstantiationException {
if(stanfordParser != null) return;
-
try {
- //String filepath = Files.fileFromURL(parserFile).getAbsolutePath();
- stanfordParser =
LexicalizedParser.getParserFromSerializedFile(parserFile.toExternalForm());
- }
- catch(Exception e) {
+ // String filepath = Files.fileFromURL(parserFile).getAbsolutePath();
+ stanfordParser =
+ LexicalizedParser.getParserFromSerializedFile(parserFile
+ .toExternalForm());
+ } catch(Exception e) {
throw new ResourceInstantiationException(e);
}
- }
+ }
-
- private void loadTagMapping(File mappingFile) {
+ private void loadTagMapping(File mappingFile) {
tagMap = new HashMap<String, String>();
mappingLoaded = false;
-
try {
- if (mappingFile.exists() && mappingFile.canRead()) {
-
+ if(mappingFile.exists() && mappingFile.canRead()) {
BufferedReader br = new BufferedReader(new FileReader(mappingFile));
String line = "";
-
// read until it reaches to an end of the file
while((line = br.readLine()) != null) {
- // two columns delimited by whitespace
- String [] data = line.split("\\s+", 2);
-
+ // two columns delimited by whitespace
+ String[] data = line.split("\\s+", 2);
// are there key and value available
if(data == null || data.length < 2) {
continue;
@@ -546,27 +511,22 @@
tagMap.put(data[0].trim(), data[1].trim());
}
}
-
br.close();
+ } else {
+ System.err.println("Can't find or read mapping file "
+ + mappingFile.getPath() + " so no mappings will be used.");
}
-
- else {
- System.err.println("Can't find or read mapping file "
- + mappingFile.getPath() + " so no mappings will be used.");
- }
- }
- catch(Exception e) {
+ } catch(Exception e) {
System.err.println("Exception trying to load mapping file "
- + mappingFile.getPath());
+ + mappingFile.getPath());
e.printStackTrace();
}
-
int nbrMapped = tagMap.size();
- System.out.println("Loaded " + nbrMapped + " mappings from file " +
mappingFile);
+ System.out.println("Loaded " + nbrMapped + " mappings from file "
+ + mappingFile);
mappingLoaded = (nbrMapped > 0);
}
-
/**
* This method stores the annotation ID as a value of feature "ID" on the
* relevant annotation. (Mainly to make the ID visible in the GUI for
@@ -579,13 +539,11 @@
annSet.get(annotationID).getFeatures().put("ID", annotationID);
}
-
private void checkInterruption() throws ExecutionInterruptedException {
if(isInterrupted()) { throw new ExecutionInterruptedException(
"Execution of " + this.getName() + " has been abruptly interrupted!");
}
}
-
/**
* Translate the tag in the map, or leave it the same if there is no
* translation.
@@ -595,27 +553,22 @@
*/
private String translateTag(String stanfordTag) {
String translatedTag = stanfordTag;
-
- if (tagMap.containsKey(stanfordTag)) {
+ if(tagMap.containsKey(stanfordTag)) {
translatedTag = tagMap.get(stanfordTag);
}
-
return translatedTag;
}
-
/* get & set methods for the CREOLE parameters */
- @CreoleParameter(comment = "TreebankLangParserParams implementation used to
extract the dependencies",
- defaultValue =
"edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams")
+ @CreoleParameter(comment = "TreebankLangParserParams implementation used to
extract the dependencies", defaultValue =
"edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams")
public void setTlppClass(String tlppClass) {
this.tlppClass = tlppClass;
}
-
+
public String getTlppClass() {
return tlppClass;
}
-
@Optional
@RunTime
@CreoleParameter(comment = "annotationSet used for input (Token and "
@@ -628,8 +581,7 @@
return this.annotationSetName;
}
- @CreoleParameter(comment = "path to the parser's grammar file",
- defaultValue = "resources/englishRNN.ser.gz")
+ @CreoleParameter(comment = "path to the parser's grammar file", defaultValue
= "resources/englishRNN.ser.gz")
public void setParserFile(URL parserFile) {
this.parserFile = parserFile;
}
@@ -649,8 +601,7 @@
}
@RunTime
- @CreoleParameter(comment = "verbose mode for debugging",
- defaultValue = "false")
+ @CreoleParameter(comment = "verbose mode for debugging", defaultValue =
"false")
public void setDebug(Boolean debug) {
this.debugMode = debug.booleanValue();
}
@@ -658,10 +609,9 @@
public Boolean getDebug() {
return new Boolean(this.debugMode);
}
-
+
@RunTime
- @CreoleParameter(comment = "Re-use existing POS tags on tokens",
- defaultValue = "false")
+ @CreoleParameter(comment = "Re-use existing POS tags on tokens",
defaultValue = "false")
public void setReusePosTags(Boolean reusePosTags) {
this.reusePosTags = reusePosTags.booleanValue();
}
@@ -669,91 +619,79 @@
public Boolean getReusePosTags() {
return new Boolean(this.reusePosTags);
}
-
+
@RunTime
- @CreoleParameter(comment = "Create POS tags on the Token annotations",
- defaultValue = "false")
+ @CreoleParameter(comment = "Create POS tags on the Token annotations",
defaultValue = "false")
public void setAddPosTags(Boolean posTagTokens) {
this.addPosTags = posTagTokens.booleanValue();
}
-
+
public Boolean getAddPosTags() {
return new Boolean(this.addPosTags);
}
@RunTime
- @CreoleParameter(comment = "use tag mapping",
- defaultValue = "false")
+ @CreoleParameter(comment = "use tag mapping", defaultValue = "false")
public void setUseMapping(Boolean useMapping) {
this.useMapping = useMapping.booleanValue();
}
-
+
public Boolean getUseMapping() {
return new Boolean(this.useMapping);
}
-
+
@RunTime
- @CreoleParameter(comment = "Create dependency features on Token annotations",
- defaultValue = "true")
+ @CreoleParameter(comment = "Create dependency features on Token
annotations", defaultValue = "true")
public void setAddDependencyFeatures(Boolean useDependency) {
this.addDependencyFeatures = useDependency.booleanValue();
}
-
+
public Boolean getAddDependencyFeatures() {
return new Boolean(this.addDependencyFeatures);
}
-
+
@RunTime
- @CreoleParameter(comment = "Create annotations to show dependencies",
- defaultValue = "true")
+ @CreoleParameter(comment = "Create annotations to show dependencies",
defaultValue = "true")
public void setAddDependencyAnnotations(Boolean useDependency) {
this.addDependencyAnnotations = useDependency.booleanValue();
}
-
+
public Boolean getAddDependencyAnnotations() {
return new Boolean(this.addDependencyAnnotations);
}
-
-
+
@RunTime
- @CreoleParameter(comment = "input annotation type for each sentence",
- defaultValue = ANNIEConstants.SENTENCE_ANNOTATION_TYPE )
+ @CreoleParameter(comment = "input annotation type for each sentence",
defaultValue = ANNIEConstants.SENTENCE_ANNOTATION_TYPE)
public void setInputSentenceType(String sType) {
this.inputSentenceType = sType;
}
-
+
public String getInputSentenceType() {
return this.inputSentenceType;
}
-
@RunTime
- @CreoleParameter(comment = "input annotation type for each token",
- defaultValue = ANNIEConstants.TOKEN_ANNOTATION_TYPE )
+ @CreoleParameter(comment = "input annotation type for each token",
defaultValue = ANNIEConstants.TOKEN_ANNOTATION_TYPE)
public void setInputTokenType(String tType) {
this.inputTokenType = tType;
}
-
+
public String getInputTokenType() {
return this.inputTokenType;
}
-
@RunTime
- @CreoleParameter(comment = "Create annotations to show phrase structures",
- defaultValue = "true")
+ @CreoleParameter(comment = "Create annotations to show phrase structures",
defaultValue = "true")
public void setAddConstituentAnnotations(Boolean usePhraseStructure) {
this.addConstituentAnnotations = usePhraseStructure.booleanValue();
}
-
+
public Boolean getAddConstituentAnnotations() {
return new Boolean(this.addConstituentAnnotations);
}
-
-
+
@RunTime
- @CreoleParameter(comment = "Dependency Mode",
- defaultValue = "Typed")
+ @CreoleParameter(comment = "Dependency Mode", defaultValue = "Typed")
public void setDependencyMode(DependencyMode mode) {
this.dependencyMode = mode;
}
@@ -761,38 +699,34 @@
public DependencyMode getDependencyMode() {
return this.dependencyMode;
}
-
+
@RunTime
- @CreoleParameter(comment = "include extra dependencies",
- defaultValue = "false")
+ @CreoleParameter(comment = "include extra dependencies", defaultValue =
"false")
public void setIncludeExtraDependencies(Boolean include) {
this.includeExtraDependencies = include;
}
-
+
public Boolean getIncludeExtraDependencies() {
return this.includeExtraDependencies;
}
-
-
- /* Made mappingFile an init parameter to simplify things.
- * The CREOLE parameter is called "mappingFile" but it's actually a URL.
+
+ /*
+ * Made mappingFile an init parameter to simplify things. The CREOLE parameter
+ * is called "mappingFile" but it's actually a URL.
*/
@Optional
@CreoleParameter(comment = "path to the tag mapping file")
public void setMappingFile(URL mappingFileURL) {
this.mappingFile = null; // override below
this.mappingFileURL = mappingFileURL;
-
- if ( (this.mappingFileURL != null) &&
- (! this.mappingFileURL.toString().trim().equals("")) ) {
+ if((this.mappingFileURL != null)
+ && (!this.mappingFileURL.toString().trim().equals(""))) {
try {
this.mappingFile = new File(this.mappingFileURL.toURI());
- }
- catch(URISyntaxException e) {
+ } catch(URISyntaxException e) {
e.printStackTrace();
}
}
-
}
public URL getMappingFile() {
@@ -800,22 +734,21 @@
}
/**
- * Inject an existing instance of the LexicalizedParser.
- * <b>This method is intended for use by {@link Factory#ducplicate}
- * and should not be called directly.</b>
+ * Inject an existing instance of the LexicalizedParser. <b>This method is
+ * intended for use by {@link Factory#ducplicate} and should not be called
+ * directly.</b>
*/
@Sharable
public void setStanfordParser(LexicalizedParser parser) {
this.stanfordParser = parser;
}
-
+
/**
- * Get the LexicalizedParser used internally by this PR.
- * <b>This method is intended for use by {@link Factory#ducplicate}
- * and should not be called directly.</b>
+ * Get the LexicalizedParser used internally by this PR. <b>This method is
+ * intended for use by {@link Factory#ducplicate} and should not be called
+ * directly.</b>
*/
public LexicalizedParser getStanfordParser() {
return stanfordParser;
}
-
}
Modified:
gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/StanfordSentence.java
===================================================================
---
gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/StanfordSentence.java
2016-08-19 16:50:05 UTC (rev 19525)
+++
gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/StanfordSentence.java
2016-08-19 17:01:15 UTC (rev 19526)
@@ -14,8 +14,8 @@
*
* You should have received a copy of the GNU General Public License along with
* this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * $Id: StanfordSentence.java 15600 2012-03-19 15:40:56Z adamfunk $
+ *
+ * $Id: StanfordSentence.java 15600 2012-03-19 15:40:56Z adamfunk $
*/
package gate.stanford;
@@ -28,210 +28,174 @@
import gate.util.Strings;
/**
- * The Stanford Parser itself takes as input a List of edu.stanford.nlp.ling.Word.
- * This data structure is constructed from a Sentence Annotation, using the enclosed
- * Token Annotations, and yields the required List, as well as methods for
- * converting the parser's output spans into GATE Annotation offsets.
+ * The Stanford Parser itself takes as input a List of
+ * edu.stanford.nlp.ling.Word. This data structure is constructed from a
+ * Sentence Annotation, using the enclosed Token Annotations, and yields the
+ * required List, as well as methods for converting the parser's output spans
+ * into GATE Annotation offsets.
*/
public class StanfordSentence {
-
private Map<Integer, Long> startPosToOffset;
+
private Map<Integer, Long> endPosToOffset;
+
private Map<Integer, Annotation> startPosToToken;
+
private Map<Integer, String> startPosToString;
- private List<Word> words;
- private Long sentenceStartOffset, sentenceEndOffset;
- private List<Annotation> tokens;
- private static final String POS_TAG_FEATURE =
ANNIEConstants.TOKEN_CATEGORY_FEATURE_NAME;
- private static final String STRING_FEATURE =
ANNIEConstants.TOKEN_STRING_FEATURE_NAME;
-
+ private List<Word> words;
+
+ private Long sentenceStartOffset, sentenceEndOffset;
+
+ private List<Annotation> tokens;
+
+ private static final String POS_TAG_FEATURE =
+ ANNIEConstants.TOKEN_CATEGORY_FEATURE_NAME;
+
+ private static final String STRING_FEATURE =
+ ANNIEConstants.TOKEN_STRING_FEATURE_NAME;
+
int nbrOfTokens, nbrOfMissingPosTags;
-
-
- /* This is probably dodgy, but I can't find an "unknown" tag
- * in the Penn documentation. */
- private static final String UNKNOWN_TAG = "NN";
-
- public StanfordSentence(Annotation sentence, String tokenType,
- AnnotationSet inputAS, boolean usePosTags) {
-
+ /*
+ * This is probably dodgy, but I can't find an "unknown" tag in the Penn
+ * documentation.
+ */
+ private static final String UNKNOWN_TAG = "NN";
+
+ public StanfordSentence(Annotation sentence, String tokenType,
+ AnnotationSet inputAS, boolean usePosTags) {
startPosToOffset = new HashMap<Integer, Long>();
- endPosToOffset = new HashMap<Integer, Long>();
- startPosToToken = new HashMap<Integer, Annotation>();
+ endPosToOffset = new HashMap<Integer, Long>();
+ startPosToToken = new HashMap<Integer, Annotation>();
startPosToString = new HashMap<Integer, String>();
-
sentenceStartOffset = sentence.getStartNode().getOffset();
- sentenceEndOffset = sentence.getEndNode().getOffset();
-
- nbrOfTokens = 0;
+ sentenceEndOffset = sentence.getEndNode().getOffset();
+ nbrOfTokens = 0;
nbrOfMissingPosTags = 0;
-
- tokens = Utils.inDocumentOrder(inputAS.getContained(sentenceStartOffset,
sentenceEndOffset).get(tokenType));
+ tokens =
+ Utils.inDocumentOrder(inputAS.getContained(sentenceStartOffset,
+ sentenceEndOffset).get(tokenType));
words = new ArrayList<Word>();
-
add(-1, sentence, "S");
-
int tokenNo = 0;
-
- for (Annotation token : tokens) {
- String tokenString =
escapeToken(token.getFeatures().get(STRING_FEATURE).toString());
+ for(Annotation token : tokens) {
+ String tokenString =
+ escapeToken(token.getFeatures().get(STRING_FEATURE).toString());
add(tokenNo, token, tokenString);
-
- /* The FAQ says the parser will automatically use existing POS tags
- * if the List elements are of type TaggedWord.
+ /*
+ * The FAQ says the parser will automatically use existing POS tags if the
+ * List elements are of type TaggedWord.
* http://nlp.stanford.edu/software/parser-faq.shtml#f
*/
-
- if (usePosTags) {
+ if(usePosTags) {
words.add(new TaggedWord(tokenString, getEscapedPosTag(token)));
- }
- else {
+ } else {
words.add(new Word(tokenString));
}
-
tokenNo++;
}
-
nbrOfTokens = tokenNo;
}
-
public String toString() {
StringBuffer output = new StringBuffer();
- output.append("S:
").append(Strings.toString(startPosToOffset)).append('\n');
- output.append("
").append(Strings.toString(startPosToString)).append('\n');
+ output.append("S: ").append(Strings.toString(startPosToOffset))
+ .append('\n');
+ output.append(" ").append(Strings.toString(startPosToString))
+ .append('\n');
output.append(" ").append(Strings.toString(endPosToOffset));
return output.toString();
}
-
-
- private String getEscapedPosTag(Annotation token) {
+
+ private String getEscapedPosTag(Annotation token) {
String pos = UNKNOWN_TAG;
FeatureMap tokenFeatures = token.getFeatures();
-
- if (tokenFeatures.containsKey(POS_TAG_FEATURE)) {
+ if(tokenFeatures.containsKey(POS_TAG_FEATURE)) {
Object temp = tokenFeatures.get(POS_TAG_FEATURE);
-
- if (temp instanceof String) {
- pos = (String) temp;
- }
- else {
+ if(temp instanceof String) {
+ pos = (String)temp;
+ } else {
nbrOfMissingPosTags++;
}
-
- }
- else {
+ } else {
nbrOfMissingPosTags++;
}
-
return escapePosTag(pos);
}
-
-
private void add(int tokenNbr, Annotation token, String tokenString) {
Long tokenStartOffset = token.getStartNode().getOffset();
- Long tokenEndOffset = token.getEndNode().getOffset();
-
+ Long tokenEndOffset = token.getEndNode().getOffset();
startPosToOffset.put(tokenNbr, tokenStartOffset);
endPosToOffset.put(new Integer(tokenNbr + 1), tokenEndOffset);
startPosToToken.put(tokenNbr, token);
startPosToString.put(tokenNbr, tokenString);
}
-
-
- /* Explanation of the position conversion:
- * The output of the Stanford Parser specifies each constituent's span in terms of
- * token boundaries re-numbered within each sentence, which we need to convert to
- * GATE character offsets within the whole document.
+ /*
+ * Explanation of the position conversion: The output of the Stanford Parser
+ * specifies each constituent's span in terms of token boundaries re-numbered
+ * within each sentence, which we need to convert to GATE character offsets
+ * within the whole document.
*
- * Example:
- * "This is a test." starting at document offset 100, containing five tokens.
- * Stanford says "This" starts at 0 and ends at 1; GATE says 100 to 104.
- * Stanford says "is a test" starts at 1 and ends at 4;
- * GATE says 105 to 114.
+ * Example: "This is a test." starting at document offset 100, containing five
+ * tokens. Stanford says "This" starts at 0 and ends at 1; GATE says 100 to
+ * 104. Stanford says "is a test" starts at 1 and ends at 4; GATE says 105 to
+ * 114.
*/
-
-
public int numberOfTokens() {
return nbrOfTokens;
}
-
+
public int numberOfMissingPosTags() {
return nbrOfMissingPosTags;
}
-
+
public boolean isNotEmpty() {
return (nbrOfTokens > 0);
}
-
-
+
/**
- * Change the Token's string to match the Penn Treebank's
- * escaping system.
- * See Stanford parser FAQ "How can I provide the correct tokenization of my
- * sentence to the parser?"
-
- * @param token original string feature of Token
+ * Change the Token's string to match the Penn Treebank's escaping system. See
+ * Stanford parser FAQ "How can I provide the correct tokenization of my
+ * sentence to the parser?"
+ *
+ * @param token
+ * original string feature of Token
* @return escaped version of string
*/
protected static String escapeToken(String token) {
- // ( --> -LRB-
- if (token.equals("(")) {
- return "-LRB-";
- }
-
- // ) --> -RRB-
- if (token.equals(")")) {
- return "-RRB-";
- }
-
- // / --> \/
- // * --> \*
- if (token.contains("/") || token.contains("*")) {
- return token.replace("/", "\\/").replace("*", "\\*");
- }
-
+ // ( --> -LRB-
+ if(token.equals("(")) { return "-LRB-"; }
+ // ) --> -RRB-
+ if(token.equals(")")) { return "-RRB-"; }
+ // / --> \/
+ // * --> \*
+ if(token.contains("/") || token.contains("*")) { return token.replace("/",
+ "\\/").replace("*", "\\*"); }
return token;
}
-
protected static String escapePosTag(String tag) {
- // ( --> -LRB-
- if (tag.equals("(")) {
- return "-LRB-";
- }
-
- // ) --> -RRB-
- if (tag.equals(")")) {
- return "-RRB-";
- }
-
+ // ( --> -LRB-
+ if(tag.equals("(")) { return "-LRB-"; }
+ // ) --> -RRB-
+ if(tag.equals(")")) { return "-RRB-"; }
return tag;
}
-
protected static String unescapePosTag(String tag) {
- // ( <-- -LRB-
- if (tag.equals("-LRB-")) {
- return "(";
- }
-
- // ) <-- -RRB-
- if (tag.equals("-RRB-")) {
- return ")";
- }
-
+ // ( <-- -LRB-
+ if(tag.equals("-LRB-")) { return "("; }
+ // ) <-- -RRB-
+ if(tag.equals("-RRB-")) { return ")"; }
return tag;
}
-
/**
- * Convert a Stanford start position to the GATE Annotation of type
- * "Token" that starts there.
+ * Convert a Stanford start position to the GATE Annotation of type "Token"
+ * that starts there.
*/
public Annotation startPos2token(int startPos) {
return startPosToToken.get(startPos);
@@ -239,6 +203,7 @@
/**
* Convert a Stanford start position to a GATE offset.
+ *
* @param startPos
* @return the offset in the GATE document
*/
@@ -248,6 +213,7 @@
/**
* Convert a Stanford end position to a GATE offset.
+ *
* @param endPos
* @return the offset in the GATE document
*/
@@ -255,7 +221,6 @@
return endPosToOffset.get(endPos);
}
-
/**
* @return The data structure that is passed to the Stanford Parser itself.
*/
Modified:
gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/Tagger.java
===================================================================
---
gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/Tagger.java
2016-08-19 16:50:05 UTC (rev 19525)
+++
gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/Tagger.java
2016-08-19 17:01:15 UTC (rev 19526)
@@ -19,7 +19,6 @@
*
* $Id: Tagger.java 15468 2012-02-25 14:41:15Z $
*/
-
package gate.stanford;
import edu.stanford.nlp.ling.TaggedWord;
@@ -56,9 +55,8 @@
/**
* This class is a wrapper for the Stanford PoS tagger v3.2.0.
*/
-@CreoleResource(name = "Stanford POS Tagger", comment = "Stanford
Part-of-Speech Tagger", icon = "pos-tagger",
helpURL="http://gate.ac.uk/userguide/sec:misc:creole:stanford")
+@CreoleResource(name = "Stanford POS Tagger", comment = "Stanford
Part-of-Speech Tagger", icon = "pos-tagger", helpURL =
"http://gate.ac.uk/userguide/sec:misc:creole:stanford")
public class Tagger extends AbstractLanguageAnalyser {
-
private static final long serialVersionUID = -6001372186847970081L;
public static final String TAG_DOCUMENT_PARAMETER_NAME = "document";
@@ -68,13 +66,13 @@
public static final String TAG_ENCODING_PARAMETER_NAME = "encoding";
public static final String BASE_TOKEN_ANNOTATION_TYPE_PARAMETER_NAME =
- "baseTokenAnnotationType";
+ "baseTokenAnnotationType";
public static final String OUTPUT_ANNOTATION_TYPE_PARAMETER_NAME =
- "outputAnnotationType";
+ "outputAnnotationType";
public static final String BASE_SENTENCE_ANNOTATION_TYPE_PARAMETER_NAME =
- "baseSentenceAnnotationType";
+ "baseSentenceAnnotationType";
public static final String TAG_OUTPUT_AS_PARAMETER_NAME = "outputASName";
@@ -108,9 +106,9 @@
@RunTime
@Optional
- @CreoleParameter(comment = "Should existing " + TOKEN_CATEGORY_FEATURE_NAME +
- " features on input annotations be respected (true) or ignored (false)?",
- defaultValue = "true")
+ @CreoleParameter(comment = "Should existing "
+ + TOKEN_CATEGORY_FEATURE_NAME
+ + " features on input annotations be respected (true) or ignored
(false)?", defaultValue = "true")
public void setUseExistingTags(Boolean useTags) {
useExistingTags = useTags;
}
@@ -118,6 +116,7 @@
public Boolean getUseExistingTags() {
return useExistingTags;
}
+
private Boolean useExistingTags;
protected Logger logger = Logger.getLogger(this.getClass().getName());
@@ -145,43 +144,34 @@
// check the parameters
if(document == null)
throw new ExecutionException("No document to process!");
-
AnnotationSet inputAS = document.getAnnotations(inputASName);
-
- if(baseTokenAnnotationType == null ||
- baseTokenAnnotationType.trim().length() == 0) { throw new
ExecutionException(
- "No base Token Annotation Type provided!"); }
-
- if(baseSentenceAnnotationType == null ||
- baseSentenceAnnotationType.trim().length() == 0) { throw new
ExecutionException(
- "No base Sentence Annotation Type provided!"); }
-
- if(outputAnnotationType == null ||
- outputAnnotationType.trim().length() == 0) { throw new
ExecutionException(
- "No AnnotationType provided to store the new feature!"); }
-
+ if(baseTokenAnnotationType == null
+ || baseTokenAnnotationType.trim().length() == 0) { throw new
ExecutionException(
+ "No base Token Annotation Type provided!"); }
+ if(baseSentenceAnnotationType == null
+ || baseSentenceAnnotationType.trim().length() == 0) { throw new
ExecutionException(
+ "No base Sentence Annotation Type provided!"); }
+ if(outputAnnotationType == null
+ || outputAnnotationType.trim().length() == 0) { throw new
ExecutionException(
+ "No AnnotationType provided to store the new feature!"); }
AnnotationSet sentencesAS = inputAS.get(baseSentenceAnnotationType);
AnnotationSet tokensAS = inputAS.get(baseTokenAnnotationType);
- if(sentencesAS != null && sentencesAS.size() > 0 && tokensAS != null &&
- tokensAS.size() > 0) {
+ if(sentencesAS != null && sentencesAS.size() > 0 && tokensAS != null
+ && tokensAS.size() > 0) {
long startTime = System.currentTimeMillis();
fireStatusChanged("POS tagging " + document.getName());
fireProgressChanged(0);
// prepare the input for MaxentTagger
List<Word> sentenceForTagger = new ArrayList<Word>();
-
// define a comparator for annotations by start offset
OffsetComparator offsetComparator = new OffsetComparator();
-
// read all the tokens and all the sentences
List<Annotation> sentencesList = new ArrayList<Annotation>(sentencesAS);
Collections.sort(sentencesList, offsetComparator);
List<Annotation> tokensList = new ArrayList<Annotation>(tokensAS);
Collections.sort(tokensList, offsetComparator);
-
Iterator<Annotation> sentencesIter = sentencesList.iterator();
ListIterator<Annotation> tokensIter = tokensList.listIterator();
-
List<Annotation> tokensInCurrentSentence = new ArrayList<Annotation>();
Annotation currentToken = tokensIter.next();
int sentIndex = 0;
@@ -190,131 +180,125 @@
Annotation currentSentence = sentencesIter.next();
tokensInCurrentSentence.clear();
sentenceForTagger.clear();
- while(currentToken != null &&
- currentToken.getEndNode().getOffset()
- .compareTo(currentSentence.getEndNode().getOffset()) <= 0) {
+ while(currentToken != null
+ && currentToken.getEndNode().getOffset()
+ .compareTo(currentSentence.getEndNode().getOffset()) <= 0) {
// If we're only POS tagging Tokens within baseSentenceAnnotationType,
// don't add the sentence if the Tokens aren't within the span of
// baseSentenceAnnotationType
if(posTagAllTokens || currentToken.withinSpanOf(currentSentence)) {
tokensInCurrentSentence.add(currentToken);
-
- if(useExistingTags && currentToken.getFeatures().containsKey(
- TOKEN_CATEGORY_FEATURE_NAME)) {
- sentenceForTagger.add(new TaggedWord(
- (String)currentToken.getFeatures()
- .get(TOKEN_STRING_FEATURE_NAME),
- (String)currentToken.getFeatures()
- .get(TOKEN_CATEGORY_FEATURE_NAME)));
+ if(useExistingTags
+ && currentToken.getFeatures().containsKey(
+ TOKEN_CATEGORY_FEATURE_NAME)) {
+ sentenceForTagger.add(new TaggedWord((String)currentToken
+ .getFeatures().get(TOKEN_STRING_FEATURE_NAME),
+ (String)currentToken.getFeatures().get(
+ TOKEN_CATEGORY_FEATURE_NAME)));
} else {
sentenceForTagger.add(new Word((String)currentToken.getFeatures()
- .get(TOKEN_STRING_FEATURE_NAME)));
+ .get(TOKEN_STRING_FEATURE_NAME)));
}
}
currentToken = (tokensIter.hasNext() ? tokensIter.next() : null);
}
-
// if the sentence doesn't contain any tokens (which is a bit weird but
// is possible) then don't try running the POS tagger as you will get an
// array index out of bounds exception
if(sentenceForTagger.isEmpty()) continue;
-
// run the POS tagger
List<TaggedWord> taggerResults =
- tagger.tagSentence(sentenceForTagger, useExistingTags);
-
+ tagger.tagSentence(sentenceForTagger, useExistingTags);
// add the results
// make sure no malfunction occurred
if(taggerResults.size() != tokensInCurrentSentence.size())
throw new ExecutionException(
- "POS Tagger malfunction: the output size (" +
- taggerResults.size() + ") is different from the input size (" +
- tokensInCurrentSentence.size() + ")!");
+ "POS Tagger malfunction: the output size ("
+ + taggerResults.size()
+ + ") is different from the input size ("
+ + tokensInCurrentSentence.size() + ")!");
Iterator<TaggedWord> resIter = taggerResults.iterator();
Iterator<Annotation> tokIter = tokensInCurrentSentence.iterator();
while(resIter.hasNext()) {
Annotation annot = tokIter.next();
- addFeatures(annot, TOKEN_CATEGORY_FEATURE_NAME, (resIter.next().tag()));
+ addFeatures(annot, TOKEN_CATEGORY_FEATURE_NAME,
+ (resIter.next().tag()));
}
fireProgressChanged(sentIndex++ * 100 / sentCnt);
}// while(sentencesIter.hasNext())
-
if(currentToken != null && posTagAllTokens) {
// Tag remaining Tokens if we are not considering those only within
// baseSentenceAnnotationType
-
// we have remaining tokens after the last sentence
tokensInCurrentSentence.clear();
sentenceForTagger.clear();
while(currentToken != null) {
tokensInCurrentSentence.add(currentToken);
- if(useExistingTags && currentToken.getFeatures().containsKey(
- TOKEN_CATEGORY_FEATURE_NAME)) {
- sentenceForTagger.add(new TaggedWord(
- (String)currentToken.getFeatures()
- .get(TOKEN_STRING_FEATURE_NAME),
- (String)currentToken.getFeatures()
- .get(TOKEN_CATEGORY_FEATURE_NAME)));
+ if(useExistingTags
+ && currentToken.getFeatures().containsKey(
+ TOKEN_CATEGORY_FEATURE_NAME)) {
+ sentenceForTagger.add(new TaggedWord((String)currentToken
+ .getFeatures().get(TOKEN_STRING_FEATURE_NAME),
+ (String)currentToken.getFeatures().get(
+ TOKEN_CATEGORY_FEATURE_NAME)));
} else {
sentenceForTagger.add(new Word((String)currentToken.getFeatures()
- .get(TOKEN_STRING_FEATURE_NAME)));
+ .get(TOKEN_STRING_FEATURE_NAME)));
}
currentToken = (tokensIter.hasNext() ? tokensIter.next() : null);
}
-
// run the POS tagger on remaining tokens
List<TaggedWord> taggerResults =
- tagger.tagSentence(sentenceForTagger, useExistingTags);
-
+ tagger.tagSentence(sentenceForTagger, useExistingTags);
// add the results and make sure no malfunction occurred
if(taggerResults.size() != tokensInCurrentSentence.size())
throw new ExecutionException(
- "POS Tagger malfunction: the output size (" + taggerResults.size() +
- ") is different from the input size (" +
- tokensInCurrentSentence.size() + ")!");
+ "POS Tagger malfunction: the output size ("
+ + taggerResults.size()
+ + ") is different from the input size ("
+ + tokensInCurrentSentence.size() + ")!");
Iterator<TaggedWord> resIter = taggerResults.iterator();
Iterator<Annotation> tokIter = tokensInCurrentSentence.iterator();
while(resIter.hasNext()) {
Annotation annot = tokIter.next();
- addFeatures(annot, TOKEN_CATEGORY_FEATURE_NAME, (resIter.next().tag()));
+ addFeatures(annot, TOKEN_CATEGORY_FEATURE_NAME,
+ (resIter.next().tag()));
}
}// if(currentToken != null)
fireProcessFinished();
- fireStatusChanged(document.getName() +
- " tagged in " +
- NumberFormat.getInstance().format(
- (double)(System.currentTimeMillis() - startTime) / 1000) +
- " seconds!");
+ fireStatusChanged(document.getName()
+ + " tagged in "
+ + NumberFormat.getInstance().format(
+ (double)(System.currentTimeMillis() - startTime) / 1000)
+ + " seconds!");
} else {
if(failOnMissingInputAnnotations) {
throw new ExecutionException(
- "No sentences or tokens to process in document " +
- document.getName() + "\n" + "Please run a sentence splitter " +
- "and tokeniser first!");
+ "No sentences or tokens to process in document "
+ + document.getName() + "\n" + "Please run a sentence splitter "
+ + "and tokeniser first!");
} else {
Utils
- .logOnce(
- logger,
- Level.INFO,
- "POS tagger: no sentence or token annotations in input document - see debug log for details.");
+ .logOnce(
+ logger,
+ Level.INFO,
+ "POS tagger: no sentence or token annotations in input document - see debug log for details.");
logger.debug("No input annotations in document " + document.getName());
}
}
-
}
protected void addFeatures(Annotation annot, String featureName,
- String featureValue) throws GateRuntimeException {
+ String featureValue) throws GateRuntimeException {
String tempIASN = inputASName == null ? "" : inputASName;
String tempOASN = outputASName == null ? "" : outputASName;
- if(outputAnnotationType.equals(baseTokenAnnotationType) &&
- tempIASN.equals(tempOASN)) {
+ if(outputAnnotationType.equals(baseTokenAnnotationType)
+ && tempIASN.equals(tempOASN)) {
annot.getFeatures().put(featureName, featureValue);
return;
} else {
int start = annot.getStartNode().getOffset().intValue();
int end = annot.getEndNode().getOffset().intValue();
-
// get the annotations of type outputAnnotationType
AnnotationSet outputAS = document.getAnnotations(outputASName);
AnnotationSet annotations = outputAS.get(outputAnnotationType);
@@ -324,7 +308,7 @@
features.put(featureName, featureValue);
try {
outputAS.add(new Long(start), new Long(end), outputAnnotationType,
- features);
+ features);
} catch(Exception e) {
throw new GateRuntimeException("Invalid Offsets");
}
@@ -332,26 +316,25 @@
// search for the annotation if there is one with the same start and end
// offsets
ArrayList<Annotation> tempList =
- new ArrayList<Annotation>(annotations.get());
+ new ArrayList<Annotation>(annotations.get());
boolean found = false;
for(int i = 0; i < tempList.size(); i++) {
Annotation annotation = tempList.get(i);
- if(annotation.getStartNode().getOffset().intValue() == start &&
- annotation.getEndNode().getOffset().intValue() == end) {
+ if(annotation.getStartNode().getOffset().intValue() == start
+ && annotation.getEndNode().getOffset().intValue() == end) {
// this is the one
annotation.getFeatures().put(featureName, featureValue);
found = true;
break;
}
}
-
if(!found) {
// add new annotation
FeatureMap features = Factory.newFeatureMap();
features.put(featureName, featureValue);
try {
outputAS.add(new Long(start), new Long(end), outputAnnotationType,
- features);
+ features);
} catch(Exception e) {
throw new GateRuntimeException("Invalid Offsets");
}
@@ -420,7 +403,7 @@
this.outputASName = outputASName;
}
- @CreoleParameter(comment = "Path to the tagger's model file", defaultValue = "resources/english-left3words-distsim.tagger", suffixes="tagger;model")
+ @CreoleParameter(comment = "Path to the tagger's model file", defaultValue = "resources/english-left3words-distsim.tagger", suffixes = "tagger;model")
public void setModelFile(URL modelFile) {
this.modelFile = modelFile;
}
Modified:
gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/Tokenizer.java
===================================================================
---
gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/Tokenizer.java
2016-08-19 16:50:05 UTC (rev 19525)
+++
gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/Tokenizer.java
2016-08-19 17:01:15 UTC (rev 19526)
@@ -19,10 +19,8 @@
*
* $Id: Tokenizer.java 15468 2013-10-22 21:13:15Z $
*/
-
package gate.stanford;
-
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.PTBTokenizer;
@@ -46,9 +44,8 @@
/**
* This class is a wrapper for the Stanford Tokenizer v3.2.0.
*/
-@CreoleResource(name = "Stanford PTB Tokenizer", comment = "Stanford Penn Treebank v3 Tokenizer, for English", icon = "tokeniser", helpURL="http://gate.ac.uk/userguide/sec:misc:creole:stanford")
+@CreoleResource(name = "Stanford PTB Tokenizer", comment = "Stanford Penn Treebank v3 Tokenizer, for English", icon = "tokeniser", helpURL = "http://gate.ac.uk/userguide/sec:misc:creole:stanford")
public class Tokenizer extends AbstractLanguageAnalyser {
-
private static final long serialVersionUID = -6001371186847970080L;
public static final String TAG_DOCUMENT_PARAMETER_NAME = "document";
@@ -95,94 +92,65 @@
// check the parameters
if(document == null)
throw new ExecutionException("No document to process!");
-
- AnnotationSet inputAS = document.getAnnotations(inputASName);
+
AnnotationSet outputAS = document.getAnnotations(outputASName);
-
-
long startTime = System.currentTimeMillis();
fireStatusChanged("Tokenising " + document.getName());
- fireProgressChanged(0);
-
-
+ fireProgressChanged(0);
// tokenising goes here
String rawText = "";
try {
- rawText = document.getContent().getContent(new Long(0), document.getContent().size()).toString();
- } catch (Exception e) {
+ rawText =
+ document.getContent()
+ .getContent(new Long(0), document.getContent().size()).toString();
+ } catch(Exception e) {
System.out.println("Document content offsets wrong: " + e);
}
-
PTBTokenizer<CoreLabel> ptbt;
try {
- ptbt = new PTBTokenizer<CoreLabel>(new StringReader(rawText), new CoreLabelTokenFactory(), "invertible=true");
- } catch (Exception e) {
+ ptbt =
+ new PTBTokenizer<CoreLabel>(new StringReader(rawText),
+ new CoreLabelTokenFactory(), "invertible=true");
+ } catch(Exception e) {
System.out.println("Failed when calling tokenizer: " + e);
return;
}
-
Long tokenStart;
Long tokenEnd;
- Long prevTokenEnd = new Long(0); // this default value lets us capture leading spaces
-
- for (CoreLabel label; ptbt.hasNext(); ) {
+ Long prevTokenEnd = new Long(0); // this default value lets us capture
+ // leading spaces
+ for(CoreLabel label; ptbt.hasNext();) {
label = ptbt.next();
tokenStart = new Long(label.beginPosition());
tokenEnd = new Long(label.endPosition());
-
-
SimpleFeatureMapImpl tokenMap = new SimpleFeatureMapImpl();
-
// add the token annotation
try {
- tokenMap.put(TOKEN_STRING_FEATURE, document.getContent().getContent(tokenStart, tokenEnd).toString());
+ tokenMap.put(TOKEN_STRING_FEATURE,
+ document.getContent().getContent(tokenStart, tokenEnd).toString());
outputAS.add(tokenStart, tokenEnd, tokenLabel, tokenMap);
- } catch (InvalidOffsetException e) {
+ } catch(InvalidOffsetException e) {
System.out.println("Token alignment problem:" + e);
}
-
// do we need to add a space annotation?
- if (tokenStart > prevTokenEnd) {
+ if(tokenStart > prevTokenEnd) {
try {
- outputAS.add(prevTokenEnd, tokenStart, spaceLabel, new SimpleFeatureMapImpl());
- } catch (InvalidOffsetException e) {
+ outputAS.add(prevTokenEnd, tokenStart, spaceLabel,
+ new SimpleFeatureMapImpl());
+ } catch(InvalidOffsetException e) {
System.out.println("Space token alignment problem:" + e);
}
-
}
-
prevTokenEnd = tokenEnd;
-
}
-
-
fireProcessFinished();
- fireStatusChanged(document.getName() +
- " tokenised in " +
- NumberFormat.getInstance().format(
- (double)(System.currentTimeMillis() - startTime) / 1000) +
- " seconds!");
+ fireStatusChanged(document.getName()
+ + " tokenised in "
+ + NumberFormat.getInstance().format(
+ (double)(System.currentTimeMillis() - startTime) / 1000)
+ + " seconds!");
}
- public void setEncoding(String encoding) {
- this.encoding = encoding;
- }
-
- @Optional
- @RunTime
- @CreoleParameter(comment = "Input annotation set name", defaultValue = "")
- public void setInputASName(String newInputASName) {
- inputASName = newInputASName;
- }
-
- public String getInputASName() {
- return inputASName;
- }
-
- public String getEncoding() {
- return this.encoding;
- }
-
public String getOutputASName() {
return this.outputASName;
}
@@ -194,7 +162,6 @@
this.outputASName = outputASName;
}
-
public String getTokenLabel() {
return this.tokenLabel;
}
@@ -217,14 +184,9 @@
this.spaceLabel = spaceLabel;
}
- private String inputASName;
-
- private String encoding;
-
private String outputASName;
private String tokenLabel;
private String spaceLabel;
-
}
Modified:
gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/apps/EnglishDependencies.java
===================================================================
---
gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/apps/EnglishDependencies.java
2016-08-19 16:50:05 UTC (rev 19525)
+++
gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/apps/EnglishDependencies.java
2016-08-19 17:01:15 UTC (rev 19526)
@@ -15,7 +15,6 @@
* You should have received a copy of the GNU General Public License along with
* this program. If not, see <http://www.gnu.org/licenses/>.
*/
-
package gate.stanford.apps;
import gate.creole.PackagedController;
@@ -26,21 +25,18 @@
import java.net.URL;
import java.util.List;
-@CreoleResource(name="English Dependency Parser",
- comment = "Ready-made application for Stanford English parser",
- autoinstances = @AutoInstance)
+@CreoleResource(name = "English Dependency Parser", comment = "Ready-made application for Stanford English parser", autoinstances = @AutoInstance)
public class EnglishDependencies extends PackagedController {
-
private static final long serialVersionUID = 3163023140886167369L;
@Override
- @CreoleParameter(defaultValue="resources/sample_parser_en.gapp")
+ @CreoleParameter(defaultValue = "resources/sample_parser_en.gapp")
public void setPipelineURL(URL url) {
- this.url = url;
+ this.url = url;
}
-
+
@Override
- @CreoleParameter(defaultValue="Stanford Parser")
+ @CreoleParameter(defaultValue = "Stanford Parser")
public void setMenu(List<String> menu) {
super.setMenu(menu);
}
Modified:
gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/apps/EnglishPOSDependencies.java
===================================================================
---
gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/apps/EnglishPOSDependencies.java
2016-08-19 16:50:05 UTC (rev 19525)
+++
gate/branches/sawdust2/plugins/Stanford_CoreNLP/src/main/java/gate/stanford/apps/EnglishPOSDependencies.java
2016-08-19 17:01:15 UTC (rev 19526)
@@ -15,7 +15,6 @@
* You should have received a copy of the GNU General Public License along with
* this program. If not, see <http://www.gnu.org/licenses/>.
*/
-
package gate.stanford.apps;
import gate.creole.PackagedController;
@@ -26,21 +25,18 @@
import java.net.URL;
import java.util.List;
-@CreoleResource(name="English POS Tagger and Dependency Parser",
- comment = "Ready-made application for Stanford English POS tagger and parser",
- autoinstances = @AutoInstance)
+@CreoleResource(name = "English POS Tagger and Dependency Parser", comment = "Ready-made application for Stanford English POS tagger and parser", autoinstances = @AutoInstance)
public class EnglishPOSDependencies extends PackagedController {
-
private static final long serialVersionUID = 3163023140886167369L;
@Override
- @CreoleParameter(defaultValue="resources/sample_pos+parser_en.gapp")
+ @CreoleParameter(defaultValue = "resources/sample_pos+parser_en.gapp")
public void setPipelineURL(URL url) {
- this.url = url;
+ this.url = url;
}
-
+
@Override
- @CreoleParameter(defaultValue="Stanford Parser")
+ @CreoleParameter(defaultValue = "Stanford Parser")
public void setMenu(List<String> menu) {
super.setMenu(menu);
}
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs