Revision: 19514
http://sourceforge.net/p/gate/code/19514
Author: markagreenwood
Date: 2016-08-19 10:17:23 +0000 (Fri, 19 Aug 2016)
Log Message:
-----------
some cleaning up and fixing bugs highlighted by findbugs
Modified Paths:
--------------
gate/branches/sawdust2/plugins/Tagger_NP_Chunking/src/main/java/mark/chunking/Chunker.java
gate/branches/sawdust2/plugins/Tagger_NP_Chunking/src/main/java/mark/chunking/ChunkingApp.java
gate/branches/sawdust2/plugins/Tagger_NP_Chunking/src/main/java/mark/chunking/GATEWrapper.java
gate/branches/sawdust2/plugins/Tagger_NP_Chunking/src/main/java/mark/chunking/Rule.java
Modified:
gate/branches/sawdust2/plugins/Tagger_NP_Chunking/src/main/java/mark/chunking/Chunker.java
===================================================================
---
gate/branches/sawdust2/plugins/Tagger_NP_Chunking/src/main/java/mark/chunking/Chunker.java
2016-08-19 01:22:42 UTC (rev 19513)
+++
gate/branches/sawdust2/plugins/Tagger_NP_Chunking/src/main/java/mark/chunking/Chunker.java
2016-08-19 10:17:23 UTC (rev 19514)
@@ -1,5 +1,5 @@
/************************************************************************
- * Copyright (C) 2004-2009 The University of Sheffield *
+ * Copyright (C) 2004-2016 The University of Sheffield *
* Developed by Mark Greenwood <[email protected]> *
* *
* This program is free software; you can redistribute it and/or modify *
@@ -22,203 +22,105 @@
import gate.util.BomStrippingInputStreamReader;
import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileReader;
import java.io.IOException;
-import java.io.InputStreamReader;
+import java.io.Serializable;
import java.net.URL;
import java.util.ArrayList;
-import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
-import java.util.Map;
-public class Chunker
-{
- private List<Rule> rules = new ArrayList<Rule>();
+public class Chunker implements Serializable {
- public static void main(String args[]) throws Exception
- {
- Chunker c = new Chunker((new File(args[0])).toURI().toURL());
+ private static final long serialVersionUID = 9147365383638459068L;
- BufferedReader in = new BufferedReader(new FileReader(args[1]));
+ private List<Rule> rules = new ArrayList<Rule>();
- String line = in.readLine();
-
- Map<String,String> chunkTags = new HashMap<String,String>();
-
- while (line != null)
- {
- if (!line.trim().equals(""))
- {
- String[] tags = line.split(" ");
- chunkTags.put(tags[0],tags[1]);
- }
-
- line = in.readLine();
- }
-
- in.close();
-
- in = new BomStrippingInputStreamReader(System.in);
-
- line = in.readLine();
-
- while (line != null)
- {
- String[] tokens = line.split(" ");
-
- List<String> wl = new ArrayList<String>();
- List<String> tl = new ArrayList<String>();
- List<String> pl = new ArrayList<String>();
-
- for (int i = 0 ; i < tokens.length ; ++i)
- {
- String[] data = tokens[i].split("/");
-
- wl.add(data[0]);
- pl.add(data[1]);
-
- String ct = chunkTags.get(data[1]);
-
- if (ct == null) ct = "I";
-
- tl.add(ct);
- }
-
- tl = c.chunkSentence(wl,tl,pl);
-
- boolean inBaseNP = false;
- boolean lineBegin = true;
-
- for (int i = 0 ; i < wl.size() ; ++i)
- {
- String ct = tl.get(i);
-
- if (inBaseNP)
- {
- if (ct.equals("B"))
- {
- System.out.print(" ] [");
- }
- else if (ct.equals("O"))
- {
- System.out.print(" ]");
- inBaseNP = false;
- }
- }
- else
- {
- if (ct.equals("B") || ct.equals("I"))
- {
- if (!lineBegin)
System.out.print(" ");
- lineBegin = false;
- System.out.print("[");
- inBaseNP = true;
- }
- }
- if (!lineBegin) System.out.print(" ");
- lineBegin = false;
- System.out.print(wl.get(i) + "/" + pl.get(i));
- }
-
- if (inBaseNP)
- {
- System.out.print("]");
- }
-
- System.out.println();
-
- line = in.readLine();
- }
- }
-
/**
* The only constructor that reads the rules from a URL.
- * @param u the URL of the rules file.
+ *
+ * @param u
+ * the URL of the rules file.
**/
- public Chunker(URL u) throws IOException
- {
- //Open up the rules file read for reading
- BufferedReader in = new
BomStrippingInputStreamReader(u.openStream());
+ public Chunker(URL u) throws IOException {
+ // Open up the rules file ready for reading
+ try (BufferedReader in = new BomStrippingInputStreamReader(
+ u.openStream())) {
- //read in the first rule from the file
- String rule = in.readLine();
+ // read in the first rule from the file
+ String rule = in.readLine();
- while (rule != null)
- {
- //while there are still rules to process...
+ while (rule != null) {
+ // while there are still rules to process...
- if (!rule.trim().equals(""))
- {
- //create and add a rule to the list of rules
- rules.add(new Rule(rule));
+ if (!rule.trim().equals("")) {
+ // create and add a rule to the list of
rules
+ rules.add(new Rule(rule));
+ }
+
+ // read in the next rule;
+ rule = in.readLine();
}
-
- //read in the next rule;
- rule = in.readLine();
}
}
/**
- * This is the method which does all the work and returns
- * an updated set of chunk tags.
- * @param words an ordered List of the words within the sentence.
- * @param tags an ordered List of the chunk tags within the sentence.
- * @param pos an ordered List of the POS tags within the sentence.
+ * This is the method which does all the work and returns an updated
set of
+ * chunk tags.
+ *
+ * @param words
+ * an ordered List of the words within the sentence.
+ * @param tags
+ * an ordered List of the chunk tags within the sentence.
+ * @param pos
+ * an ordered List of the POS tags within the sentence.
* @return an ordered List of the updated chunk tags for the sentence.
**/
- public List<String> chunkSentence(List<String> words, List<String>
tags, List<String> pos)
- {
- //add the word/pos/tag that represents the end of
- //the sentence, cos some of the rules match against
- //the end of the sentence
+ public List<String> chunkSentence(List<String> words, List<String> tags,
+ List<String> pos) {
+ // add the word/pos/tag that represents the end of
+ // the sentence, cos some of the rules match against
+ // the end of the sentence
words.add("ZZZ");
pos.add("ZZZ");
tags.add("Z");
- //Get an iterator over the rules and loop
- //through them...
+ // Get an iterator over the rules and loop
+ // through them...
Iterator<Rule> it = rules.iterator();
- while (it.hasNext())
- {
- //create an empty list to hold the new
- //chunk tags for this iterations
+ while (it.hasNext()) {
+ // create an empty list to hold the new
+ // chunk tags for this iterations
List<String> newTags = new ArrayList<String>();
- //get the next rule we are going to apply
+ // get the next rule we are going to apply
Rule r = it.next();
- //loop over all the words in the sentence
- for (int i = 0 ; i < words.size() ; ++i)
- {
- if (r.match(i,words,tags,pos))
- {
- //if the rule matches against the
current
- //word in the sentence then and the new
tag
- //from the rule to the new tag list
+ // loop over all the words in the sentence
+ for (int i = 0; i < words.size(); ++i) {
+ if (r.match(i, words, tags, pos)) {
+ // if the rule matches against the
current
+ // word in the sentence then add the
new tag
+ // from the rule to the new tag list
newTags.add(r.getNewTag());
- }
- else
- {
- //the rule didn't match so simply copy
the
- //chunk tag that was already assigned
+ } else {
+ // the rule didn't match so simply copy
the
+ // chunk tag that was already assigned
newTags.add(tags.get(i));
}
}
- //now replace the old tags with the new ones ready
- //for running the next rule, this stops rule-chaining
+ // now replace the old tags with the new ones ready
+ // for running the next rule, this stops rule-chaining
tags = newTags;
}
- //remove the last token from each list as these
- //are not part of the original input sentence
- words.remove(words.size()-1);
- pos.remove(pos.size()-1);
- tags.remove(tags.size()-1);
+ // remove the last token from each list as these
+ // are not part of the original input sentence
+ words.remove(words.size() - 1);
+ pos.remove(pos.size() - 1);
+ tags.remove(tags.size() - 1);
- //return the final updated chunk tag lists
+ // return the final updated chunk tag lists
return tags;
}
}
\ No newline at end of file
Modified:
gate/branches/sawdust2/plugins/Tagger_NP_Chunking/src/main/java/mark/chunking/ChunkingApp.java
===================================================================
---
gate/branches/sawdust2/plugins/Tagger_NP_Chunking/src/main/java/mark/chunking/ChunkingApp.java
2016-08-19 01:22:42 UTC (rev 19513)
+++
gate/branches/sawdust2/plugins/Tagger_NP_Chunking/src/main/java/mark/chunking/ChunkingApp.java
2016-08-19 10:17:23 UTC (rev 19514)
@@ -1,3 +1,22 @@
+/************************************************************************
+ * Copyright (C) 2004-2016 The University of Sheffield *
+ * Developed by Mark Greenwood <[email protected]> *
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU Lesser General Public License as *
+ * published by the Free Software Foundation; either version 2.1 of the *
+ * License, or (at your option) any later version. *
+ * *
+ * This program is distributed in the hope that it will be useful, *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+ * GNU General Public License for more details. *
+ * *
+ * You should have received a copy of the GNU Lesser General Public *
+ * License along with this program; if not, write to the Free Software *
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *
+ ************************************************************************/
+
package mark.chunking;
import gate.creole.PackagedController;
Modified:
gate/branches/sawdust2/plugins/Tagger_NP_Chunking/src/main/java/mark/chunking/GATEWrapper.java
===================================================================
---
gate/branches/sawdust2/plugins/Tagger_NP_Chunking/src/main/java/mark/chunking/GATEWrapper.java
2016-08-19 01:22:42 UTC (rev 19513)
+++
gate/branches/sawdust2/plugins/Tagger_NP_Chunking/src/main/java/mark/chunking/GATEWrapper.java
2016-08-19 10:17:23 UTC (rev 19514)
@@ -1,5 +1,5 @@
/************************************************************************
- * Copyright (C) 2004-2009 The University of Sheffield *
+ * Copyright (C) 2004-2016 The University of Sheffield *
* Developed by Mark Greenwood <[email protected]> *
* *
* This program is free software; you can redistribute it and/or modify *
@@ -23,7 +23,6 @@
import gate.AnnotationSet;
import gate.Factory;
import gate.FeatureMap;
-import gate.ProcessingResource;
import gate.Resource;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
@@ -37,7 +36,7 @@
import gate.util.OffsetComparator;
import java.io.BufferedReader;
-import java.io.Serializable;
+import java.io.IOException;
import java.net.URL;
import java.text.NumberFormat;
import java.util.ArrayList;
@@ -48,343 +47,340 @@
import java.util.Map;
@CreoleResource(name = "Noun Phrase Chunker", comment = "Implementation of the
Ramshaw and Marcus base noun phrase chunker", helpURL =
"http://gate.ac.uk/userguide/sec:parsers:npchunker", icon = "NpChunker")
-public class GATEWrapper extends AbstractLanguageAnalyser implements
- ProcessingResource,
- Serializable {
-
+public class GATEWrapper extends AbstractLanguageAnalyser {
+
private static final long serialVersionUID = -801244032207014722L;
- private Chunker c = null;
+ private Chunker c = null;
- private Map<String,String> chunkTags = null;
+ private Map<String, String> chunkTags = null;
- private OffsetComparator offsetComparator = new OffsetComparator();
+ private URL posTagURL;
- private URL posTagURL;
+ @CreoleParameter(defaultValue = "resources/pos_tag_dict", comment =
"The URL of the pos_tag_dict file.")
+ public void setPosTagURL(URL posTagURL) {
+ this.posTagURL = posTagURL;
+ }
- @CreoleParameter(defaultValue="resources/pos_tag_dict",comment="The URL of
the pos_tag_dict file.")
- public void setPosTagURL(URL posTagURL) {
- this.posTagURL = posTagURL;
- }
+ public URL getPosTagURL() {
+ return posTagURL;
+ }
- public URL getPosTagURL() {
- return posTagURL;
- }
+ private URL rulesURL;
- private URL rulesURL;
+ @CreoleParameter(defaultValue = "resources/rules", comment = "The URL
of the rules file.")
+ public void setRulesURL(URL rulesURL) {
+ this.rulesURL = rulesURL;
+ }
- @CreoleParameter(defaultValue="resources/rules",comment="The URL of the
rules file.")
- public void setRulesURL(URL rulesURL) {
- this.rulesURL = rulesURL;
- }
+ public URL getRulesURL() {
+ return rulesURL;
+ }
- public URL getRulesURL() {
- return rulesURL;
- }
+ private String posFeature;
- private String posFeature;
+ @RunTime
+ @CreoleParameter(defaultValue = "category", comment = "The name of the
feature which holds the POS tag.")
+ public void setPosFeature(String posFeature) {
+ this.posFeature = posFeature;
+ }
- @RunTime
- @CreoleParameter(defaultValue="category",comment="The name of the feature
which holds the POS tag.")
- public void setPosFeature(String posFeature) {
- this.posFeature = posFeature;
- }
+ public String getPosFeature() {
+ return posFeature;
+ }
- public String getPosFeature() {
- return posFeature;
- }
+ private String unknownTag;
- private String unknownTag;
+ @RunTime
+ @CreoleParameter(defaultValue = "I", comment = "The chunk tag to use
for an unknown POS tag.")
+ public void setUnknownTag(String unknownTag) {
+ this.unknownTag = unknownTag;
+ }
- @RunTime
- @CreoleParameter(defaultValue="I",comment="The chunk tag to use for an
unknown POS tag.")
- public void setUnknownTag(String unknownTag) {
- this.unknownTag = unknownTag;
- }
+ public String getUnknownTag() {
+ return unknownTag;
+ }
- public String getUnknownTag() {
- return unknownTag;
- }
+ private String inputASName;
- private String inputASName;
+ @Optional
+ @RunTime
+ @CreoleParameter(comment = "The name of the annotation set used for
input.")
+ public void setInputASName(String inputASName) {
+ this.inputASName = inputASName;
+ }
- @Optional
- @RunTime
- @CreoleParameter(comment="The name of the annotation set used for input.")
- public void setInputASName(String inputASName) {
- this.inputASName = inputASName;
- }
+ public String getInputASName() {
+ return inputASName;
+ }
- public String getInputASName() {
- return inputASName;
- }
+ private String outputASName;
- private String outputASName;
+ @Optional
+ @RunTime
+ @CreoleParameter(comment = "The name of the annotation set used for
output.")
+ public void setOutputASName(String outputASName) {
+ this.outputASName = outputASName;
+ }
- @Optional
- @RunTime
- @CreoleParameter(comment="The name of the annotation set used for output.")
- public void setOutputASName(String outputASName) {
- this.outputASName = outputASName;
- }
+ public String getOutputASName() {
+ return outputASName;
+ }
- public String getOutputASName() {
- return outputASName;
- }
+ private String annotationName;
- private String annotationName;
+ @RunTime
+ @CreoleParameter(defaultValue = "NounChunk", comment = "The name of the
annotations added to mark noun chunks.")
+ public void setAnnotationName(String annotationName) {
+ this.annotationName = annotationName;
+ }
- @RunTime
- @CreoleParameter(defaultValue="NounChunk",comment="The name of the
annotations added to mark noun chunks.")
- public void setAnnotationName(String annotationName) {
- this.annotationName = annotationName;
- }
+ public String getAnnotationName() {
+ return annotationName;
+ }
- public String getAnnotationName() {
- return annotationName;
- }
+ public Resource init() throws ResourceInstantiationException {
+ if (rulesURL == null) {
+ throw new ResourceInstantiationException(
+ "Rules URL must be specified");
+ }
- public Resource init() throws ResourceInstantiationException {
- if(rulesURL == null) {
- throw new ResourceInstantiationException("Rules URL must be specified");
- }
+ if (posTagURL == null) {
+ throw new ResourceInstantiationException(
+ "POS tag dictionary URL must be
specified");
+ }
- if(posTagURL == null) {
- throw new ResourceInstantiationException(
- "POS tag dictionary URL must be specified");
- }
+ try (BufferedReader in = new BomStrippingInputStreamReader(
+ posTagURL.openStream())) {
+ // lets create a new Chunker using the URL provided
(which we know
+ // is not null as we already checked it).
+ c = new Chunker(rulesURL);
- try {
- // lets create a new Chunker using the URL provided (which we know
- // is not null as we already checked it).
- c = new Chunker(rulesURL);
+ // read in the first line of the file
+ String line = in.readLine();
- // Open a reader over the pos_tag_dict file so we can load
- // the database
- BufferedReader in = new BomStrippingInputStreamReader(posTagURL
- .openStream());
+ // create a new empty map to hold the pos and chunk tags
+ chunkTags = new HashMap<String, String>();
- // read in the first line of the file
- String line = in.readLine();
+ while (line != null) {
+ // while there is still data in the file...
- // create a new empty map to hold the pos and chunk tags
- chunkTags = new HashMap<String,String>();
+ // split the current line into two parts
+ String[] tags = line.split(" ");
- while(line != null) {
- // while there is still data in the file...
+ // put the data in the map, POS tags as key
+ // chunk tag as value
+ chunkTags.put(tags[0], tags[1]);
- // split the current line into two parts
- String[] tags = line.split(" ");
+ // get the next line from the data file
+ line = in.readLine();
+ }
- // put the data in the map, POS tags as key
- // chunk tag as value
- chunkTags.put(tags[0], tags[1]);
+ // close the data file now we have finished with it
+ in.close();
+ } catch (IOException e) {
+ // if an error occurred then throw an exception so that
the user
+ // knows
+ throw new ResourceInstantiationException(
+ "Unable to correctly init the chunker:
" + e.getMessage());
+ }
- // get the next line from the data file
- line = in.readLine();
- }
+ // if we get to here then everything has initialised correctly
+ // so return this instance
+ return this;
+ }
- // close the data file now we have finished with it
- in.close();
- }
- catch(Exception e) {
- // if an error occurred then throw an exception so that the user
- // knows
- throw new ResourceInstantiationException(
- "Unable to correctly init the chunker: " + e.getMessage());
- }
+ public void execute() throws ExecutionException {
+ // lets get the AnnotationSet we are using as input. Get either
the
+ // set the user has asked for or if they haven't specified use
the
+ // default set
+ if (inputASName != null && inputASName.equals(""))
+ inputASName = null;
+ AnnotationSet inputAS = (inputASName == null) ? document
+ .getAnnotations() :
document.getAnnotations(inputASName);
- // if we get to here then everything has initialised correctly
- // so return this instance
- return this;
- }
+ // lets get the AnnotationSet we are using as output. Get
either the
+ // set the user has asked for or if they haven't specified use
the
+ // default set
+ if (outputASName != null && outputASName.equals(""))
+ outputASName = null;
+ AnnotationSet outputAS = (outputASName == null) ? document
+ .getAnnotations() :
document.getAnnotations(outputASName);
- public void execute() throws ExecutionException {
- // lets get the AnnotationSet we are using as input. Get either the
- // set the user has asked for or if they haven't specified use the
- // default set
- if(inputASName != null && inputASName.equals("")) inputASName = null;
- AnnotationSet inputAS = (inputASName == null)
- ? document.getAnnotations()
- : document.getAnnotations(inputASName);
+ // Get the set of sentences contained within the current
document
+ AnnotationSet sentences = inputAS.get(SENTENCE_ANNOTATION_TYPE);
- // lets get the AnnotationSet we are using as output. Get either the
- // set the user has asked for or if they haven't specified use the
- // default set
- if(outputASName != null && outputASName.equals("")) outputASName = null;
- AnnotationSet outputAS = (outputASName == null)
- ? document.getAnnotations()
- : document.getAnnotations(outputASName);
+ // All annotations of type tokens
+ AnnotationSet tokenas = inputAS.get(TOKEN_ANNOTATION_TYPE);
- // Get the set of sentences contained within the current document
- AnnotationSet sentences = inputAS.get(SENTENCE_ANNOTATION_TYPE);
+ if (sentences != null && sentences.size() > 0) {
+ // assuming there are sentences...
- // All annotations of type tokens
- AnnotationSet tokenas = inputAS.get(TOKEN_ANNOTATION_TYPE);
+ // get the current time to use as part of the progress
feedback
+ long startTime = System.currentTimeMillis();
- if(sentences != null && sentences.size() > 0) {
- // assuming there are sentences...
+ // tell the user we are just starting to chunk the
document
+ fireStatusChanged("Chunking " + document.getName());
+ fireProgressChanged(0);
- // get the current time to use as part of the progress feedback
- long startTime = System.currentTimeMillis();
+ // we are just starting so we haven't processed a
document yet
+ // so remember this ready for the progress feedback
+ int i = 0;
- // tell the user we are just starting to chunk the document
- fireStatusChanged("Chunking " + document.getName());
- fireProgressChanged(0);
+ // Loop through all the sentences
+ Iterator<Annotation> sit = sentences.iterator();
+ while (sit.hasNext()) {
+ // get the current sentence to process
+ Annotation sentence = sit.next();
- // we are just starting so we haven't processed a document yet
- // so remember this ready for the progress feedback
- int i = 0;
+ // Get a sorted list of the tokens within the
current sentence
+ List<Annotation> tokens = new
ArrayList<Annotation>();
+
tokens.addAll(tokenas.getContained(sentence.getStartNode()
+ .getOffset(),
sentence.getEndNode().getOffset()));
+ Collections.sort(tokens, new
OffsetComparator());
- // Loop through all the sentences
- Iterator<Annotation> sit = sentences.iterator();
- while(sit.hasNext()) {
- // get the current sentence to process
- Annotation sentence = sit.next();
+ // Create three empty lists to hold the words,
pos and chunk
+ // tags of the tokens in the current sentence
+ List<String> wl = new ArrayList<String>();
+ List<String> tl = new ArrayList<String>();
+ List<String> pl = new ArrayList<String>();
- // Get a sorted list of the tokens within the current sentence
- List<Annotation> tokens = new ArrayList<Annotation>();
- tokens.addAll(tokenas.getContained(sentence.getStartNode().getOffset(),
- sentence.getEndNode().getOffset()));
- Collections.sort(tokens, offsetComparator);
+ // Loop through all the tokens in the current
sentence
+ Iterator<Annotation> tit = tokens.iterator();
+ while (tit.hasNext()) {
+ // get the current token to process
+ Annotation token = tit.next();
- // Create three empty lists to hold the words, pos and chunk
- // tags of the tokens in the current sentence
- List<String> wl = new ArrayList<String>();
- List<String> tl = new ArrayList<String>();
- List<String> pl = new ArrayList<String>();
+ // add the string spanned by the
current token to the list
+ // of
+ // words
+ wl.add((String)
token.getFeatures().get("string"));
- // Loop through all the tokens in the current sentence
- Iterator<Annotation> tit = tokens.iterator();
- while(tit.hasNext()) {
- // get the current token to process
- Annotation token = tit.next();
+ // get the POS tag for the current token
+ String pos = (String)
token.getFeatures().get(posFeature);
- // add the string spanned by the current token to the list of
- // words
- wl.add((String)token.getFeatures().get("string"));
+ // add the POS tag to the list of POS
tags
+ pl.add(pos);
- // get the POS tag for the current token
- String pos = (String)token.getFeatures().get(posFeature);
+ // get the initial chunk tag for this
POS tag
+ String chunkTag = chunkTags.get(pos);
- // add the POS tag to the list of POS tags
- pl.add(pos);
+ // if the chunk tag is null then use
the unknown chunk tag
+ if (chunkTag == null)
+ chunkTag = unknownTag;
- // get the initial chunk tag for this POS tag
- String chunkTag = chunkTags.get(pos);
+ // now add the chunk tag to the list of
chunk tags
+ tl.add(chunkTag);
+ }
- // if the chunk tag is null then use the unknown chunk tag
- if(chunkTag == null) chunkTag = unknownTag;
+ // run the chunker over the current sentence
and get back
+ // an updated list of chunk tags
+ tl = c.chunkSentence(wl, tl, pl);
- // now add the chunk tag to the list of chunk tags
- tl.add(chunkTag);
- }
+ // a variable to hold the index of the token
which
+ // starts the current noun chunk
+ int start = 0;
- // run the chunker over the current sentence and get back
- // an updated list of chunk tags
- tl = c.chunkSentence(wl, tl, pl);
+ // a flag so we know if we are in an NP or not
+ boolean inBaseNP = false;
- // a variable to hold the index of the token which
- // starts the current noun chunk
- int start = 0;
+ // Loop through all the chunk tags in the
current sentence
+ // so we can find the noun chunks
+ for (int tIndex = 0; tIndex < tl.size();
++tIndex) {
+ // get the current chunk tag
+ String ct = tl.get(tIndex);
- // a flag so we know if we are in an NP or not
- boolean inBaseNP = false;
+ if (inBaseNP) {
+ // if we are currently inside a
noun chunk then...
- // Loop through all the chunk tags in the current sentence
- // so we can find the noun chunks
- for(int tIndex = 0; tIndex < tl.size(); ++tIndex) {
- // get the current chunk tag
- String ct = tl.get(tIndex);
+ if (ct.equals("B")) {
+ // if the chunk tag is
"B" then we are about to
+ // start a
+ // new chunk so record
the one that has just
+ // finished
+ addAnnotation(outputAS,
tokens, start, tIndex - 1);
- if(inBaseNP) {
- // if we are currently inside a noun chunk then...
+ // now reset the
beginning of the chunk to the
+ // current
+ // token
+ start = tIndex;
+ } else if (ct.equals("O")) {
+ // if the chunk tag is
"O" then we have dropped out
+ // the end of a chunk
so add the chunk we just
+ // finished
+ addAnnotation(outputAS,
tokens, start, tIndex - 1);
- if(ct.equals("B")) {
- // if the chunk tag is "B" then we are about to start a
- // new chunk so record the one that has just finished
- addAnnotation(outputAS, tokens, start, tIndex - 1);
+ // now flag that we are
outside of any chunk
+ inBaseNP = false;
+ }
+ } else {
+ // we aren't currently in a
noun chunk so...
- // now reset the beginning of the chunk to the current
- // token
- start = tIndex;
- }
- else if(ct.equals("O")) {
- // if the chunk tag is "O" then we have dropped out
- // the end of a chunk so add the chunk we just finished
- addAnnotation(outputAS, tokens, start, tIndex - 1);
+ if (ct.equals("B") ||
ct.equals("I")) {
+ // if the chunk tag is
"B" or "I" then we have found
+ // the beginning of a
chunk, so....
- // now flag that we are outside of any chunk
- inBaseNP = false;
- }
- }
- else {
- // we aren't currently in a noun chunk so...
+ // record the start
index
+ start = tIndex;
- if(ct.equals("B") || ct.equals("I")) {
- // if the chunk tag is "B" or "I" then we have found
- // the beginning of a chunk, so....
+ // and flag that we are
now inside a chunk
+ inBaseNP = true;
+ }
+ }
+ }
- // record the start index
- start = tIndex;
+ if (inBaseNP) {
+ // if we got to the end of a sentence
and we are still in a
+ // noun chunk then we need to close the
end and add the
+ // annotation
+ addAnnotation(outputAS, tokens, start,
tl.size() - 1);
+ }
- // and flag that we are now inside a chunk
- inBaseNP = true;
- }
- }
- }
+ // update the progress stuff to show the
percentage of sentences
+ // we have processed so far
+ fireProgressChanged(i++ * 100 /
sentences.size());
+ }
- if(inBaseNP) {
- // if we got to the end of a sentence and we are still in a
- // noun chunk then we need to close the end and add the
- // annotation
- addAnnotation(outputAS, tokens, start, tl.size() - 1);
- }
+ // we have finished! so update the progress and tell
+ // the user how long it took to chunk the document
+ fireProcessFinished();
+ fireStatusChanged(document.getName()
+ + " chunked in "
+ + NumberFormat
+ .getInstance()
+ .format((double)
(System.currentTimeMillis() - startTime) / 1000)
+ + " seconds!");
+ } else {
+ // if there are no sentence annotations then throw an
exception as
+ // there's
+ // not much we can do
+ throw new GateRuntimeException(
+ "No sentences to process! Please run a
sentence splitter first!");
+ }
+ }
- // update the progress stuff to show the precentage of sentences
- // we have processed so far
- fireProgressChanged(i++ * 100 / sentences.size());
- }
+ private void addAnnotation(AnnotationSet outputAS, List<Annotation>
tokens,
+ int start, int end) {
+ // Create a new FeatureMap to act as the features for the new
+ // annotation
+ // but we will leave it blank for now as we don't have anything
to
+ // add
+ FeatureMap params = Factory.newFeatureMap();
- // we have finished! so update the progress and tell
- // the user how long it took to chunk the document
- fireProcessFinished();
- fireStatusChanged(document.getName()
- + " chunked in "
- + NumberFormat.getInstance().format(
- (double)(System.currentTimeMillis() - startTime) / 1000)
- + " seconds!");
- }
- else {
- // if there are no sentence annotations then throw an exception as
- // theres
- // not much we can do
- throw new GateRuntimeException(
- "No sentences to process! Please run a sentence splitter
first!");
- }
- }
+ // Get the token annotation from the beginning of the chunk
+ Annotation aStart = tokens.get(start);
- private void addAnnotation(AnnotationSet outputAS, List<Annotation> tokens,
int start,
- int end) {
- // Create a new FeatureMap to act as the features for the new
- // annotation
- // but we will leave it blank for now as we don't have anything to
- // add
- FeatureMap params = Factory.newFeatureMap();
+ // Get the token annotation from the end of the chunk
+ Annotation aEnd = tokens.get(end);
- // Get the token annotation from the beginning of the chunk
- Annotation aStart = tokens.get(start);
+ // This spots errors where the start is after the end. What
+ // we should do is figure out why this occurs in the first place
+ if (aStart.getStartNode().getOffset().longValue() >=
aEnd.getEndNode()
+ .getOffset().longValue())
+ return;
- // Get the token annotation from the end of the chunk
- Annotation aEnd = tokens.get(end);
-
- // This spots errors where the start is after the end. What
- // we should do is figure out why this occurs in the first place
- if(aStart.getStartNode().getOffset().longValue() >= aEnd.getEndNode()
- .getOffset().longValue()) return;
-
- // add a new annotation to mark the noun chunk
- outputAS.add(aStart.getStartNode(), aEnd.getEndNode(), annotationName,
- params);
- }
+ // add a new annotation to mark the noun chunk
+ outputAS.add(aStart.getStartNode(), aEnd.getEndNode(),
annotationName,
+ params);
+ }
}
Modified:
gate/branches/sawdust2/plugins/Tagger_NP_Chunking/src/main/java/mark/chunking/Rule.java
===================================================================
---
gate/branches/sawdust2/plugins/Tagger_NP_Chunking/src/main/java/mark/chunking/Rule.java
2016-08-19 01:22:42 UTC (rev 19513)
+++
gate/branches/sawdust2/plugins/Tagger_NP_Chunking/src/main/java/mark/chunking/Rule.java
2016-08-19 10:17:23 UTC (rev 19514)
@@ -1,5 +1,5 @@
/************************************************************************
- * Copyright (C) 2004-2009 The University of Sheffield *
+ * Copyright (C) 2004-2016 The University of Sheffield *
* Developed by Mark Greenwood <[email protected]> *
* *
* This program is free software; you can redistribute it and/or modify *
@@ -19,18 +19,19 @@
package mark.chunking;
+import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
- * This class encapulates chunking rules, providing methods to
- * see if it matches against text and to see if it fits within
- * a sentence.
+ * This class encapsulates chunking rules, providing methods to see if it
+ * matches against text and to see if it fits within a sentence.
**/
-public class Rule
-{
+public class Rule implements Serializable {
+ private static final long serialVersionUID = 787395188113920930L;
+
/**
* A Pattern which will split the rule name into type and offsets.
**/
@@ -42,32 +43,28 @@
private static final Pattern po = Pattern.compile("_?[0-3]");
/**
- * The new chunk tag that is assigned if this rule matches the
- * input text.
+ * The new chunk tag that is assigned if this rule matches the input text.
**/
private String outTag = null;
/**
- * The full line of the rules file which created this rule,
- * used mainly for the toString() method.
+ * The full line of the rules file which created this rule, used mainly for
+ * the toString() method.
**/
private String rule = null;
/**
- * A List to hold the types (T, W or P)
- * of the parts of the rule.
+ * A List to hold the types (T, W or P) of the parts of the rule.
**/
private List<String> types = new ArrayList<String>();
/**
- * A List to hold the offsets for the
- * parts of the rule.
+ * A List to hold the offsets for the parts of the rule.
**/
private List<List<Integer>> offsets = new ArrayList<List<Integer>>();
/**
- * A List to hold the values for the
- * parts of the rule.
+ * A List to hold the values for the parts of the rule.
**/
private List<String> values = new ArrayList<String>();
@@ -81,174 +78,180 @@
**/
private int end = Integer.MIN_VALUE;
- public Rule(String rule)
- {
- //store a copy of the defining text
+ public Rule(String rule) {
+ // store a copy of the defining text
this.rule = rule;
- //split the rule into pieces at the spaces
+ // split the rule into pieces at the spaces
String[] parts = rule.split(" ");
- //store the last part of the rule as the out tag
- outTag = parts[parts.length-1];
+ // store the last part of the rule as the out tag
+ outTag = parts[parts.length - 1];
- //Use the Pattern to split the template type
- //into the different token/offsets
+ // Use the Pattern to split the template type
+ // into the different token/offsets
Matcher mt = pt.matcher(parts[0]);
- //We know that the first value is at position 1
- //in the split array
+ // We know that the first value is at position 1
+ // in the split array
int index = 1;
- while (mt.find())
- {
- //while there are still parts to process,
- //get the next one
+ while (mt.find()) {
+ // while there are still parts to process,
+ // get the next one
String to = mt.group();
- //store the type of this part
- types.add(to.substring(0,1));
+ // store the type of this part
+ types.add(to.substring(0, 1));
- //create a new list to hold the offsets
- //for this part
+ // create a new list to hold the offsets
+ // for this part
List<Integer> ofs = new ArrayList<Integer>();
- //split the offsets into separate parts
+ // split the offsets into separate parts
Matcher mo = po.matcher(to.substring(1));
- //store the value associated with this part of the rule
+ // store the value associated with this part of the rule
values.add(parts[index++]);
- while (mo.find())
- {
- //while there are more offsets,
+ while (mo.find()) {
+ // while there are more offsets,
- //get the next one and make an Integer from it
-        //get the next one and make an Integer from it
-        //(we have to replace '_' by '-' first for it to work)
-        Integer offset = new Integer(mo.group().replaceAll("_","-"));
+        // get the next one and make an Integer from it
+        // (we have to replace '_' by '-' first for it to work)
+        Integer offset = Integer.valueOf(mo.group().replaceAll("_", "-"));
-        //if the current offset is before the known beginning then
-        //make this the beginning
-        if (offset.intValue() < begin) begin = offset.intValue();
+        // if the current offset is before the known beginning then
+        // make this the beginning
+        if (offset.intValue() < begin)
+          begin = offset.intValue();
-        //if the current offset is after the known ending then
-        //make this offset the end
-        if (offset.intValue() > end) end = offset.intValue();
+        // if the current offset is after the known ending then
+        // make this offset the end
+        if (offset.intValue() > end)
+          end = offset.intValue();
- //store the offset in the list
+ // store the offset in the list
ofs.add(offset);
}
- //store the list of offsets for this part
+ // store the list of offsets for this part
offsets.add(ofs);
}
}
/**
- * Simply returns true if this rule matches against the sentence at
- * a given position. This method makes no alterations to the
- * tags assigned to any specific offset.
- * @param currentToken the index within the sentece of the token
- * upon which the rule is centered.
- * @param words an ordered List of the words within the sentence.
- * @param tags an ordered List of the chunk tags within the sentence.
- * @param pos an ordered List of the POS tags within the sentence.
+ * Simply returns true if this rule matches against the sentence at a given
+ * position. This method makes no alterations to the tags assigned to any
+ * specific offset.
+ *
+ * @param currentToken
+ *            the index within the sentence of the token upon which the rule
+ *            is centered.
+ * @param words
+ * an ordered List of the words within the sentence.
+ * @param tags
+ * an ordered List of the chunk tags within the sentence.
+ * @param pos
+ * an ordered List of the POS tags within the sentence.
* @return true if the rule matches the input sentence, false otherwise.
**/
- public boolean match(int currentToken, List<String> words, List<String> tags, List<String> pos)
- {
- //if the rule doesn't fit within the sentence then it can never
- //match so simply return false
- if (!withinSentence(words.size(), currentToken)) return false;
+ public boolean match(int currentToken, List<String> words,
+ List<String> tags, List<String> pos) {
+ // if the rule doesn't fit within the sentence then it can never
+ // match so simply return false
+ if (!withinSentence(words.size(), currentToken))
+ return false;
- //assume the rule will match
+ // assume the rule will match
boolean matched = true;
- //loop through all the parts of this rule
- for (int i = 0 ; i < types.size() ; ++i)
- {
- //get the current type
+ // loop through all the parts of this rule
+ for (int i = 0; i < types.size(); ++i) {
+ // get the current type
String type = types.get(i);
- //get the list of offsets for the part
+ // get the list of offsets for the part
List<Integer> ofs = offsets.get(i);
- //get the value for this part
+ // get the value for this part
String value = values.get(i);
- //A placeholder for the right list
+ // A placeholder for the right list
List<String> working = null;
- if (type.equals("T"))
- {
- //if the type is "T" then the list we
- //are going to work on contains chunk tags
+ if (type.equals("T")) {
+ // if the type is "T" then the list we
+ // are going to work on contains chunk tags
working = tags;
- }
- else if (type.equals("W"))
- {
- //if the type is "W" then the list we
- //are going to work on contains words
+ } else if (type.equals("W")) {
+ // if the type is "W" then the list we
+ // are going to work on contains words
working = words;
- }
- else if (type.equals("P"))
- {
- //if the type is "P" then the list we
- //are going to work on contains POS tags
+ } else if (type.equals("P")) {
+ // if the type is "P" then the list we
+ // are going to work on contains POS tags
working = pos;
+ } else {
+ // we have an invalid rule so we can't match it
+ return false;
}
- //get the first (maybe the only) offset for this part
+ // get the first (maybe the only) offset for this part
int offset = ofs.get(0).intValue();
-      //does the value of this offset match the value given in the rule
-      boolean matchOffset = working.get(currentToken+offset).equals(value);
+      // does the value of this offset match the value given in the rule
+      boolean matchOffset = working.get(currentToken + offset).equals(
+          value);
- for (int j = 1 ; j < ofs.size() ; ++j)
- {
- //if there is more than one offset then...
+ for (int j = 1; j < ofs.size(); ++j) {
+ // if there is more than one offset then...
- //get the next offset
+ // get the next offset
offset = ofs.get(j).intValue();
-        //or the truth of matching the value in the rule against
-        //the value of the offset
-        matchOffset = matchOffset || working.get(currentToken+offset).equals(value);
+        // or the truth of matching the value in the rule against
+        // the value of the offset
+        matchOffset = matchOffset
+            || working.get(currentToken + offset).equals(value);
}
-      //combine the success/failure of matching this part with that
-      //of matching the rest of the rule
+      // combine the success/failure of matching this part with that
+      // of matching the rest of the rule
matched = matched && matchOffset;
- //if we have failed to match there is no point trying
- //to match the rest of the rule so jump out of this loop
- if (!matched) i = types.size();
+ // if we have failed to match there is no point trying
+      // to match the rest of the rule so jump out of this loop
+ if (!matched)
+ i = types.size();
}
- //return the result of matching we have found
+ // return the result of matching we have found
return matched;
}
/**
* Simply returns the new chunk tag to use if this rule matched.
+ *
* @return the new chunk tag.
**/
- public String getNewTag()
- {
- //simply return the out tag
+ public String getNewTag() {
+ // simply return the out tag
return outTag;
}
/**
- * A method which allows you to check that this rule fits within
- * the sentence when centered on a specific token.
- * @param numTokens the total number of tokens in the sentence.
- * @param currentToken the index of the token upon which the
- * rule is going to be centered.
+ * A method which allows you to check that this rule fits within the
+ * sentence when centered on a specific token.
+ *
+ * @param numTokens
+ * the total number of tokens in the sentence.
+ * @param currentToken
+ * the index of the token upon which the rule is going to be
+ * centered.
* @return true if the rule fits within the sentence, false otherwise.
**/
- public boolean withinSentence(int numTokens, int currentToken)
- {
+ public boolean withinSentence(int numTokens, int currentToken) {
int start = currentToken + begin;
int finish = currentToken + end;
@@ -259,9 +262,9 @@
return within;
}
- @Override public String toString()
- {
- //simply return the line of the rules file
+ @Override
+ public String toString() {
+ // simply return the line of the rules file
return rule;
}
}
\ No newline at end of file
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs