Revision: 19514
http://sourceforge.net/p/gate/code/19514
Author: markagreenwood
Date: 2016-08-19 10:17:23 +0000 (Fri, 19 Aug 2016)
Log Message:
-----------
some cleaning up and fixing bugs highlighted by findbugs
Modified Paths:
--------------
gate/branches/sawdust2/plugins/Tagger_NP_Chunking/src/main/java/mark/chunking/Chunker.java
gate/branches/sawdust2/plugins/Tagger_NP_Chunking/src/main/java/mark/chunking/ChunkingApp.java
gate/branches/sawdust2/plugins/Tagger_NP_Chunking/src/main/java/mark/chunking/GATEWrapper.java
gate/branches/sawdust2/plugins/Tagger_NP_Chunking/src/main/java/mark/chunking/Rule.java
Modified:
gate/branches/sawdust2/plugins/Tagger_NP_Chunking/src/main/java/mark/chunking/Chunker.java
===================================================================
---
gate/branches/sawdust2/plugins/Tagger_NP_Chunking/src/main/java/mark/chunking/Chunker.java
2016-08-19 01:22:42 UTC (rev 19513)
+++
gate/branches/sawdust2/plugins/Tagger_NP_Chunking/src/main/java/mark/chunking/Chunker.java
2016-08-19 10:17:23 UTC (rev 19514)
@@ -1,5 +1,5 @@
/************************************************************************
- * Copyright (C) 2004-2009 The University of Sheffield *
+ * Copyright (C) 2004-2016 The University of Sheffield *
* Developed by Mark Greenwood <[email protected]> *
* *
* This program is free software; you can redistribute it and/or modify *
@@ -22,203 +22,105 @@
import gate.util.BomStrippingInputStreamReader;
import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileReader;
import java.io.IOException;
-import java.io.InputStreamReader;
+import java.io.Serializable;
import java.net.URL;
import java.util.ArrayList;
-import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
-import java.util.Map;
-public class Chunker
-{
- private List<Rule> rules = new ArrayList<Rule>();
+public class Chunker implements Serializable {
- public static void main(String args[]) throws Exception
- {
- Chunker c = new Chunker((new File(args[0])).toURI().toURL());
+ private static final long serialVersionUID = 9147365383638459068L;
- BufferedReader in = new BufferedReader(new FileReader(args[1]));
+ private List<Rule> rules = new ArrayList<Rule>();
- String line = in.readLine();
-
- Map<String,String> chunkTags = new HashMap<String,String>();
-
- while (line != null)
- {
- if (!line.trim().equals(""))
- {
- String[] tags = line.split(" ");
- chunkTags.put(tags[0],tags[1]);
- }
-
- line = in.readLine();
- }
-
- in.close();
-
- in = new BomStrippingInputStreamReader(System.in);
-
- line = in.readLine();
-
- while (line != null)
- {
- String[] tokens = line.split(" ");
-
- List<String> wl = new ArrayList<String>();
- List<String> tl = new ArrayList<String>();
- List<String> pl = new ArrayList<String>();
-
- for (int i = 0 ; i < tokens.length ; ++i)
- {
- String[] data = tokens[i].split("/");
-
- wl.add(data[0]);
- pl.add(data[1]);
-
- String ct = chunkTags.get(data[1]);
-
- if (ct == null) ct = "I";
-
- tl.add(ct);
- }
-
- tl = c.chunkSentence(wl,tl,pl);
-
- boolean inBaseNP = false;
- boolean lineBegin = true;
-
- for (int i = 0 ; i < wl.size() ; ++i)
- {
- String ct = tl.get(i);
-
- if (inBaseNP)
- {
- if (ct.equals("B"))
- {
- System.out.print(" ] [");
- }
- else if (ct.equals("O"))
- {
- System.out.print(" ]");
- inBaseNP = false;
- }
- }
- else
- {
- if (ct.equals("B") || ct.equals("I"))
- {
- if (!lineBegin)
System.out.print(" ");
- lineBegin = false;
- System.out.print("[");
- inBaseNP = true;
- }
- }
- if (!lineBegin) System.out.print(" ");
- lineBegin = false;
- System.out.print(wl.get(i) + "/" + pl.get(i));
- }
-
- if (inBaseNP)
- {
- System.out.print("]");
- }
-
- System.out.println();
-
- line = in.readLine();
- }
- }
-
/**
* The only constructor that reads the rules from a URL.
- * @param u the URL of the rules file.
+ *
+ * @param u
+ * the URL of the rules file.
**/
- public Chunker(URL u) throws IOException
- {
- //Open up the rules file read for reading
- BufferedReader in = new
BomStrippingInputStreamReader(u.openStream());
+ public Chunker(URL u) throws IOException {
+ // Open up the rules file ready for reading
+ try (BufferedReader in = new BomStrippingInputStreamReader(
+ u.openStream())) {
- //read in the first rule from the file
- String rule = in.readLine();
+ // read in the first rule from the file
+ String rule = in.readLine();
- while (rule != null)
- {
- //while there are still rules to process...
+ while (rule != null) {
+ // while there are still rules to process...
- if (!rule.trim().equals(""))
- {
- //create and add a rule to the list of rules
- rules.add(new Rule(rule));
+ if (!rule.trim().equals("")) {
+ // create and add a rule to the list of
rules
+ rules.add(new Rule(rule));
+ }
+
+ // read in the next rule;
+ rule = in.readLine();
}
-
- //read in the next rule;
- rule = in.readLine();
}
}
/**
- * This is the method which does all the work and returns
- * an updated set of chunk tags.
- * @param words an ordered List of the words within the sentence.
- * @param tags an ordered List of the chunk tags within the sentence.
- * @param pos an ordered List of the POS tags within the sentence.
+ * This is the method which does all the work and returns an updated
set of
+ * chunk tags.
+ *
+ * @param words
+ * an ordered List of the words within the sentence.
+ * @param tags
+ * an ordered List of the chunk tags within the sentence.
+ * @param pos
+ * an ordered List of the POS tags within the sentence.
* @return an ordered List of the updated chunk tags for the sentence.
**/
- public List<String> chunkSentence(List<String> words, List<String>
tags, List<String> pos)
- {
- //add the word/pos/tag that represents the end of
- //the sentence, cos some of the rules match against
- //the end of the sentence
+ public List<String> chunkSentence(List<String> words, List<String> tags,
+ List<String> pos) {
+ // add the word/pos/tag that represents the end of
+ // the sentence, cos some of the rules match against
+ // the end of the sentence
words.add("ZZZ");
pos.add("ZZZ");
tags.add("Z");
- //Get an iterator over the rules and loop
- //through them...
+ // Get an iterator over the rules and loop
+ // through them...
Iterator<Rule> it = rules.iterator();
- while (it.hasNext())
- {
- //create an empty list to hold the new
- //chunk tags for this iterations
+ while (it.hasNext()) {
+ // create an empty list to hold the new
+ // chunk tags for this iterations
List<String> newTags = new ArrayList<String>();
- //get the next rule we are going to apply
+ // get the next rule we are going to apply
Rule r = it.next();
- //loop over all the words in the sentence
- for (int i = 0 ; i < words.size() ; ++i)
- {
- if (r.match(i,words,tags,pos))
- {
- //if the rule matches against the
current
- //word in the sentence then and the new
tag
- //from the rule to the new tag list
+ // loop over all the words in the sentence
+ for (int i = 0; i < words.size(); ++i) {
+ if (r.match(i, words, tags, pos)) {
+ // if the rule matches against the
current
+ // word in the sentence then add the
new tag
+ // from the rule to the new tag list
newTags.add(r.getNewTag());
- }
- else
- {
- //the rule didn't match so simply copy
the
- //chunk tag that was already assigned
+ } else {
+ // the rule didn't match so simply copy
the
+ // chunk tag that was already assigned
newTags.add(tags.get(i));
}
}
- //now replace the old tags with the new ones ready
- //for running the next rule, this stops rule-chaining
+ // now replace the old tags with the new ones ready
+ // for running the next rule, this stops rule-chaining
tags = newTags;
}
- //remove the last token from each list as these
- //are not part of the original input sentence
- words.remove(words.size()-1);
- pos.remove(pos.size()-1);
- tags.remove(tags.size()-1);
+ // remove the last token from each list as these
+ // are not part of the original input sentence
+ words.remove(words.size() - 1);
+ pos.remove(pos.size() - 1);
+ tags.remove(tags.size() - 1);
- //return the final updated chunk tag lists
+ // return the final updated chunk tag lists
return tags;
}
}
\ No newline at end of file
Modified:
gate/branches/sawdust2/plugins/Tagger_NP_Chunking/src/main/java/mark/chunking/ChunkingApp.java
===================================================================
---
gate/branches/sawdust2/plugins/Tagger_NP_Chunking/src/main/java/mark/chunking/ChunkingApp.java
2016-08-19 01:22:42 UTC (rev 19513)
+++
gate/branches/sawdust2/plugins/Tagger_NP_Chunking/src/main/java/mark/chunking/ChunkingApp.java
2016-08-19 10:17:23 UTC (rev 19514)
@@ -1,3 +1,22 @@
+/************************************************************************
+ * Copyright (C) 2004-2016 The University of Sheffield *
+ * Developed by Mark Greenwood <[email protected]> *
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU Lesser General Public License as *
+ * published by the Free Software Foundation; either version 2.1 of the *
+ * License, or (at your option) any later version. *
+ * *
+ * This program is distributed in the hope that it will be useful, *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+ * GNU General Public License for more details. *
+ * *
+ * You should have received a copy of the GNU Lesser General Public *
+ * License along with this program; if not, write to the Free Software *
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *
+ ************************************************************************/
+
package mark.chunking;
import gate.creole.PackagedController;
Modified:
gate/branches/sawdust2/plugins/Tagger_NP_Chunking/src/main/java/mark/chunking/GATEWrapper.java
===================================================================
---
gate/branches/sawdust2/plugins/Tagger_NP_Chunking/src/main/java/mark/chunking/GATEWrapper.java
2016-08-19 01:22:42 UTC (rev 19513)
+++
gate/branches/sawdust2/plugins/Tagger_NP_Chunking/src/main/java/mark/chunking/GATEWrapper.java
2016-08-19 10:17:23 UTC (rev 19514)
@@ -1,5 +1,5 @@
/************************************************************************
- * Copyright (C) 2004-2009 The University of Sheffield *
+ * Copyright (C) 2004-2016 The University of Sheffield *
* Developed by Mark Greenwood <[email protected]> *
* *
* This program is free software; you can redistribute it and/or modify *
@@ -23,7 +23,6 @@
import gate.AnnotationSet;
import gate.Factory;
import gate.FeatureMap;
-import gate.ProcessingResource;
import gate.Resource;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
@@ -37,7 +36,7 @@
import gate.util.OffsetComparator;
import java.io.BufferedReader;
-import java.io.Serializable;
+import java.io.IOException;
import java.net.URL;
import java.text.NumberFormat;
import java.util.ArrayList;
@@ -48,343 +47,340 @@
import java.util.Map;
@CreoleResource(name = "Noun Phrase Chunker", comment = "Implementation of the
Ramshaw and Marcus base noun phrase chunker", helpURL =
"http://gate.ac.uk/userguide/sec:parsers:npchunker", icon = "NpChunker")
-public class GATEWrapper extends AbstractLanguageAnalyser implements
- ProcessingResource,
- Serializable {
-
+public class GATEWrapper extends AbstractLanguageAnalyser {
+
private static final long serialVersionUID = -801244032207014722L;
- private Chunker c = null;
+ private Chunker c = null;
- private Map<String,String> chunkTags = null;
+ private Map<String, String> chunkTags = null;
- private OffsetComparator offsetComparator = new OffsetComparator();
+ private URL posTagURL;
- private URL posTagURL;
+ @CreoleParameter(defaultValue = "resources/pos_tag_dict", comment =
"The URL of the pos_tag_dict file.")
+ public void setPosTagURL(URL posTagURL) {
+ this.posTagURL = posTagURL;
+ }
- @CreoleParameter(defaultValue="resources/pos_tag_dict",comment="The URL of
the pos_tag_dict file.")
- public void setPosTagURL(URL posTagURL) {
- this.posTagURL = posTagURL;
- }
+ public URL getPosTagURL() {
+ return posTagURL;
+ }
- public URL getPosTagURL() {
- return posTagURL;
- }
+ private URL rulesURL;
- private URL rulesURL;
+ @CreoleParameter(defaultValue = "resources/rules", comment = "The URL
of the rules file.")
+ public void setRulesURL(URL rulesURL) {
+ this.rulesURL = rulesURL;
+ }
- @CreoleParameter(defaultValue="resources/rules",comment="The URL of the
rules file.")
- public void setRulesURL(URL rulesURL) {
- this.rulesURL = rulesURL;
- }
+ public URL getRulesURL() {
+ return rulesURL;
+ }
- public URL getRulesURL() {
- return rulesURL;
- }
+ private String posFeature;
- private String posFeature;
+ @RunTime
+ @CreoleParameter(defaultValue = "category", comment = "The name of the
feature which holds the POS tag.")
+ public void setPosFeature(String posFeature) {
+ this.posFeature = posFeature;
+ }
- @RunTime
- @CreoleParameter(defaultValue="category",comment="The name of the feature
which holds the POS tag.")
- public void setPosFeature(String posFeature) {
- this.posFeature = posFeature;
- }
+ public String getPosFeature() {
+ return posFeature;
+ }
- public String getPosFeature() {
- return posFeature;
- }
+ private String unknownTag;
- private String unknownTag;
+ @RunTime
+ @CreoleParameter(defaultValue = "I", comment = "The chunk tag to use
for an unknown POS tag.")
+ public void setUnknownTag(String unknownTag) {
+ this.unknownTag = unknownTag;
+ }
- @RunTime
- @CreoleParameter(defaultValue="I",comment="The chunk tag to use for an
unknown POS tag.")
- public void setUnknownTag(String unknownTag) {
- this.unknownTag = unknownTag;
- }
+ public String getUnknownTag() {
+ return unknownTag;
+ }
- public String getUnknownTag() {
- return unknownTag;
- }
+ private String inputASName;
- private String inputASName;
+ @Optional
+ @RunTime
+ @CreoleParameter(comment = "The name of the annotation set used for
input.")
+ public void setInputASName(String inputASName) {
+ this.inputASName = inputASName;
+ }
- @Optional
- @RunTime
- @CreoleParameter(comment="The name of the annotation set used for input.")
- public void setInputASName(String inputASName) {
- this.inputASName = inputASName;
- }
+ public String getInputASName() {
+ return inputASName;
+ }
- public String getInputASName() {
- return inputASName;
- }
+ private String outputASName;
- private String outputASName;
+ @Optional
+ @RunTime
+ @CreoleParameter(comment = "The name of the annotation set used for
output.")
+ public void setOutputASName(String outputASName) {
+ this.outputASName = outputASName;
+ }
- @Optional
- @RunTime
- @CreoleParameter(comment="The name of the annotation set used for output.")
- public void setOutputASName(String outputASName) {
- this.outputASName = outputASName;
- }
+ public String getOutputASName() {
+ return outputASName;
+ }
- public String getOutputASName() {
- return outputASName;
- }
+ private String annotationName;
- private String annotationName;
+ @RunTime
+ @CreoleParameter(defaultValue = "NounChunk", comment = "The name of the
annotations added to mark noun chunks.")
+ public void setAnnotationName(String annotationName) {
+ this.annotationName = annotationName;
+ }
- @RunTime
- @CreoleParameter(defaultValue="NounChunk",comment="The name of the
annotations added to mark noun chunks.")
- public void setAnnotationName(String annotationName) {
- this.annotationName = annotationName;
- }
+ public String getAnnotationName() {
+ return annotationName;
+ }
- public String getAnnotationName() {
- return annotationName;
- }
+ public Resource init() throws ResourceInstantiationException {
+ if (rulesURL == null) {
+ throw new ResourceInstantiationException(
+ "Rules URL must be specified");
+ }
- public Resource init() throws ResourceInstantiationException {
- if(rulesURL == null) {
- throw new ResourceInstantiationException("Rules URL must be specified");
- }
+ if (posTagURL == null) {
+ throw new ResourceInstantiationException(
+ "POS tag dictionary URL must be
specified");
+ }
- if(posTagURL == null) {
- throw new ResourceInstantiationException(
- "POS tag dictionary URL must be specified");
- }
+ try (BufferedReader in = new BomStrippingInputStreamReader(
+ posTagURL.openStream())) {
+ // lets create a new Chunker using the URL provided
(which we know
+ // is not null as we already checked it).
+ c = new Chunker(rulesURL);
- try {
- // lets create a new Chunker using the URL provided (which we know
- // is not null as we already checked it).
- c = new Chunker(rulesURL);
+ // read in the first line of the file
+ String line = in.readLine();
- // Open a reader over the pos_tag_dict file so we can load
- // the database
- BufferedReader in = new BomStrippingInputStreamReader(posTagURL
- .openStream());
+ // create a new empty map to hold the pos and chunk tags
+ chunkTags = new HashMap<String, String>();
- // read in the first line of the file
- String line = in.readLine();
+ while (line != null) {
+ // while there is still data in the file...
- // create a new empty map to hold the pos and chunk tags
- chunkTags = new HashMap<String,String>();
+ // split the current line into two parts
+ String[] tags = line.split(" ");
- while(line != null) {
- // while there is still data in the file...
+ // put the data in the map, POS tags as key
+ // chunk tag as value
+ chunkTags.put(tags[0], tags[1]);
- // split the current line into two parts
- String[] tags = line.split(" ");
+ // get the next line from the data file
+ line = in.readLine();
+ }
- // put the data in the map, POS tags as key
- // chunk tag as value
- chunkTags.put(tags[0], tags[1]);
+ // close the data file now we have finished with it
+ in.close();
+ } catch (IOException e) {
+ // if an error occurred then throw an exception so that
the user
+ // knows
+ throw new ResourceInstantiationException(
+ "Unable to correctly init the chunker:
" + e.getMessage());
+ }
- // get the next line from the data file
- line = in.readLine();
- }
+ // if we get to here then everything has initialised correctly
+ // so return this instance
+ return this;
+ }
- // close the data file now we have finished with it
- in.close();
- }
- catch(Exception e) {
- // if an error occurred then throw an exception so that the user
- // knows
- throw new ResourceInstantiationException(
- "Unable to correctly init the chunker: " + e.getMessage());
- }
+ public void execute() throws ExecutionException {
+ // lets get the AnnotationSet we are using as input. Get either
the
+ // set the user has asked for or if they haven't specified use
the
+ // default set
+ if (inputASName != null && inputASName.equals(""))
+ inputASName = null;
+ AnnotationSet inputAS = (inputASName == null) ? document
+ .getAnnotations() :
document.getAnnotations(inputASName);
- // if we get to here then everything has initialised correctly
- // so return this instance
- return this;
- }
+ // lets get the AnnotationSet we are using as output. Get
either the
+ // set the user has asked for or if they haven't specified use
the
+ // default set
+ if (outputASName != null && outputASName.equals(""))
+ outputASName = null;
+ AnnotationSet outputAS = (outputASName == null) ? document
+ .getAnnotations() :
document.getAnnotations(outputASName);
- public void execute() throws ExecutionException {
- // lets get the AnnotationSet we are using as input. Get either the
- // set the user has asked for or if they haven't specified use the
- // default set
- if(inputASName != null && inputASName.equals("")) inputASName = null;
- AnnotationSet inputAS = (inputASName == null)
- ? document.getAnnotations()
- : document.getAnnotations(inputASName);
+ // Get the set of sentences contained within the current
document
+ AnnotationSet sentences = inputAS.get(SENTENCE_ANNOTATION_TYPE);
- // lets get the AnnotationSet we are using as output. Get either the
- // set the user has asked for or if they haven't specified use the
- // default set
- if(outputASName != null && outputASName.equals("")) outputASName = null;
- AnnotationSet outputAS = (outputASName == null)
- ? document.getAnnotations()
- : document.getAnnotations(outputASName);
+ // All annotations of type tokens
+ AnnotationSet tokenas = inputAS.get(TOKEN_ANNOTATION_TYPE);
- // Get the set of sentences contained within the current document
- AnnotationSet sentences = inputAS.get(SENTENCE_ANNOTATION_TYPE);
+ if (sentences != null && sentences.size() > 0) {
+ // assuming there are sentences...
- // All annotations of type tokens
- AnnotationSet tokenas = inputAS.get(TOKEN_ANNOTATION_TYPE);
+ // get the current time to use as part of the progress
feedback
+ long startTime = System.currentTimeMillis();
- if(sentences != null && sentences.size() > 0) {
- // assuming there are sentences...
+ // tell the user we are just starting to chunk the
document
+ fireStatusChanged("Chunking " + document.getName());
+ fireProgressChanged(0);
- // get the current time to use as part of the progress feedback
- long startTime = System.currentTimeMillis();
+ // we are just starting so we haven't processed a
document yet
+ // so remember this ready for the progress feedback
+ int i = 0;
- // tell the user we are just starting to chunk the document
- fireStatusChanged("Chunking " + document.getName());
- fireProgressChanged(0);
+ // Loop through all the sentences
+ Iterator<Annotation> sit = sentences.iterator();
+ while (sit.hasNext()) {
+ // get the current sentence to process
+ Annotation sentence = sit.next();
- // we are just starting so we haven't processed a document yet
- // so remember this ready for the progress feedback
- int i = 0;
+ // Get a sorted list of the tokens within the
current sentence
+ List<Annotation> tokens = new
ArrayList<Annotation>();
+
tokens.addAll(tokenas.getContained(sentence.getStartNode()
+ .getOffset(),
sentence.getEndNode().getOffset()));
+ Collections.sort(tokens, new
OffsetComparator());
- // Loop through all the sentences
- Iterator<Annotation> sit = sentences.iterator();
- while(sit.hasNext()) {
- // get the current sentence to process
- Annotation sentence = sit.next();
+ // Create three empty lists to hold the words,
pos and chunk
+ // tags of the tokens in the current sentence
+ List<String> wl = new ArrayList<String>();
+ List<String> tl = new ArrayList<String>();
+ List<String> pl = new ArrayList<String>();
- // Get a sorted list of the tokens within the current sentence
- List<Annotation> tokens = new ArrayList<Annotation>();
- tokens.addAll(tokenas.getContained(sentence.getStartNode().getOffset(),
- sentence.getEndNode().getOffset()));
- Collections.sort(tokens, offsetComparator);
+ // Loop through all the tokens in the current
sentence
+ Iterator<Annotation> tit = tokens.iterator();
+ while (tit.hasNext()) {
+ // get the current token to process
+ Annotation token = tit.next();
- // Create three empty lists to hold the words, pos and chunk
- // tags of the tokens in the current sentence
- List<String> wl = new ArrayList<String>();
- List<String> tl = new ArrayList<String>();
- List<String> pl = new ArrayList<String>();
+ // add the string spanned by the
current token to the list
+ // of
+ // words
+ wl.add((String)
token.getFeatures().get("string"));
- // Loop through all the tokens in the current sentence
- Iterator<Annotation> tit = tokens.iterator();
- while(tit.hasNext()) {
- // get the current token to process
- Annotation token = tit.next();
+ // get the POS tag for the current token
+ String pos = (String)
token.getFeatures().get(posFeature);
- // add the string spanned by the current token to the list of
- // words
- wl.add((String)token.getFeatures().get("string"));
+ // add the POS tag to the list of POS
tags
+ pl.add(pos);
- // get the POS tag for the current token
- String pos = (String)token.getFeatures().get(posFeature);
+ // get the initial chunk tag for this
POS tag
+ String chunkTag = chunkTags.get(pos);
- // add the POS tag to the list of POS tags
- pl.add(pos);
+ // if the chunk tag is null then use
the unknown chunk tag
+ if (chunkTag == null)
+ chunkTag = unknownTag;
- // get the initial chunk tag for this POS tag
- String chunkTag = chunkTags.get(pos);
+ // now add the chunk tag to the list of
chunk tags
+ tl.add(chunkTag);
+ }
- // if the chunk tag is null then use the unknown chunk tag
- if(chunkTag == null) chunkTag = unknownTag;
+ // run the chunker over the current sentence
and get back
+ // an updated list of chunk tags
+ tl = c.chunkSentence(wl, tl, pl);
- // now add the chunk tag to the list of chunk tags
- tl.add(chunkTag);
- }
+ // a variable to hold the index of the token
which
+ // starts the current noun chunk
+ int start = 0;
- // run the chunker over the current sentence and get back
- // an updated list of chunk tags
- tl = c.chunkSentence(wl, tl, pl);
+ // a flag so we know if we are in an NP or not
+ boolean inBaseNP = false;
- // a variable to hold the index of the token which
- // starts the current noun chunk
- int start = 0;
+ // Loop through all the chunk tags in the
current sentence
+ // so we can find the noun chunks
+ for (int tIndex = 0; tIndex < tl.size();
++tIndex) {
+ // get the current chunk tag
+ String ct = tl.get(tIndex);
- // a flag so we know if we are in an NP or not
- boolean inBaseNP = false;
+ if (inBaseNP) {
+ // if we are currently inside a
noun chunk then...
- // Loop through all the chunk tags in the current sentence
- // so we can find the noun chunks
- for(int tIndex = 0; tIndex < tl.size(); ++tIndex) {
- // get the current chunk tag
- String ct = tl.get(tIndex);
+ if (ct.equals("B")) {
+ // if the chunk tag is
"B" then we are about to
+ // start a
+ // new chunk so record
the one that has just
+ // finished
+ addAnnotation(outputAS,
tokens, start, tIndex - 1);
- if(inBaseNP) {
- // if we are currently inside a noun chunk then...
+ // now reset the
beginning of the chunk to the
+ // current
+ // token
+ start = tIndex;
+ } else if (ct.equals("O")) {
+ // if the chunk tag is
"O" then we have dropped out
+ // the end of a chunk
so add the chunk we just
+ // finished
+ addAnnotation(outputAS,
tokens, start, tIndex - 1);
- if(ct.equals("B")) {
- // if the chunk tag is "B" then we are about to start a
- // new chunk so record the one that has just finished
- addAnnotation(outputAS, tokens, start, tIndex - 1);
+ // now flag that we are
outside of any chunk
+ inBaseNP = false;
+ }
+ } else {
+ // we aren't currently in a
noun chunk so...
- // now reset the beginning of the chunk to the current
- // token
- start = tIndex;
- }
- else if(ct.equals("O")) {
- // if the chunk tag is "O" then we have dropped out
- // the end of a chunk so add the chunk we just finished
- addAnnotation(outputAS, tokens, start, tIndex - 1);
+ if (ct.equals("B") ||
ct.equals("I")) {
+ // if the chunk tag is
"B" or "I" then we have found
+ // the beginning of a
chunk, so....
- // now flag that we are outside of any chunk
- inBaseNP = false;
- }
- }
- else {
- // we aren't currently in a noun chunk so...
+ // record the start
index
+ start = tIndex;
- if(ct.equals("B") || ct.equals("I")) {
- // if the chunk tag is "B" or "I" then we have found
- // the beginning of a chunk, so....
+ // and flag that we are
now inside a chunk
+ inBaseNP = true;
+ }
+ }
+ }
- // record the start index
- start = tIndex;
+ if (inBaseNP) {
+ // if we got to the end of a sentence
and we are still in a
+ // noun chunk then we need to close the
end and add the
+ // annotation
+ addAnnotation(outputAS, tokens, start,
tl.size() - 1);
+ }
- // and flag that we are now inside a chunk
- inBaseNP = true;
- }
- }
- }
+ // update the progress stuff to show the
percentage of sentences
+ // we have processed so far
+ fireProgressChanged(i++ * 100 /
sentences.size());
+ }
- if(inBaseNP) {
- // if we got to the end of a sentence and we are still in a
- // noun chunk then we need to close the end and add the
- // annotation
- addAnnotation(outputAS, tokens, start, tl.size() - 1);
- }
+ // we have finished! so update the progress and tell
+ // the user how long it took to chunk the document
+ fireProcessFinished();
+ fireStatusChanged(document.getName()
+ + " chunked in "
+ + NumberFormat
+ .getInstance()
+ .format((double)
(System.currentTimeMillis() - startTime) / 1000)
+ + " seconds!");
+ } else {
+ // if there are no sentence annotations then throw an
exception as
+ // there's
+ // not much we can do
+ throw new GateRuntimeException(
+ "No sentences to process! Please run a
sentence splitter first!");
+ }
+ }
- // update the progress stuff to show the precentage of sentences
- // we have processed so far
- fireProgressChanged(i++ * 100 / sentences.size());
- }
+ private void addAnnotation(AnnotationSet outputAS, List<Annotation>
tokens,
+ int start, int end) {
+ // Create a new FeatureMap to act as the features for the new
+ // annotation
+ // but we will leave it blank for now as we don't have anything
to
+ // add
+ FeatureMap params = Factory.newFeatureMap();
- // we have finished! so update the progress and tell
- // the user how long it took to chunk the document
- fireProcessFinished();
- fireStatusChanged(document.getName()
- + " chunked in "
- + NumberFormat.getInstance().format(
- (double)(System.currentTimeMillis() - startTime) / 1000)
- + " seconds!");
- }
- else {
- // if there are no sentence annotations then throw an exception as
- // theres
- // not much we can do
- throw new GateRuntimeException(
- "No sentences to process! Please run a sentence splitter
first!");
- }
- }
+ // Get the token annotation from the beginning of the chunk
+ Annotation aStart = tokens.get(start);
- private void addAnnotation(AnnotationSet outputAS, List<Annotation> tokens,
int start,
- int end) {
- // Create a new FeatureMap to act as the features for the new
- // annotation
- // but we will leave it blank for now as we don't have anything to
- // add
- FeatureMap params = Factory.newFeatureMap();
+ // Get the token annotation from the end of the chunk
+ Annotation aEnd = tokens.get(end);
- // Get the token annotation from the beginning of the chunk
- Annotation aStart = tokens.get(start);
+ // This spots errors where the start is after the end. What
+ // we should do is figure out why this occurs in the first place
+ if (aStart.getStartNode().getOffset().longValue() >=
aEnd.getEndNode()
+ .getOffset().longValue())
+ return;
- // Get the token annotation from the end of the chunk
- Annotation aEnd = tokens.get(end);
-
- // This spots errors where the start is after the end. What
- // we should do is figure out why this occurs in the first place
- if(aStart.getStartNode().getOffset().longValue() >= aEnd.getEndNode()
- .getOffset().longValue()) return;
-
- // add a new annotation to mark the noun chunk
- outputAS.add(aStart.getStartNode(), aEnd.getEndNode(), annotationName,
- params);
- }
+ // add a new annotation to mark the noun chunk
+ outputAS.add(aStart.getStartNode(), aEnd.getEndNode(),
annotationName,
+ params);
+ }
}
Modified:
gate/branches/sawdust2/plugins/Tagger_NP_Chunking/src/main/java/mark/chunking/Rule.java
===================================================================
---
gate/branches/sawdust2/plugins/Tagger_NP_Chunking/src/main/java/mark/chunking/Rule.java
2016-08-19 01:22:42 UTC (rev 19513)
+++
gate/branches/sawdust2/plugins/Tagger_NP_Chunking/src/main/java/mark/chunking/Rule.java
2016-08-19 10:17:23 UTC (rev 19514)
@@ -1,5 +1,5 @@
/************************************************************************
- * Copyright (C) 2004-2009 The University of Sheffield *
+ * Copyright (C) 2004-2016 The University of Sheffield *
* Developed by Mark Greenwood <[email protected]> *
* *
* This program is free software; you can redistribute it and/or modify *
@@ -19,18 +19,19 @@
package mark.chunking;
+import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
- * This class encapulates chunking rules, providing methods to
- * see if it matches against text and to see if it fits within
- * a sentence.
+ * This class encapsulates chunking rules, providing methods to see if it
+ * matches against text and to see if it fits within a sentence.
**/
-public class Rule
-{
+public class Rule implements Serializable {
+ private static final long serialVersionUID = 787395188113920930L;
+
/**
* A Pattern which will split the rule name into type and offsets.
**/
@@ -42,32 +43,28 @@
private static final Pattern po = Pattern.compile("_?[0-3]");
/**
- * The new chunk tag that is assigned if this rule matches the
- * input text.
+ * The new chunk tag that is assigned if this rule matches the input text.
**/
private String outTag = null;
/**
- * The full line of the rules file which created this rule,
- * used mainly for the toString() method.
+ * The full line of the rules file which created this rule, used mainly for
+ * the toString() method.
**/
private String rule = null;
/**
- * A List to hold the types (T, W or P)
- * of the parts of the rule.
+ * A List to hold the types (T, W or P) of the parts of the rule.
**/
private List<String> types = new ArrayList<String>();
/**
- * A List to hold the offsets for the
- * parts of the rule.
+ * A List to hold the offsets for the parts of the rule.
**/
private List<List<Integer>> offsets = new ArrayList<List<Integer>>();
/**
- * A List to hold the values for the
- * parts of the rule.
+ * A List to hold the values for the parts of the rule.
**/
private List<String> values = new ArrayList<String>();
@@ -81,174 +78,180 @@
**/
private int end = Integer.MIN_VALUE;
- public Rule(String rule)
- {
- //store a copy of the defining text
+ public Rule(String rule) {
+ // store a copy of the defining text
this.rule = rule;
- //split the rule into pieces at the spaces
+ // split the rule into pieces at the spaces
String[] parts = rule.split(" ");
- //store the last part of the rule as the out tag
- outTag = parts[parts.length-1];
+ // store the last part of the rule as the out tag
+ outTag = parts[parts.length - 1];
- //Use the Pattern to split the template type
- //into the different token/offsets
+ // Use the Pattern to split the template type
+ // into the different token/offsets
Matcher mt = pt.matcher(parts[0]);
- //We know that the first value is at position 1
- //in the split array
+ // We know that the first value is at position 1
+ // in the split array
int index = 1;
- while (mt.find())
- {
- //while there are still parts to process,
- //get the next one
+ while (mt.find()) {
+ // while there are still parts to process,
+ // get the next one
String to = mt.group();
- //store the type of this part
- types.add(to.substring(0,1));
+ // store the type of this part
+ types.add(to.substring(0, 1));
- //create a new list to hold the offsets
- //for this part
+ // create a new list to hold the offsets
+ // for this part
List<Integer> ofs = new ArrayList<Integer>();
- //split the offsets into separate parts
+ // split the offsets into separate parts
Matcher mo = po.matcher(to.substring(1));
- //store the value associated with this part of the rule
+ // store the value associated with this part of the rule
values.add(parts[index++]);
- while (mo.find())
- {
- //while there are more offsets,
+ while (mo.find()) {
+ // while there are more offsets,
- //get the next one and make an Integer from it
-        //get the next one and make an Integer from it
-        //(we have to replace '_' by '-' first for it to work)
-        Integer offset = new Integer(mo.group().replaceAll("_","-"));
+        // get the next one and make an Integer from it
+        // (we have to replace '_' by '-' first for it to work)
+        Integer offset = Integer.valueOf(mo.group().replaceAll("_", "-"));
-        //if the current offset is before the known beginning then
-        //make this the beginning
-        if (offset.intValue() < begin) begin = offset.intValue();
+        // if the current offset is before the known beginning then
+        // make this the beginning
+        if (offset.intValue() < begin)
+          begin = offset.intValue();
-        //if the current offset is after the known ending then
-        //make this offset the end
-        if (offset.intValue() > end) end = offset.intValue();
+        // if the current offset is after the known ending then
+        // make this offset the end
+        if (offset.intValue() > end)
+          end = offset.intValue();
- //store the offset in the list
+ // store the offset in the list
ofs.add(offset);
}
- //store the list of offsets for this part
+ // store the list of offsets for this part
offsets.add(ofs);
}
}
/**
- * Simply returns true if this rule matches against the sentence at
- * a given position. This method makes no alterations to the
- * tags assigned to any specific offset.
- * @param currentToken the index within the sentece of the token
- * upon which the rule is centered.
- * @param words an ordered List of the words within the sentence.
- * @param tags an ordered List of the chunk tags within the sentence.
- * @param pos an ordered List of the POS tags within the sentence.
+ * Simply returns true if this rule matches against the sentence at a given
+ * position. This method makes no alterations to the tags assigned to any
+ * specific offset.
+ *
+ * @param currentToken
+ *            the index within the sentence of the token upon which the rule
+ *            is centered.
+ * @param words
+ * an ordered List of the words within the sentence.
+ * @param tags
+ * an ordered List of the chunk tags within the sentence.
+ * @param pos
+ * an ordered List of the POS tags within the sentence.
* @return true if the rule matches the input sentence, false otherwise.
**/
- public boolean match(int currentToken, List<String> words, List<String> tags, List<String> pos)
- {
- //if the rule doesn't fit within the sentence then it can never
- //match so simply return false
- if (!withinSentence(words.size(), currentToken)) return false;
+ public boolean match(int currentToken, List<String> words,
+ List<String> tags, List<String> pos) {
+ // if the rule doesn't fit within the sentence then it can never
+ // match so simply return false
+ if (!withinSentence(words.size(), currentToken))
+ return false;
- //assume the rule will match
+ // assume the rule will match
boolean matched = true;
- //loop through all the parts of this rule
- for (int i = 0 ; i < types.size() ; ++i)
- {
- //get the current type
+ // loop through all the parts of this rule
+ for (int i = 0; i < types.size(); ++i) {
+ // get the current type
String type = types.get(i);
- //get the list of offsets for the part
+ // get the list of offsets for the part
List<Integer> ofs = offsets.get(i);
- //get the value for this part
+ // get the value for this part
String value = values.get(i);
- //A placeholder for the right list
+ // A placeholder for the right list
List<String> working = null;
- if (type.equals("T"))
- {
- //if the type is "T" then the list we
- //are going to work on contains chunk tags
+ if (type.equals("T")) {
+ // if the type is "T" then the list we
+ // are going to work on contains chunk tags
working = tags;
- }
- else if (type.equals("W"))
- {
- //if the type is "W" then the list we
- //are going to work on contains words
+ } else if (type.equals("W")) {
+ // if the type is "W" then the list we
+ // are going to work on contains words
working = words;
- }
- else if (type.equals("P"))
- {
- //if the type is "P" then the list we
- //are going to work on contains POS tags
+ } else if (type.equals("P")) {
+ // if the type is "P" then the list we
+ // are going to work on contains POS tags
working = pos;
+ } else {
+ // we have an invalid rule so we can't match it
+ return false;
}
- //get the first (maybe the only) offset for this part
+ // get the first (maybe the only) offset for this part
int offset = ofs.get(0).intValue();
-      //does the value of this offset match the value given in the rule
-      boolean matchOffset = working.get(currentToken+offset).equals(value);
+      // does the value of this offset match the value given in the rule
+      boolean matchOffset = working.get(currentToken + offset).equals(
+          value);
- for (int j = 1 ; j < ofs.size() ; ++j)
- {
- //if there is more than one offset then...
+ for (int j = 1; j < ofs.size(); ++j) {
+ // if there is more than one offset then...
- //get the next offset
+ // get the next offset
offset = ofs.get(j).intValue();
-        //or the truth of matching the value in the rule against
-        //the value of the offset
-        matchOffset = matchOffset || working.get(currentToken+offset).equals(value);
+        // or the truth of matching the value in the rule against
+        // the value of the offset
+        matchOffset = matchOffset
+            || working.get(currentToken + offset).equals(value);
}
-      //combine the success/failure of matching this part with that
-      //of matching the rest of the rule
+      // combine the success/failure of matching this part with that
+      // of matching the rest of the rule
matched = matched && matchOffset;
- //if we have failed to match there is no point trying
- //to match the rest of the rule so jump out of this loop
- if (!matched) i = types.size();
+ // if we have failed to match there is no point trying
+      // to match the rest of the rule so jump out of this loop
+ if (!matched)
+ i = types.size();
}
- //return the result of matching we have found
+ // return the result of matching we have found
return matched;
}
/**
* Simply returns the new chunk tag to use if this rule matched.
+ *
* @return the new chunk tag.
**/
- public String getNewTag()
- {
- //simply return the out tag
+ public String getNewTag() {
+ // simply return the out tag
return outTag;
}
/**
- * A method which allows you to check that this rule fits within
- * the sentence when centered on a specific token.
- * @param numTokens the total number of tokens in the sentence.
- * @param currentToken the index of the token upon which the
- * rule is going to be centered.
+ * A method which allows you to check that this rule fits within the
+ * sentence when centered on a specific token.
+ *
+ * @param numTokens
+ * the total number of tokens in the sentence.
+ * @param currentToken
+ * the index of the token upon which the rule is going to be
+ * centered.
* @return true if the rule fits within the sentence, false otherwise.
**/
- public boolean withinSentence(int numTokens, int currentToken)
- {
+ public boolean withinSentence(int numTokens, int currentToken) {
int start = currentToken + begin;
int finish = currentToken + end;
@@ -259,9 +262,9 @@
return within;
}
- @Override public String toString()
- {
- //simply return the line of the rules file
+ @Override
+ public String toString() {
+ // simply return the line of the rules file
return rule;
}
}
\ No newline at end of file
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs