OpenNLP.java

rwesten Thu, 02 Aug 2012 08:26:23 -0700

Author: rwesten
Date: Thu Aug  2 15:25:34 2012
New Revision: 1368534

URL: http://svn.apache.org/viewvc?rev=1368534&view=rev
Log:
implementation for STANBOL-713 and STANBOL-705


Modified:
    
incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/OpenNLP.java

Modified: 
incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/OpenNLP.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/OpenNLP.java?rev=1368534&r1=1368533&r2=1368534&view=diff
==============================================================================
--- 
incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/OpenNLP.java
 (original)
+++ 
incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/OpenNLP.java
 Thu Aug  2 15:25:34 2012
@@ -24,9 +24,17 @@ import java.nio.charset.Charset;
 import java.util.HashMap;
 import java.util.Map;
 
+import opennlp.tools.chunker.Chunker;
+import opennlp.tools.chunker.ChunkerME;
 import opennlp.tools.chunker.ChunkerModel;
+import opennlp.tools.namefind.NameFinderME;
+import opennlp.tools.namefind.TokenNameFinder;
 import opennlp.tools.namefind.TokenNameFinderModel;
 import opennlp.tools.postag.POSModel;
+import opennlp.tools.postag.POSTagger;
+import opennlp.tools.postag.POSTaggerME;
+import opennlp.tools.sentdetect.SentenceDetector;
+import opennlp.tools.sentdetect.SentenceDetectorME;
 import opennlp.tools.sentdetect.SentenceModel;
 import opennlp.tools.tokenize.SimpleTokenizer;
 import opennlp.tools.tokenize.Tokenizer;
@@ -43,7 +51,12 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 /**
- * Core of our EnhancementEngine, separated from the OSGi service to make it 
easier to test this.
+ * OSGI service that lets you load OpenNLP Models via the Stanbol 
+ * {@link DataFileProvider} infrastructure. This allows users to copy models
+ * to the 'datafiles' directory or developers to provide models via OSGI
+ * bundles.<p>
+ * This service also provides methods that directly return the OpenNLP 
component
+ * wrapping the model.
  */
 @Component(immediate=true)
 @Service(value=OpenNLP.class)
@@ -95,6 +108,24 @@ public class OpenNLP {
         return initModel(String.format("%s-sent.bin", language),
             SentenceModel.class);
     }
+    
+    /**
+     * Getter for the sentence detector of the parsed language. 
+     * @param language the language
+     * @return the sentence detector or <code>null</code> if no model data are found
+     * @throws InvalidFormatException in case the found model data are in the 
wrong format
+     * @throws IOException on any error while reading the model data
+     */
+    public SentenceDetector getSentenceDetector(String language) throws 
IOException {
+        SentenceModel sentModel = getSentenceModel(language);
+        if(sentModel != null){
+            return new SentenceDetectorME(sentModel);
+        } else {
+            log.debug("No Sentence Detection Model for language 
'{}'",language);
+            return null;
+        }
+    }
+    
     /**
      * Getter for the named entity finder model for the parsed entity type and 
language.
      * If the model is not yet available a new one is built. The required data
@@ -109,6 +140,25 @@ public class OpenNLP {
         return initModel(String.format("%s-ner-%s.bin", language, type),
             TokenNameFinderModel.class);
     }
+    
+    /**
+     * Getter for the {@link TokenNameFinder} for the parsed entity type and 
language.
+     * @param type the type of the named entities to find (person, 
organization)
+     * @param language the language
+     * @return the name finder or <code>null</code> if no model data are found
+     * @throws InvalidFormatException in case the found model data are in the 
wrong format
+     * @throws IOException on any error while reading the model data
+     */
+    public TokenNameFinder getNameFinder(String type, String language) throws 
IOException {
+        TokenNameFinderModel model = getNameModel(type, language);
+        if(model != null){
+            return new NameFinderME(model);
+        } else {
+            log.debug("TokenNameFinder model for type {} and langauge {} not 
present",type,language);
+            return null;
+        }
+    }
+    
     /**
      * Getter for the tokenizer model for the parsed language.
      * If the model is not yet available a new one is built. The required data
@@ -155,7 +205,7 @@ public class OpenNLP {
         return tokenizer;
     }
     /**
-     * Getter for the "part-of-speach" model for the parsed language.
+     * Getter for the "part-of-speech" model for the parsed language.
      * If the model is not yet available a new one is built. The required data
      * are loaded by using the {@link DataFileProvider} service.  
      * @param language the language
@@ -190,6 +240,41 @@ public class OpenNLP {
         }
         return model;
     }
+    
+    /**
+     * Getter for the "part-of-speech" tagger for the parsed language.
+     * @param language the language
+     * @return the tagger or <code>null</code> if no model data are found
+     * @throws InvalidFormatException in case the found model data are in the 
wrong format
+     * @throws IOException on any error while reading the model data
+     */
+    public POSTagger getPartOfSpeechTagger(String language) throws IOException 
{
+        POSModel posModel = getPartOfSpeachModel(language);
+        if(posModel != null){
+            return new POSTaggerME(posModel);
+        } else {
+            log.debug("No POS Model for language '{}'",language);
+            return null;
+        }
+    }
+    
+    /**
+     * Getter for the Model with the parsed type, name and properties.
+     * @param modelType the type of the Model (e.g. {@link ChunkerModel})
+     * @param modelName the name of the model file. MUST BE available via the
+     * {@link DataFileProvider}.
+     * @param properties additional properties about the model (parsed to the
+     * {@link DataFileProvider}). NOTE that "Description", "Model Type" and
+     * "Download Location" are set to default values if not defined in the
+     * parsed value.
+     * @return the loaded (or cached) model
+     * @throws InvalidFormatException in case the found model data are in the 
wrong format
+     * @throws IOException on any error while reading the model data
+     */
+    public <T> T getModel(Class<T> modelType,String modelName, 
Map<String,String> properties) throws InvalidFormatException, IOException {
+        return initModel(modelName, modelType, properties);
+    }
+    
     /**
      * Getter for the chunker model for the parsed language.
      * If the model is not yet available a new one is built. The required data
@@ -203,6 +288,23 @@ public class OpenNLP {
         return initModel(String.format("%s-chunker.bin", language), 
ChunkerModel.class);
     }
     
+    /**
+     * Getter for the {@link Chunker} for a given language
+     * @param language the language
+     * @return the {@link Chunker} or <code>null</code> if no model is present
+     * @throws InvalidFormatException in case the found model data are in the 
wrong format
+     * @throws IOException on any error while reading the model data
+     */
+    public Chunker getChunker(String language) throws IOException {
+        ChunkerModel chunkerModel = getChunkerModel(language);
+        if(chunkerModel != null){
+             return new ChunkerME(chunkerModel);
+        } else {
+            log.debug("No Chunker Model for language {}",language);
+            return null;
+        }
+    }
+    
 //    /**
 //     * Activates the component and re-enables all {@link DataFileProvider}s
 //     * previously {@link #registerModelLocation(BundleContext, String...) 
registered}.
@@ -330,8 +432,23 @@ public class OpenNLP {
      * @throws IOException on any error while loading the model data
      * @throws IllegalStateException on any Exception while creating the model
      */
-    @SuppressWarnings("unchecked")
     private <T> T initModel(String name,Class<T> modelType) throws 
InvalidFormatException, IOException {
+        return initModel(name, modelType,null);
+    }
+    /**
+     * Uses generics to build models of the parsed type. The {@link #models}
+     * map is used to lookup already created models.
+     * @param <T> the type of the model to create
+     * @param name the name of the file with the model data
+     * @param modelType the class object representing the model to create
+     * @param modelProperties additional metadata about the requested model
+     * @return the model or <code>null</code> if the model data were not found
+     * @throws InvalidFormatException if the model data are in an invalid 
format
+     * @throws IOException on any error while loading the model data
+     * @throws IllegalStateException on any Exception while creating the model
+     */
+    @SuppressWarnings("unchecked")
+    private <T> T initModel(String name,Class<T> modelType, Map<String,String> 
modelProperties) throws InvalidFormatException, IOException {
         Object model = models.get(name);
         if(model != null) {
             if(modelType.isAssignableFrom(model.getClass())){
@@ -342,10 +459,20 @@ public class OpenNLP {
                     name,model.getClass(),modelType));
             }
         } else { //create new model
-            Map<String,String> modelProperties = new HashMap<String,String>();
-            modelProperties.put("Description", "Statistical model for 
OpenNLP");
-            modelProperties.put("Model Type:", modelType.getSimpleName());
-            modelProperties.put("Download Location", DOWNLOAD_ROOT+name);
+            if(modelProperties != null){ //copy the data to avoid external 
modifications
+                modelProperties = new HashMap<String,String>(modelProperties);
+            }else {
+                modelProperties = new HashMap<String,String>();
+            }
+            if(!modelProperties.containsKey("Description")){
+                modelProperties.put("Description", "Statistical model for 
OpenNLP");
+            }
+            if(!modelProperties.containsKey("Model Type")){
+                modelProperties.put("Model Type", modelType.getSimpleName());
+            }
+            if(!modelProperties.containsKey("Download Location")){
+                modelProperties.put("Download Location", DOWNLOAD_ROOT+name);
+            }
             InputStream modelDataStream;
             try {
                 modelDataStream = lookupModelStream(name,modelProperties);

svn commit: r1368534 - /incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/OpenNLP.java

Reply via email to