la...

ogrisel Tue, 22 Feb 2011 08:58:45 -0800

Author: ogrisel
Date: Tue Feb 22 16:58:15 2011
New Revision: 1073406

URL: http://svn.apache.org/viewvc?rev=1073406&view=rev
Log:
STANBOL-13 / STANBOL-90: upgrade the NER engine to use the OpenNLP 1.5 API and 
models packaged in the new defaultdata artifact


Modified:
    incubator/stanbol/trunk/enhancer/engines/opennlp-ner/pom.xml
    incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NamedEntityExtractionEnhancementEngine.java
    incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java
    incubator/stanbol/trunk/enhancer/launchers/full/src/main/bundles/list.xml
    incubator/stanbol/trunk/enhancer/launchers/lite/src/main/bundles/list.xml
    incubator/stanbol/trunk/enhancer/parent/pom.xml

Modified: incubator/stanbol/trunk/enhancer/engines/opennlp-ner/pom.xml
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/opennlp-ner/pom.xml?rev=1073406&r1=1073405&r2=1073406&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/opennlp-ner/pom.xml (original)
+++ incubator/stanbol/trunk/enhancer/engines/opennlp-ner/pom.xml Tue Feb 22 
16:58:15 2011
@@ -49,23 +49,25 @@
        <dependencies>
                <!-- obviously we need opennlp -->
                <dependency>
-                       <groupId>org.clojars.pjt</groupId>
+                       <groupId>org.clojars.zaxtax</groupId>
                        <artifactId>opennlp-tools</artifactId>
                        <scope>compile</scope>
                </dependency>
+               <dependency>
+                       <groupId>org.clojars.zaxtax</groupId>
+                       <artifactId>maxent</artifactId>
+                       <scope>compile</scope>
+               </dependency>
 
                <dependency>
                        <groupId>org.apache.stanbol</groupId>
                        
<artifactId>org.apache.stanbol.enhancer.servicesapi</artifactId>
                </dependency>
-               <!--
-                       TODO: remove dependency after the openNLP Models are no 
longer loaded
-                       via the AutotaggingProvider
-               -->
+
                <dependency>
                        <groupId>org.apache.stanbol</groupId>
-                       
<artifactId>org.apache.stanbol.enhancer.engines.autotagging</artifactId>
-               </dependency>           
+                       <artifactId>org.apache.stanbol.defaultdata</artifactId>
+               </dependency>
 
                <dependency>
                        <groupId>org.apache.clerezza</groupId>
@@ -119,16 +121,13 @@
                                                <Private-Package>
                                                        
org.apache.stanbol.enhancer.engines.opennlp.impl.*
                                </Private-Package>
-                                               
<Embed-Dependency>opennlp-tools,opennlp-maxent,trove</Embed-Dependency>
+                                               
<Embed-Dependency>opennlp-tools,maxent</Embed-Dependency>
                                                
<Embed-Transitive>true</Embed-Transitive>
                                                <Import-Package>
+                                                   
org.apache.stanbol.defaultdata.opennlp,
                                                        !net.didion.*,
-                                                       !gnu.getopt,
-                                                       !org.apache.log.*,
-                                                       !junit.framework.*,
-                                                       
!org.apache.avalon.framework.*,
                                                        *
-                                       </Import-Package>                       
                        
+                                       </Import-Package>
                                        </instructions>
                                </configuration>
                        </plugin>

Modified: 
incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NamedEntityExtractionEnhancementEngine.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NamedEntityExtractionEnhancementEngine.java?rev=1073406&r1=1073405&r2=1073406&view=diff
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NamedEntityExtractionEnhancementEngine.java
 (original)
+++ 
incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NamedEntityExtractionEnhancementEngine.java
 Tue Feb 22 16:58:15 2011
@@ -16,11 +16,18 @@
  */
 package org.apache.stanbol.enhancer.engines.opennlp.impl;
 
-import java.io.DataInputStream;
+import static 
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_RELATION;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_TYPE;
+import static 
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_CONFIDENCE;
+import static 
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_END;
+import static 
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTED_TEXT;
+import static 
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTION_CONTEXT;
+import static 
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_START;
+
 import java.io.File;
+import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
-import java.net.URL;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Collections;
@@ -29,14 +36,13 @@ import java.util.HashMap;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
-import java.util.zip.GZIPInputStream;
 
-import opennlp.maxent.GISModel;
-import opennlp.maxent.io.BinaryGISModelReader;
 import opennlp.tools.namefind.NameFinderME;
+import opennlp.tools.namefind.TokenNameFinderModel;
 import opennlp.tools.sentdetect.SentenceDetectorME;
-import opennlp.tools.tokenize.SimpleTokenizer;
+import opennlp.tools.sentdetect.SentenceModel;
 import opennlp.tools.tokenize.Tokenizer;
+import opennlp.tools.tokenize.WhitespaceTokenizer;
 import opennlp.tools.util.Span;
 
 import org.apache.clerezza.rdf.core.LiteralFactory;
@@ -49,9 +55,7 @@ import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.felix.scr.annotations.Component;
 import org.apache.felix.scr.annotations.Property;
-import org.apache.felix.scr.annotations.Reference;
 import org.apache.felix.scr.annotations.Service;
-import org.apache.stanbol.enhancer.engines.autotagging.AutotaggerProvider;
 import org.apache.stanbol.enhancer.servicesapi.ContentItem;
 import org.apache.stanbol.enhancer.servicesapi.EngineException;
 import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
@@ -62,18 +66,13 @@ import org.apache.stanbol.enhancer.servi
 import org.osgi.framework.BundleContext;
 import org.osgi.service.component.ComponentContext;
 
-
-import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.*;
-
 /**
- * Apache Stanbol Enhancer Named Entity Recognition enhancement engine based 
on 
- * opennlp's Maximum Entropy models and a DBpedia index for optionally 
matching 
- * them to well know DBpedia entities.
+ * Apache Stanbol Enhancer Named Entity Recognition enhancement engine based 
on opennlp's Maximum Entropy
+ * models and a DBpedia index for optionally matching them to well know 
DBpedia entities.
  */
 @Component(immediate = true, metatype = true)
 @Service
-public class NamedEntityExtractionEnhancementEngine implements
-        EnhancementEngine, ServiceProperties {
+public class NamedEntityExtractionEnhancementEngine implements 
EnhancementEngine, ServiceProperties {
 
     /**
      * The default value for the Execution of this Engine. Currently set to
@@ -88,42 +87,35 @@ public class NamedEntityExtractionEnhanc
 
     public static final Log log = 
LogFactory.getLog(NamedEntityExtractionEnhancementEngine.class);
 
-    protected GISModel sentenceModel;
+    protected SentenceModel sentenceModel;
 
-    protected GISModel personNameModel;
+    protected TokenNameFinderModel personNameModel;
 
-    protected GISModel locationNameModel;
+    protected TokenNameFinderModel locationNameModel;
 
-    protected GISModel organizationNameModel;
+    protected TokenNameFinderModel organizationNameModel;
 
-    protected Map<String, Object[]> entityTypes = new HashMap<String, 
Object[]>();
+    protected Map<String,Object[]> entityTypes = new 
HashMap<String,Object[]>();
 
     protected BundleContext bundleContext;
 
-    @Reference
-    protected AutotaggerProvider autotaggerProvider;
-
-    // @Activate
     @SuppressWarnings("unchecked")
     protected void activate(ComponentContext ce) throws IOException {
         bundleContext = ce.getBundleContext();
 
         String directoryPath = null;
         if (ce != null) {
-            Dictionary<String, String> properties = ce.getProperties();
+            Dictionary<String,String> properties = ce.getProperties();
             directoryPath = properties.get(MODELS_PATH);
         }
-        sentenceModel = loadModel(directoryPath,
-                "english/sentdetect/EnglishSD.bin.gz");
+        sentenceModel = new SentenceModel(lookupModelStream(directoryPath, 
"en-sent.bin"));
 
-        personNameModel = buildNameModel(directoryPath, "person",
-                OntologicalClasses.DBPEDIA_PERSON);
+        personNameModel = buildNameModel(directoryPath, "person", 
OntologicalClasses.DBPEDIA_PERSON);
 
-        locationNameModel = buildNameModel(directoryPath, "location",
-                OntologicalClasses.DBPEDIA_PLACE);
+        locationNameModel = buildNameModel(directoryPath, "location", 
OntologicalClasses.DBPEDIA_PLACE);
 
         organizationNameModel = buildNameModel(directoryPath, "organization",
-                OntologicalClasses.DBPEDIA_ORGANISATION);
+            OntologicalClasses.DBPEDIA_ORGANISATION);
     }
 
     // @Deactivate
@@ -134,47 +126,30 @@ public class NamedEntityExtractionEnhanc
         organizationNameModel = null;
     }
 
-    protected GISModel loadModel(String directoryPath, String 
modelRelativePath)
-            throws IOException {
+    protected InputStream lookupModelStream(String directoryPath, String 
modelRelativePath) throws IOException {
 
         ClassLoader loader = this.getClass().getClassLoader();
         if (directoryPath != null && directoryPath.length() > 0) {
             // load custom models from the provided FS directory
-            File modelData = new File(new File(directoryPath),
-                    modelRelativePath);
-            return new BinaryGISModelReader(modelData).getModel();
+            File modelData = new File(new File(directoryPath), 
modelRelativePath);
+            return new FileInputStream(modelData);
         } else {
-            // load default OpenNLP models from jars
-            String resourcePath = "opennlp/" + modelRelativePath;
-            InputStream in = null;
-            if (autotaggerProvider != null) {
-                // Lookup the OSGI bundle of the autotagger that embeds the
-                // default opennlp models data: this is hackish, the
-                // iks-autotagging project should be refactored to do all of
-                // this by it-self
-                URL entry = 
autotaggerProvider.getBundleContext().getBundle().getEntry(
-                        resourcePath);
-                in = entry != null ? entry.openStream() : null;
-            } else {
-                // regular classloading for the tests
-                in = loader.getResourceAsStream(resourcePath);
-            }
+            // load default OpenNLP models from classpath (embedded in the 
defaultdata bundle)
+            String resourcePath = "org/apache/stanbol/defaultdata/opennlp/" + 
modelRelativePath;
+            InputStream in = loader.getResourceAsStream(resourcePath);
             if (in == null) {
-                throw new IOException("coult not find resource: "
-                        + resourcePath);
+                throw new IOException("Coult not find resource from the 
classpath: " + resourcePath);
             }
-            return new BinaryGISModelReader(new DataInputStream(
-                    new GZIPInputStream(in))).getModel();
+            return in;
         }
     }
 
-    protected GISModel buildNameModel(String directoryPath, String name,
-            UriRef typeUri) throws IOException {
-        String modelRelativePath = String.format("english/namefind/%s.bin.gz",
-                name);
-        GISModel model = loadModel(directoryPath, modelRelativePath);
+    protected TokenNameFinderModel buildNameModel(String directoryPath, String 
name, UriRef typeUri) throws IOException {
+        String modelRelativePath = String.format("en-ner-%s.bin", name);
+        TokenNameFinderModel model = new 
TokenNameFinderModel(lookupModelStream(directoryPath,
+            modelRelativePath));
         // register the name finder instances for matching owl class
-        entityTypes.put(name, new Object[] { typeUri, model });
+        entityTypes.put(name, new Object[] {typeUri, model});
         return model;
     }
 
@@ -194,11 +169,11 @@ public class NamedEntityExtractionEnhanc
         }
 
         try {
-            for (Map.Entry<String, Object[]> type : entityTypes.entrySet()) {
+            for (Map.Entry<String,Object[]> type : entityTypes.entrySet()) {
                 String typeLabel = type.getKey();
                 Object[] typeInfo = type.getValue();
                 UriRef typeUri = (UriRef) typeInfo[0];
-                GISModel nameFinderModel = (GISModel) typeInfo[1];
+                TokenNameFinderModel nameFinderModel = (TokenNameFinderModel) 
typeInfo[1];
                 findNamedEntities(ci, text, typeUri, typeLabel, 
nameFinderModel);
             }
         } catch (Exception e) { // TODO: makes it sense to catch Exception 
here?
@@ -206,26 +181,25 @@ public class NamedEntityExtractionEnhanc
         }
     }
 
-    protected void findNamedEntities(final ContentItem ci, final String text,
-            final UriRef typeUri, final String typeLabel,
-            final GISModel nameFinderModel) {
+    protected void findNamedEntities(final ContentItem ci,
+                                     final String text,
+                                     final UriRef typeUri,
+                                     final String typeLabel,
+                                     final TokenNameFinderModel 
nameFinderModel) {
 
         if (ci == null) {
-            throw new IllegalArgumentException(
-                    "Parsed ContentItem MUST NOT be NULL");
+            throw new IllegalArgumentException("Parsed ContentItem MUST NOT be 
NULL");
         }
         if (text == null) {
-            log.warn("NULL was parsed as text for content item " + ci.getId()
-                    + "! -> call ignored");
+            log.warn("NULL was parsed as text for content item " + ci.getId() 
+ "! -> call ignored");
             return;
         }
         LiteralFactory literalFactory = LiteralFactory.getInstance();
         MGraph g = ci.getMetadata();
-        Map<String, List<NameOccurrence>> entityNames = extractNameOccurrences(
-                nameFinderModel, text);
+        Map<String,List<NameOccurrence>> entityNames = 
extractNameOccurrences(nameFinderModel, text);
 
-        Map<String, UriRef> previousAnnotations = new LinkedHashMap<String, 
UriRef>();
-        for (Map.Entry<String, List<NameOccurrence>> nameInContext : 
entityNames.entrySet()) {
+        Map<String,UriRef> previousAnnotations = new 
LinkedHashMap<String,UriRef>();
+        for (Map.Entry<String,List<NameOccurrence>> nameInContext : 
entityNames.entrySet()) {
 
             String name = nameInContext.getKey();
             List<NameOccurrence> occurrences = nameInContext.getValue();
@@ -233,25 +207,19 @@ public class NamedEntityExtractionEnhanc
             UriRef firstOccurrenceAnnotation = null;
 
             for (NameOccurrence occurrence : occurrences) {
-                UriRef textAnnotation = 
EnhancementEngineHelper.createTextEnhancement(
-                        ci, this);
-                g.add(new TripleImpl(textAnnotation,
-                        ENHANCER_SELECTED_TEXT,
-                        literalFactory.createTypedLiteral(name)));
-                g.add(new TripleImpl(textAnnotation,
-                        ENHANCER_SELECTION_CONTEXT,
-                        
literalFactory.createTypedLiteral(occurrence.context)));
-                g.add(new TripleImpl(textAnnotation, DC_TYPE,
-                        typeUri));
-                g.add(new TripleImpl(
-                        textAnnotation,
-                        ENHANCER_CONFIDENCE,
-                        
literalFactory.createTypedLiteral(occurrence.confidence)));
+                UriRef textAnnotation = 
EnhancementEngineHelper.createTextEnhancement(ci, this);
+                g.add(new TripleImpl(textAnnotation, ENHANCER_SELECTED_TEXT, 
literalFactory
+                        .createTypedLiteral(name)));
+                g.add(new TripleImpl(textAnnotation, 
ENHANCER_SELECTION_CONTEXT, literalFactory
+                        .createTypedLiteral(occurrence.context)));
+                g.add(new TripleImpl(textAnnotation, DC_TYPE, typeUri));
+                g.add(new TripleImpl(textAnnotation, ENHANCER_CONFIDENCE, 
literalFactory
+                        .createTypedLiteral(occurrence.confidence)));
                 if (occurrence.start != null && occurrence.end != null) {
-                    g.add(new TripleImpl(textAnnotation, ENHANCER_START,
-                            
literalFactory.createTypedLiteral(occurrence.start)));
-                    g.add(new TripleImpl(textAnnotation, ENHANCER_END,
-                            
literalFactory.createTypedLiteral(occurrence.end)));
+                    g.add(new TripleImpl(textAnnotation, ENHANCER_START, 
literalFactory
+                            .createTypedLiteral(occurrence.start)));
+                    g.add(new TripleImpl(textAnnotation, ENHANCER_END, 
literalFactory
+                            .createTypedLiteral(occurrence.end)));
                 }
 
                 // add the subsumption relationship among occurrences of the 
same
@@ -259,14 +227,12 @@ public class NamedEntityExtractionEnhanc
                 if (firstOccurrenceAnnotation == null) {
                     // check already extracted annotations to find a first most
                     // specific occurrence
-                    for (Map.Entry<String, UriRef> entry : 
previousAnnotations.entrySet()) {
+                    for (Map.Entry<String,UriRef> entry : 
previousAnnotations.entrySet()) {
                         if (entry.getKey().contains(name)) {
                             // we have found a most specific previous
                             // occurrence, use it as subsumption target
                             firstOccurrenceAnnotation = entry.getValue();
-                            g.add(new TripleImpl(textAnnotation,
-                                    DC_RELATION,
-                                    firstOccurrenceAnnotation));
+                            g.add(new TripleImpl(textAnnotation, DC_RELATION, 
firstOccurrenceAnnotation));
                             break;
                         }
                     }
@@ -279,8 +245,7 @@ public class NamedEntityExtractionEnhanc
                 } else {
                     // I am referring to a most specific first occurrence of 
the
                     // same name
-                    g.add(new TripleImpl(textAnnotation,
-                            DC_RELATION, firstOccurrenceAnnotation));
+                    g.add(new TripleImpl(textAnnotation, DC_RELATION, 
firstOccurrenceAnnotation));
                 }
             }
         }
@@ -298,77 +263,52 @@ public class NamedEntityExtractionEnhanc
         return extractNames(organizationNameModel, text);
     }
 
-    public Map<String, List<NameOccurrence>> extractPersonNameOccurrences(
-            String text) {
+    public Map<String,List<NameOccurrence>> 
extractPersonNameOccurrences(String text) {
         return extractNameOccurrences(personNameModel, text);
     }
 
-    public Map<String, List<NameOccurrence>> extractLocationNameOccurrences(
-            String text) {
+    public Map<String,List<NameOccurrence>> 
extractLocationNameOccurrences(String text) {
         return extractNameOccurrences(locationNameModel, text);
     }
 
-    public Map<String, List<NameOccurrence>> 
extractOrganizationNameOccurrences(
-            String text) {
+    public Map<String,List<NameOccurrence>> 
extractOrganizationNameOccurrences(String text) {
         return extractNameOccurrences(organizationNameModel, text);
     }
 
-    protected Collection<String> extractNames(GISModel nameFinderModel,
-            String text) {
+    protected Collection<String> extractNames(TokenNameFinderModel 
nameFinderModel, String text) {
         return extractNameOccurrences(nameFinderModel, text).keySet();
     }
 
-    protected Map<String, List<NameOccurrence>> extractNameOccurrences(
-            GISModel nameFinderModel, String text) {
+    protected Map<String,List<NameOccurrence>> 
extractNameOccurrences(TokenNameFinderModel nameFinderModel,
+                                                                      String 
text) {
 
         // version with explicit sentence endings to reflect heading / 
paragraph
         // structure of an HTML or PDF document converted to text
         String textWithDots = text.replaceAll("\\n\\n", ".\n");
 
-        SentenceDetectorME sentenceDetector = new SentenceDetectorME(
-                sentenceModel);
+        SentenceDetectorME sentenceDetector = new 
SentenceDetectorME(sentenceModel);
 
-        int[] sentenceEndings = sentenceDetector.sentPosDetect(textWithDots);
-        int[] sentencePositions = new int[sentenceEndings.length + 1];
-        sentencePositions[0] = 0;
-        System.arraycopy(sentenceEndings, 0, sentencePositions, 1,
-                sentenceEndings.length);
-        String[] sentences;
-        if(sentenceEndings.length<1){
-            //STANBOL-60: if no sentence is detected treat the whole text as 
-            //one sentence.
-            log.debug("No sentence detected -> use whole text as one element");
-            sentences = new String[] {text};
-        } else {
-            sentences = new String[sentenceEndings.length];
-            for (int i = 0; i < sentences.length; i++) {
-                log.debug(String.format("Sentence %d from char %d to %d", i,
-                        sentencePositions[i], sentencePositions[i + 1]));
-                sentences[i] = text.substring(sentencePositions[i],
-                        sentencePositions[i + 1]);
-                log.debug("Sentence: " + sentences[i]);
-            }
-        }
+        Span[] sentenceSpans = sentenceDetector.sentPosDetect(textWithDots);
 
         NameFinderME finder = new NameFinderME(nameFinderModel);
 
-        Map<String, List<NameOccurrence>> nameOccurrences = new 
LinkedHashMap<String, List<NameOccurrence>>();
-        Tokenizer tokenizer = new SimpleTokenizer();
-        for (int i = 0; i < sentences.length; i++) {
-            String sentence = sentences[i];
+        Map<String,List<NameOccurrence>> nameOccurrences = new 
LinkedHashMap<String,List<NameOccurrence>>();
+        Tokenizer tokenizer = WhitespaceTokenizer.INSTANCE;
+        for (int i = 0; i < sentenceSpans.length; i++) {
+            String sentence = 
sentenceSpans[i].getCoveredText(text).toString().trim();
 
             // build a context by concatenating three sentences to be used for
             // similarity ranking / disambiguation + contextual snippet in the
             // extraction structure
             List<String> contextElements = new ArrayList<String>();
-            if (i - 1 > 0) {
-                String previousSentence = sentences[i - 1];
-                contextElements.add(previousSentence.trim());
+            if (i > 0) {
+                CharSequence previousSentence = sentenceSpans[i - 
1].getCoveredText(text);
+                contextElements.add(previousSentence.toString().trim());
             }
-            contextElements.add(sentence.trim());
-            if (i + 1 < sentences.length) {
-                String nextSentence = sentences[i + 1];
-                contextElements.add(nextSentence.trim());
+            contextElements.add(sentence.toString().trim());
+            if (i + 1 < sentenceSpans.length) {
+                CharSequence nextSentence = sentenceSpans[i + 
1].getCoveredText(text);
+                contextElements.add(nextSentence.toString().trim());
             }
             String context = StringUtils.join(contextElements, " ");
 
@@ -390,19 +330,17 @@ public class NamedEntityExtractionEnhanc
                 Integer absoluteEnd = null;
                 if (start != -1) {
                     /*
-                     * NOTE (rw, issue 19, 20100615) Here we need to set the 
new
-                     * start position, by adding the current start to the
-                     * lastStartPosion. we need also to use the
-                     * lastStartPosition to calculate the start of the element.
-                     * The old code had not worked if names contains more than 
a
-                     * single element!
+                     * NOTE (rw, issue 19, 20100615) Here we need to set the 
new start position, by adding the
+                     * current start to the lastStartPosion. we need also to 
use the lastStartPosition to
+                     * calculate the start of the element. The old code had 
not worked if names contains more
+                     * than a single element!
                      */
                     lastStartPosition += start;
-                    absoluteStart = sentencePositions[i] + lastStartPosition;
+                    absoluteStart = sentenceSpans[i].getStart() + 
lastStartPosition;
                     absoluteEnd = absoluteStart + name.length();
                 }
-                NameOccurrence occurrence = new NameOccurrence(name,
-                        absoluteStart, absoluteEnd, context, confidence);
+                NameOccurrence occurrence = new NameOccurrence(name, 
absoluteStart, absoluteEnd, context,
+                        confidence);
 
                 List<NameOccurrence> occurrences = nameOccurrences.get(name);
                 if (occurrences == null) {
@@ -416,16 +354,15 @@ public class NamedEntityExtractionEnhanc
 
         if (log.isDebugEnabled()) {
             for (List<NameOccurrence> occurrences : nameOccurrences.values()) {
-                log.debug("Occurrences found: "
-                        + StringUtils.join(occurrences, ", "));
+                log.debug("Occurrences found: " + 
StringUtils.join(occurrences, ", "));
             }
         }
         return nameOccurrences;
     }
 
     public int canEnhance(ContentItem ci) {
-        //in case text/pain;charSet=UTF8 is parsed
-        String mimeType = ci.getMimeType().split(";",2)[0];
+        // in case text/pain;charSet=UTF8 is parsed
+        String mimeType = ci.getMimeType().split(";", 2)[0];
         if (TEXT_PLAIN_MIMETYPE.equalsIgnoreCase(mimeType)) {
             return ENHANCE_SYNCHRONOUS;
         }
@@ -433,10 +370,9 @@ public class NamedEntityExtractionEnhanc
     }
 
     @Override
-    public Map<String, Object> getServiceProperties() {
-        return Collections.unmodifiableMap(Collections.singletonMap(
-                ENHANCEMENT_ENGINE_ORDERING,
-                (Object) defaultOrder));
+    public Map<String,Object> getServiceProperties() {
+        return 
Collections.unmodifiableMap(Collections.singletonMap(ENHANCEMENT_ENGINE_ORDERING,
+            (Object) defaultOrder));
     }
 
 }

Modified: 
incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java?rev=1073406&r1=1073405&r2=1073406&view=diff
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java
 (original)
+++ 
incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java
 Tue Feb 22 16:58:15 2011
@@ -34,7 +34,6 @@ import org.apache.clerezza.rdf.core.Trip
 import org.apache.clerezza.rdf.core.TypedLiteral;
 import org.apache.clerezza.rdf.core.UriRef;
 import org.apache.clerezza.rdf.core.impl.SimpleMGraph;
-import 
org.apache.stanbol.enhancer.engines.autotagging.impl.ConfiguredAutotaggerProvider;
 import org.apache.stanbol.enhancer.engines.opennlp.impl.NameOccurrence;
 import 
org.apache.stanbol.enhancer.engines.opennlp.impl.NamedEntityExtractionEnhancementEngine;
 import org.apache.stanbol.enhancer.servicesapi.ContentItem;
@@ -50,7 +49,7 @@ import static org.apache.stanbol.enhance
 
 public class TestNamedEntityExtractionEnhancementEngine extends Assert {
 
-    public static final String SINGLE_SENTENCE = "Dr. Patrick Marshall (1869 - 
November 1950) was a"
+    public static final String SINGLE_SENTENCE = "Dr Patrick Marshall (1869 - 
November 1950) was a"
             + " geologist who lived in New Zealand and worked at the 
University of Otago.";
 
     public static final String MULTI_SENTENCES = "The life of Patrick 
Marshall\n\n"
@@ -65,7 +64,6 @@ public class TestNamedEntityExtractionEn
     @BeforeClass
     public static void setUpServices() throws IOException {
         Dictionary<String, Object> properties = new Hashtable<String, 
Object>();
-        properties.put(ConfiguredAutotaggerProvider.LUCENE_INDEX_PATH, "");
         MockComponentContext context = new MockComponentContext(properties);
         nerEngine.activate(context);
     }
@@ -119,13 +117,13 @@ public class TestNamedEntityExtractionEn
         assertEquals("Patrick Marshall", firstOccurrence.name);
         assertEquals(12, firstOccurrence.start.intValue());
         assertEquals(28, firstOccurrence.end.intValue());
-        assertEquals(0.98, firstOccurrence.confidence, 0.005);
+        assertEquals(0.998, firstOccurrence.confidence, 0.005);
 
         NameOccurrence secondOccurrence = pmOccurrences.get(1);
         assertEquals("Patrick Marshall", secondOccurrence.name);
         assertEquals(33, secondOccurrence.start.intValue());
         assertEquals(49, secondOccurrence.end.intValue());
-        assertEquals(0.97, secondOccurrence.confidence, 0.005);
+        assertEquals(0.85, secondOccurrence.confidence, 0.005);
     }
 
     @Test

Modified: 
incubator/stanbol/trunk/enhancer/launchers/full/src/main/bundles/list.xml
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/launchers/full/src/main/bundles/list.xml?rev=1073406&r1=1073405&r2=1073406&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/launchers/full/src/main/bundles/list.xml 
(original)
+++ incubator/stanbol/trunk/enhancer/launchers/full/src/main/bundles/list.xml 
Tue Feb 22 16:58:15 2011
@@ -318,6 +318,11 @@
                </bundle>
                <bundle>
                        <groupId>org.apache.stanbol</groupId>
+                       <artifactId>org.apache.stanbol.defaultdata</artifactId>
+                       <version>0.0.1</version>
+               </bundle>
+               <bundle>
+                       <groupId>org.apache.stanbol</groupId>
                        <artifactId>org.apache.stanbol.enhancer.engines.opennlp.ner</artifactId>
                        <version>0.9-SNAPSHOT</version>
                </bundle>

Modified: incubator/stanbol/trunk/enhancer/launchers/lite/src/main/bundles/list.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/launchers/lite/src/main/bundles/list.xml?rev=1073406&r1=1073405&r2=1073406&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/launchers/lite/src/main/bundles/list.xml (original)
+++ incubator/stanbol/trunk/enhancer/launchers/lite/src/main/bundles/list.xml Tue Feb 22 16:58:15 2011
@@ -297,6 +297,11 @@
                </bundle>
                <bundle>
                        <groupId>org.apache.stanbol</groupId>
+                       <artifactId>org.apache.stanbol.defaultdata</artifactId>
+                       <version>0.0.1</version>
+               </bundle>
+               <bundle>
+                       <groupId>org.apache.stanbol</groupId>
                        <artifactId>org.apache.stanbol.enhancer.engines.opennlp.ner</artifactId>
                        <version>0.9-SNAPSHOT</version>
                </bundle>

Modified: incubator/stanbol/trunk/enhancer/parent/pom.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/parent/pom.xml?rev=1073406&r1=1073405&r2=1073406&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/parent/pom.xml (original)
+++ incubator/stanbol/trunk/enhancer/parent/pom.xml Tue Feb 22 16:58:15 2011
@@ -119,7 +119,7 @@
                                                <instructions>
                                                        <Bundle-Category>Stanbol Enhancer</Bundle-Category>
                                                        <Bundle-DocURL>http://incubator.apache.org/stanbol</Bundle-DocURL>
-                                                       <Bundle-Vendor>Apache Stanbol (incubation)</Bundle-Vendor>
+                                                       <Bundle-Vendor>Apache Stanbol (Incubating)</Bundle-Vendor>
                                                        <Bundle-SymbolicName>${project.artifactId}</Bundle-SymbolicName>
                                                        <_versionpolicy>$${version;===;${@}}</_versionpolicy>
                                                </instructions>
@@ -232,7 +232,13 @@
 
        <dependencyManagement>
                <dependencies>
-                       <!-- FISE Deps -->
+                       <!-- Stanbol Deps -->
+                       <dependency>
+                               <groupId>org.apache.stanbol</groupId>
+                               <artifactId>org.apache.stanbol.defaultdata</artifactId>
+                               <version>0.0.1</version>
+                               <scope>provided</scope>
+                       </dependency>
                        <dependency>
                                <groupId>org.apache.stanbol</groupId>
                                <artifactId>org.apache.stanbol.enhancer.servicesapi</artifactId>
@@ -633,9 +639,15 @@
 
                        <!-- OpenNLP -->
                        <dependency>
-                               <groupId>org.clojars.pjt</groupId>
+                               <groupId>org.clojars.zaxtax</groupId>
                                <artifactId>opennlp-tools</artifactId>
-                               <version>1.4.3</version>
+                               <version>1.5.0</version>
+                               <scope>provided</scope>
+                       </dependency>
+                       <dependency>
+                               <groupId>org.clojars.zaxtax</groupId>
+                               <artifactId>maxent</artifactId>
+                               <version>3.0.0</version>
                                <scope>provided</scope>
                        </dependency>

svn commit: r1073406 - in /incubator/stanbol/trunk/enhancer: engines/opennlp-ner/ engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/ engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/ la...

Reply via email to