Author: rwesten
Date: Tue May 22 12:17:12 2012
New Revision: 1341438

URL: http://svn.apache.org/viewvc?rev=1341438&view=rev
Log:
* implementation of STANBOL-624 and STANBOL-625 for the Keywordextraction and 
NamedEntityLinking engine

Added:
    
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/Suggestion.java
    
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/test/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/DbPediaDataFileProvider.java
   (contents, props changed)
      - copied, changed from r1340995, 
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/test/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/TestDataFileProvider.java
Removed:
    
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/test/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/TestDataFileProvider.java
Modified:
    incubator/stanbol/trunk/enhancer/engines/entitytagging/pom.xml
    
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/EnhancementRDFUtils.java
    
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/NamedEntityTaggingEngine.java
    
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/test/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/MockEntityhub.java
    
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/test/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/TestEntityLinkingEnhancementEngine.java
    
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/test/resources/META-INF/services/org.apache.stanbol.commons.stanboltools.datafileprovider.DataFileProvider
    
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java

Modified: incubator/stanbol/trunk/enhancer/engines/entitytagging/pom.xml
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/entitytagging/pom.xml?rev=1341438&r1=1341437&r2=1341438&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/entitytagging/pom.xml (original)
+++ incubator/stanbol/trunk/enhancer/engines/entitytagging/pom.xml Tue May 22 
12:17:12 2012
@@ -93,7 +93,7 @@
     <dependency>
       <groupId>org.apache.stanbol</groupId>
       <artifactId>org.apache.stanbol.entityhub.servicesapi</artifactId>
-      <version>0.9.0-incubating</version>
+      <version>0.10.0-incubating-SNAPSHOT</version>
       <scope>compile</scope>
     </dependency>
     <dependency>
@@ -107,6 +107,10 @@
       <groupId>commons-io</groupId>
       <artifactId>commons-io</artifactId>
     </dependency>
+    <dependency>
+      <groupId>commons-lang</groupId>
+      <artifactId>commons-lang</artifactId>
+    </dependency>
 
     <dependency>
       <groupId>org.apache.felix</groupId>
@@ -137,13 +141,13 @@
      <dependency>
       <groupId>org.apache.stanbol</groupId>
       <artifactId>org.apache.stanbol.entityhub.core</artifactId>
-      <version>0.9.0-incubating</version>
+      <version>0.10.0-incubating-SNAPSHOT</version>
       <scope>test</scope>
     </dependency>
      <dependency>
       <groupId>org.apache.stanbol</groupId>
       <artifactId>org.apache.stanbol.entityhub.yard.solr</artifactId>
-      <version>0.9.0-incubating</version>
+      <version>0.10.0-incubating-SNAPSHOT</version>
       <scope>test</scope>
     </dependency>
      <dependency>

Modified: 
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/EnhancementRDFUtils.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/EnhancementRDFUtils.java?rev=1341438&r1=1341437&r2=1341438&view=diff
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/EnhancementRDFUtils.java
 (original)
+++ 
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/EnhancementRDFUtils.java
 Tue May 22 12:17:12 2012
@@ -21,7 +21,6 @@ import static org.apache.stanbol.enhance
 import static 
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_LABEL;
 import static 
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_REFERENCE;
 import static 
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_TYPE;
-import static 
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDFS_LABEL;
 import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE;
 
 import java.util.Collection;
@@ -60,37 +59,39 @@ public class EnhancementRDFUtils {
      *            the contentItemId the enhancement is extracted from
      * @param relatedEnhancements
      *            enhancements this textAnnotation is related to
-     * @param entity
-     *            the related entity
+     * @param suggestion
+     *            the entity suggestion
      * @param nameField the field used to extract the name
-     * @param lang the preferred language to include
+     * @param lang the preferred language to include or <code>null</code> if 
none
      */
     public static UriRef writeEntityAnnotation(EnhancementEngine engine,
                                                LiteralFactory literalFactory,
                                                MGraph graph,
                                                UriRef contentItemId,
                                                Collection<NonLiteral> 
relatedEnhancements,
-                                               Representation rep,
+                                               Suggestion suggestion,
                                                String nameField, 
                                                String lang) {
-        // 1. check if the returned Entity does has a label -> if not return 
null
-        // add labels (set only a single label. Use "en" if available!
-        Text label = null;
-        Iterator<Text> labels = rep.getText(nameField);
-        while (labels.hasNext()) {
-            Text actLabel = labels.next();
-            if (label == null) {
-                label = actLabel;
-            } else {
-                //use startWith to match also en-GB and en-US ...
-                if (actLabel.getLanguage() != null && 
actLabel.getLanguage().startsWith(lang)) {
+        Representation rep = suggestion.getEntity().getRepresentation();
+        // 1. extract the "best label"
+        //Start with the matched one
+        Text label = suggestion.getMatchedLabel();
+        //if the matched label is not in the requested language
+        boolean langMatch = (lang == null && label.getLanguage() == null) ||
+                (label.getLanguage() != null && 
label.getLanguage().startsWith(lang));
+            //search if a better label is available for this Entity
+        if(!langMatch){
+            Iterator<Text> labels = rep.getText(nameField);
+            while (labels.hasNext() && !langMatch) {
+                Text actLabel = labels.next();
+                langMatch = (lang == null && actLabel.getLanguage() == null) ||
+                        (actLabel.getLanguage() != null && 
actLabel.getLanguage().startsWith(lang));
+                if(langMatch){ //if the language matches ->
+                    //override the matched label
                     label = actLabel;
                 }
             }
-        }
-        if (label == null) {
-            return null;
-        }
+        } //else the matched label will be the best to use
         Literal literal;
         if (label.getLanguage() == null) {
             literal = new PlainLiteralImpl(label.getText());
@@ -109,31 +110,23 @@ public class EnhancementRDFUtils {
         graph.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_REFERENCE, 
entityUri));
         // add the label parsed above
         graph.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_LABEL, 
literal));
-        // TODO: add real confidence values!
-        // -> in case of SolrYards this will be a Lucene score and not within 
the range [0..1]
-        // -> in case of SPARQL there will be no score information at all.
-        Object score = rep.getFirst(RdfResourceEnum.resultScore.getUri());
-        Double scoreValue = new Double(-1); // use -1 if no score is available!
-        if (score != null) {
-            try {
-                scoreValue = Double.valueOf(score.toString());
-            } catch (NumberFormatException e) {
-                // ignore
-            }
+        if (suggestion.getScore() != null) {
+            graph.add(new TripleImpl(entityAnnotation, ENHANCER_CONFIDENCE, 
literalFactory
+                .createTypedLiteral(suggestion.getScore())));
         }
-        graph.add(new TripleImpl(entityAnnotation, ENHANCER_CONFIDENCE, 
literalFactory
-                .createTypedLiteral(scoreValue)));
 
         Iterator<Reference> types = 
rep.getReferences(RDF_TYPE.getUnicodeString());
         while (types.hasNext()) {
             graph.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_TYPE, 
new UriRef(types.next()
                     .getReference())));
         }
-        // TODO: for now add the information about this entity to the graph
-        // -> this might be replaced by some additional engine at the end
-        // RdfValueFactory rdfValueFactory = RdfValueFactory.getInstance();
-        // RdfRepresentation representation = 
rdfValueFactory.toRdfRepresentation(entity.getRepresentation());
-        // graph.addAll(representation.getRdfGraph());
+        //add the name of the ReferencedSite that manages the Entity
+        if(suggestion.getEntity().getSite() != null){
+            graph.add(new TripleImpl(entityAnnotation, 
+                new UriRef(RdfResourceEnum.site.getUri()), 
+                new PlainLiteralImpl(suggestion.getEntity().getSite())));
+        }
+        
         return entityAnnotation;
     }
 

Modified: 
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/NamedEntityTaggingEngine.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/NamedEntityTaggingEngine.java?rev=1341438&r1=1341437&r2=1341438&view=diff
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/NamedEntityTaggingEngine.java
 (original)
+++ 
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/NamedEntityTaggingEngine.java
 Tue May 22 12:17:12 2012
@@ -16,6 +16,7 @@
  */
 package org.apache.stanbol.enhancer.engines.entitytagging.impl;
 
+import static org.apache.commons.lang.StringUtils.getLevenshteinDistance;
 import static 
org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses.DBPEDIA_ORGANISATION;
 import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE;
 
@@ -33,6 +34,7 @@ import org.apache.clerezza.rdf.core.MGra
 import org.apache.clerezza.rdf.core.NonLiteral;
 import org.apache.clerezza.rdf.core.Triple;
 import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.commons.lang.StringUtils;
 import org.apache.felix.scr.annotations.Activate;
 import org.apache.felix.scr.annotations.Component;
 import org.apache.felix.scr.annotations.ConfigurationPolicy;
@@ -353,10 +355,10 @@ public class NamedEntityTaggingEngine 
             ci.getLock().readLock().unlock();
         }
         //search the suggestions
-        Map<NamedEntity,List<Entity>> suggestions = new 
HashMap<NamedEntity,List<Entity>>(textAnnotations.size());
+        Map<NamedEntity,List<Suggestion>> suggestions = new 
HashMap<NamedEntity,List<Suggestion>>(textAnnotations.size());
         for (Entry<NamedEntity,List<UriRef>> entry : 
textAnnotations.entrySet()) {
             try {
-                List<Entity> entitySuggestions = computeEntityRecommentations(
+                List<Suggestion> entitySuggestions = 
computeEntityRecommentations(
                     site, entry.getKey(),entry.getValue(),contentLangauge);
                 if(entitySuggestions != null && !entitySuggestions.isEmpty()){
                     suggestions.put(entry.getKey(), entitySuggestions);
@@ -370,19 +372,19 @@ public class NamedEntityTaggingEngine 
         try {
             RdfValueFactory factory = RdfValueFactory.getInstance();
             Map<String, Representation> entityData = new 
HashMap<String,Representation>();
-            for(Entry<NamedEntity,List<Entity>> entitySuggestions : 
suggestions.entrySet()){
+            for(Entry<NamedEntity,List<Suggestion>> entitySuggestions : 
suggestions.entrySet()){
                 List<UriRef> subsumed = 
textAnnotations.get(entitySuggestions.getKey());
                 List<NonLiteral> annotationsToRelate = new 
ArrayList<NonLiteral>(subsumed);
                 
annotationsToRelate.add(entitySuggestions.getKey().getEntity());
-                for(Entity suggestion : entitySuggestions.getValue()){
-                    log.debug("Add Suggestion {} for {}", suggestion.getId(), 
entitySuggestions.getKey());
+                for(Suggestion suggestion : entitySuggestions.getValue()){
+                    log.debug("Add Suggestion {} for {}", 
suggestion.getEntity().getId(), entitySuggestions.getKey());
                     EnhancementRDFUtils.writeEntityAnnotation(this, 
literalFactory, graph, ci.getUri(),
-                        annotationsToRelate, suggestion.getRepresentation(), 
nameField,
+                        annotationsToRelate, suggestion, nameField,
                         //TODO: maybe we want labels in a different language 
than the
                         //      language of the content (e.g. Accept-Language 
header)?!
                         contentLangauge == null ? DEFAULT_LANGUAGE : 
contentLangauge);
                     if (dereferenceEntities) {
-                        entityData.put(suggestion.getId(), 
suggestion.getRepresentation());
+                        entityData.put(suggestion.getEntity().getId(), 
suggestion.getEntity().getRepresentation());
                     }
                 }
             }
@@ -405,19 +407,19 @@ public class NamedEntityTaggingEngine 
      * @param contentItemId the id of the contentItem
      * @param textAnnotation the text annotation to enhance
      * @param subsumedAnnotations other text annotations for the same entity 
-     * @param language the language of the analyzed text or <code>null</code>
+     * @param language the language of the analysed text or <code>null</code>
      * if not available.
-     * @return the suggested {@link Entity entities}
+     * @return the suggestions for the parsed {@link NamedEntity}
      * @throws EntityhubException On any Error while looking up Entities via
      * the Entityhub
      */
-    protected final List<Entity> computeEntityRecommentations(ReferencedSite 
site,
+    protected final List<Suggestion> 
computeEntityRecommentations(ReferencedSite site,
             NamedEntity namedEntity,
             List<UriRef> subsumedAnnotations, String language) throws 
EntityhubException {
         // First get the required properties for the parsed textAnnotation
         // ... and check the values
 
-        log.debug("Process {}", namedEntity);
+        log.info("Process {}", namedEntity);
         FieldQuery query = site == null ? //if site is NULL use the Entityhub
                 entityhub.getQueryFactory().createFieldQuery() : 
                     site.getQueryFactory().createFieldQuery();
@@ -425,11 +427,12 @@ public class NamedEntityTaggingEngine 
         Constraint labelConstraint;
         //TODO: make case sensitivity configurable
         boolean casesensitive = false;
+        String namedEntityLabel = casesensitive ? namedEntity.getName() : 
namedEntity.getName().toLowerCase();
         if(language != null){
             //search labels in the language and without language
-            labelConstraint = new 
TextConstraint(namedEntity.getName(),casesensitive,language,null);
+            labelConstraint = new 
TextConstraint(namedEntityLabel,casesensitive,language,null);
         } else {
-            labelConstraint = new 
TextConstraint(namedEntity.getName(),casesensitive);
+            labelConstraint = new 
TextConstraint(namedEntityLabel,casesensitive);
         }
         query.setConstraint(nameField, labelConstraint);
         if (OntologicalClasses.DBPEDIA_PERSON.equals(namedEntity.getType())) {
@@ -467,55 +470,68 @@ public class NamedEntityTaggingEngine 
         QueryResultList<Entity> results = site == null? //if site is NULL
                 entityhub.findEntities(query) : //use the Entityhub
                     site.findEntities(query); //else the referenced site
-        log.debug("{} results returned by query {}", results.size(), query);
-
+        log.info(" - {} results returned by query {}", results.size(), 
results.getQuery());
+        if(results.isEmpty()){ //no results nothing to do
+            return Collections.emptyList();
+        }
+        //we need to normalise the confidence values from [0..1]
+        // * levenshtein distance as absolute (1.0 for exact match)
+        // * Solr scores * levenshtein to rank entities relative to each other
         Float maxScore = null;
-        int exactCount = 0;
-        List<Entity> matches = new ArrayList<Entity>(numSuggestions);
-        for (Iterator<Entity> guesses = results.iterator();guesses.hasNext() 
&& exactCount<numSuggestions;) {
-            Entity guess = guesses.next();
-            Representation rep = guess.getRepresentation();
+        Float maxExactScore = null;
+        List<Suggestion> matches = new ArrayList<Suggestion>(numSuggestions);
+        //assumes entities are sorted by score
+        for (Iterator<Entity> guesses = results.iterator();guesses.hasNext();) 
{
+            Suggestion match = new Suggestion(guesses.next());
+            Representation rep = match.getEntity().getRepresentation();
+            Float score = 
rep.getFirst(RdfResourceEnum.resultScore.getUri(),Float.class);
             if(maxScore == null){
-                maxScore = 
rep.getFirst(RdfResourceEnum.resultScore.getUri(),Float.class);
+                maxScore = score;
             }
             Iterator<Text> labels = rep.getText(nameField);
-            boolean found = false;
-            while(labels.hasNext() && !found){
+            while(labels.hasNext() && match.getLevenshtein() < 1.0){
                 Text label = labels.next();
-                if(label.getLanguage() == null || (language != null && 
label.getLanguage().startsWith(language))){
-                    
if(label.getText().equalsIgnoreCase(namedEntity.getName())){
-                        found = true;
+                if(language == null || //if the content language is unknown -> 
accept all labels
+                        label.getLanguage() == null ||  //accept labels with 
no language
+                        //and labels in the same language as the content
+                        (language != null && 
label.getLanguage().startsWith(language))){
+                    double actMatch = levenshtein(
+                        casesensitive ? label.getText().toLowerCase() : 
label.getText(), 
+                                namedEntityLabel);
+                    if(actMatch > match.getLevenshtein()){
+                        match.setLevenshtein(actMatch);
+                        match.setMatchedLabel(label);
                     }
                 }
             }
-            if(found){
-                matches.add(exactCount,guess);
-                exactCount++;
-            } else if(matches.size()<numSuggestions){
-                matches.add(guess);
-            }
-        }
-        //now write the results
-        for(int i=0;i<matches.size();i++){
-            Representation rep = matches.get(i).getRepresentation();
-            if(i<exactCount){ //and boost the scores of the exact matches
-                if(maxScore == null){
-                    rep.set(RdfResourceEnum.resultScore.getUri(), 1.0f);
+            if(match.getMatchedLabel() != null){
+                if(match.getLevenshtein() == 1.0){
+                    if(maxExactScore == null){
+                        maxExactScore = score;
+                    }
+                    //normalise exact matches against the best exact score
+                    
match.setScore(score.doubleValue()/maxExactScore.doubleValue());
                 } else {
-                    Float score = 
rep.getFirst(RdfResourceEnum.resultScore.getUri(), Float.class);
-                    rep.set(RdfResourceEnum.resultScore.getUri(), 
-                        maxScore.doubleValue()+(score != 
null?score.doubleValue():0));
+                    //normalise partial matches against the best match and the
+                    //Levenshtein similarity with the label
+                    
match.setScore(score.doubleValue()*match.getLevenshtein()/maxScore.doubleValue());
                 }
+                matches.add(match);
+            } else {
+                log.info("No value of {} for Entity 
{}!",nameField,match.getEntity().getId());
             }
         }
-        return matches;
+        //now sort the results
+        Collections.sort(matches);
+        return matches.subList(0, Math.min(matches.size(),numSuggestions));
     }
 
+    /**
+     * This EnhancementEngine can enhance any ContentItem as it does consume
+     * existing TextAnnotations with the configured dc:type's
+     * @see 
org.apache.stanbol.enhancer.servicesapi.EnhancementEngine#canEnhance(org.apache.stanbol.enhancer.servicesapi.ContentItem)
+     */
     public int canEnhance(ContentItem ci) {
-        /*
-         * This engine consumes existing enhancements because of that it can 
enhance any type of ci! TODO: It
-         * would also be possible to check here if there is an TextAnnotation 
and use that as result!
-         */
         return ENHANCE_ASYNC; //Entity tagging now supports asyc processing
     }
 
@@ -524,5 +540,23 @@ public class NamedEntityTaggingEngine 
         return 
Collections.unmodifiableMap(Collections.singletonMap(ENHANCEMENT_ENGINE_ORDERING,
             (Object) defaultOrder));
     }
-
+    /**
+     * Compares two strings (after {@link StringUtils#trim(String) trimming})
+     * by using the Levenshtein's Edit Distance of the two
+     * strings. Does not return the {@link Integer} number of changes but
+     * <code>1-(changes/maxStringSizeAfterTrim)</code><p>
+     * @param s1 the first string
+     * @param s2 the second string
+     * @return the distance
+     * @throws IllegalArgumentException if any of the two parsed strings is 
NULL
+     */
+    private  static double levenshtein(String s1, String s2) {
+        if(s1 == null || s2 == null){
+            throw new IllegalArgumentException("NONE of the parsed String MUST 
BE NULL!");
+        }
+        s1 = StringUtils.trim(s1);
+        s2 = StringUtils.trim(s2);
+        return s1.isEmpty() || s2.isEmpty() ? 0 :
+            1.0 - (((double)getLevenshteinDistance(s1, s2)) / 
((double)(Math.max(s1.length(), s2.length()))));
+    }
 }

Added: 
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/Suggestion.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/Suggestion.java?rev=1341438&view=auto
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/Suggestion.java
 (added)
+++ 
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/Suggestion.java
 Tue May 22 12:17:12 2012
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.engines.entitytagging.impl;
+
+import org.apache.stanbol.entityhub.servicesapi.model.Entity;
+import org.apache.stanbol.entityhub.servicesapi.model.Text;
+
+/**
+ * A suggestion of an {@link Entity} for a fise:TextAnnotation processed
+ * by the NamedEntityTaggingEngine
+ * @author Rupert Westenthaler
+ */
+public class Suggestion implements Comparable<Suggestion>{
+    private final Entity entity;
+    private double levenshtein = -1;
+    private Double score;
+    private Text matchedLabel;
+
+    protected Suggestion(Entity entity){
+        this.entity = entity;
+    }
+    
+    
+    /**
+     * @return the levenshtein
+     */
+    public final double getLevenshtein() {
+        return levenshtein;
+    }
+
+
+    /**
+     * @param levenshtein the levenshtein to set
+     */
+    protected final void setLevenshtein(double levenshtein) {
+        this.levenshtein = levenshtein;
+    }
+
+
+    /**
+     * @return the score
+     */
+    public final Double getScore() {
+        return score;
+    }
+
+
+    /**
+     * @param score the score to set
+     */
+    protected final void setScore(Double score) {
+        this.score = score;
+    }
+
+
+    /**
+     * @return the matchedLabel
+     */
+    public final Text getMatchedLabel() {
+        return matchedLabel;
+    }
+
+
+    /**
+     * @param matchedLabel the matchedLabel to set
+     */
+    protected final void setMatchedLabel(Text matchedLabel) {
+        this.matchedLabel = matchedLabel;
+    }
+
+
+    /**
+     * @return the entity
+     */
+    public final Entity getEntity() {
+        return entity;
+    }
+
+
+    @Override
+    public int compareTo(Suggestion other) {
+        return other.score.compareTo(score);
+    }
+    
+    
+}
\ No newline at end of file

Copied: 
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/test/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/DbPediaDataFileProvider.java
 (from r1340995, 
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/test/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/TestDataFileProvider.java)
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/entitytagging/src/test/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/DbPediaDataFileProvider.java?p2=incubator/stanbol/trunk/enhancer/engines/entitytagging/src/test/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/DbPediaDataFileProvider.java&p1=incubator/stanbol/trunk/enhancer/engines/entitytagging/src/test/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/TestDataFileProvider.java&r1=1340995&r2=1341438&rev=1341438&view=diff
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/test/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/TestDataFileProvider.java
 (original)
+++ 
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/test/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/DbPediaDataFileProvider.java
 Tue May 22 12:17:12 2012
@@ -34,7 +34,7 @@ import org.apache.stanbol.commons.stanbo
  * @author Rupert Westenthaler
  *
  */
-public class TestDataFileProvider implements DataFileProvider {
+public class DbPediaDataFileProvider implements DataFileProvider {
 
     private static String DBPEDIA_PREFIX = 
"org/apache/stanbol/data/site/dbpedia/default/index/";
     
@@ -58,7 +58,7 @@ public class TestDataFileProvider implem
      * @return
      */
     private URL lookupResource(String resource) {
-        ClassLoader cl = TestDataFileProvider.class.getClassLoader();
+        ClassLoader cl = DbPediaDataFileProvider.class.getClassLoader();
         URL resourceUri = cl.getResource(resource);
         if(resourceUri == null){
             cl = Thread.currentThread().getContextClassLoader();

Propchange: 
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/test/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/DbPediaDataFileProvider.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Modified: 
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/test/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/MockEntityhub.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/entitytagging/src/test/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/MockEntityhub.java?rev=1341438&r1=1341437&r2=1341438&view=diff
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/test/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/MockEntityhub.java
 (original)
+++ 
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/test/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/MockEntityhub.java
 Tue May 22 12:17:12 2012
@@ -37,6 +37,8 @@ import org.apache.stanbol.entityhub.serv
 import org.apache.stanbol.entityhub.servicesapi.yard.YardException;
 import org.apache.stanbol.entityhub.yard.solr.impl.SolrYard;
 import org.apache.stanbol.entityhub.yard.solr.impl.SolrYardConfig;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 /**
  * Mocks an Entityhub for the {@link NamedEntityTaggingEngine} for Unit 
Testing<p>
@@ -46,12 +48,18 @@ import org.apache.stanbol.entityhub.yard
  */
 class MockEntityhub implements Entityhub {
 
+    private static final Logger log = 
LoggerFactory.getLogger(MockEntityhub.class);
+    
     protected SolrYard yard;
     
     protected MockEntityhub(){
         SolrYardConfig config = new SolrYardConfig("dbpedia", "dbpedia_43k");
         try {
             yard = new SolrYard(config);
+            Representation paris = 
yard.getRepresentation("http://dbpedia.org/resource/Paris");
+            if(paris == null){
+                throw new IllegalStateException("Initialised Yard does not 
contain the expected resource dbpedia:Paris!");
+            }
         } catch (YardException e) {
             throw new IllegalStateException("Unable to init Yard!",e);
         }
@@ -67,9 +75,12 @@ class MockEntityhub implements Entityhub
     }
     @Override
     public QueryResultList<Entity> findEntities(FieldQuery query) throws 
EntityhubException {
+        log.info("Performing Query: {}",query);
         QueryResultList<Representation> results = 
yard.findRepresentation(query);
+        log.info("  ... {} results",results.size());
         Collection<Entity> entities = new ArrayList<Entity>(results.size());
         for(Representation r : results){
+            log.info("    > {}",r.getId());
             entities.add(new EntityImpl("dbpedia", r, null));
         }
         return new 
QueryResultListImpl<Entity>(results.getQuery(),entities,Entity.class);

Modified: 
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/test/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/TestEntityLinkingEnhancementEngine.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/entitytagging/src/test/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/TestEntityLinkingEnhancementEngine.java?rev=1341438&r1=1341437&r2=1341438&view=diff
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/test/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/TestEntityLinkingEnhancementEngine.java
 (original)
+++ 
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/test/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/TestEntityLinkingEnhancementEngine.java
 Tue May 22 12:17:12 2012
@@ -16,11 +16,13 @@
  */
 package org.apache.stanbol.enhancer.engines.entitytagging.impl;
 
+import static 
org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper.randomUUID;
 import static 
org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses.DBPEDIA_ORGANISATION;
 import static 
org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses.DBPEDIA_PERSON;
 import static 
org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses.DBPEDIA_PLACE;
 import static 
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_CREATOR;
 import static 
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_RELATION;
+import static 
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_CONFIDENCE;
 import static 
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_LABEL;
 import static 
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_REFERENCE;
 import static 
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_EXTRACTED_FROM;
@@ -28,12 +30,14 @@ import static org.apache.stanbol.enhance
 import static 
org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses.ENHANCER_ENTITYANNOTATION;
 import static 
org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses.ENHANCER_TEXTANNOTATION;
 import static 
org.apache.stanbol.enhancer.test.helper.EnhancementStructureHelper.validateAllEntityAnnotations;
+import static 
org.apache.stanbol.enhancer.test.helper.EnhancementStructureHelper.validateEntityAnnotation;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
 
 import java.io.File;
 import java.io.IOException;
+import java.util.Collections;
 import java.util.Date;
 import java.util.HashMap;
 import java.util.Iterator;
@@ -43,13 +47,17 @@ import org.apache.clerezza.rdf.core.Lite
 import org.apache.clerezza.rdf.core.MGraph;
 import org.apache.clerezza.rdf.core.Resource;
 import org.apache.clerezza.rdf.core.Triple;
+import org.apache.clerezza.rdf.core.TypedLiteral;
 import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
+import org.apache.clerezza.rdf.core.impl.TripleImpl;
 import org.apache.commons.io.IOUtils;
 import 
org.apache.stanbol.enhancer.contentitem.inmemory.InMemoryContentItemFactory;
 import org.apache.stanbol.enhancer.rdfentities.RdfEntityFactory;
 import org.apache.stanbol.enhancer.rdfentities.fise.TextAnnotation;
 import org.apache.stanbol.enhancer.servicesapi.ContentItem;
 import org.apache.stanbol.enhancer.servicesapi.ContentItemFactory;
+import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
 import org.apache.stanbol.enhancer.servicesapi.impl.StringSource;
 import org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses;
 import org.apache.stanbol.enhancer.servicesapi.rdf.Properties;
@@ -60,10 +68,14 @@ import org.junit.Assert;
 import org.junit.Before;
 import org.junit.BeforeClass;
 import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 
 public class TestEntityLinkingEnhancementEngine {
-
+    
+    private static final Logger log = 
LoggerFactory.getLogger(TestEntityLinkingEnhancementEngine.class);
+    
     /**
      * The context for the tests (same as in TestOpenNLPEnhancementEngine)
      */
@@ -97,8 +109,9 @@ public class TestEntityLinkingEnhancemen
                 throw new IOException("Unable to create directory for test 
files "+testFiles);
             }
         }
-        
-        System.getProperties().setProperty("user.dir", 
testFiles.getCanonicalPath());
+        String testRootDir = testFiles.getCanonicalPath();
+        log.info("Test 'user.dir' folder {}",testRootDir);
+        System.getProperties().setProperty("user.dir", testRootDir);
         entityLinkingEngine = new NamedEntityTaggingEngine();
         //instead of calling activate we directly set the required fields
         //we need a data source for linking
@@ -140,20 +153,20 @@ public class TestEntityLinkingEnhancemen
             content = "";
         }
         RdfEntityFactory factory = 
RdfEntityFactory.createInstance(ci.getMetadata());
-        TextAnnotation testAnnotation = factory.getProxy(
-                new 
UriRef("urn:iks-project:enhancer:test:text-annotation:person"), 
TextAnnotation.class);
-        testAnnotation.setCreator(new 
UriRef("urn:iks-project:enhancer:test:dummyEngine"));
-        testAnnotation.setCreated(new Date());
-        testAnnotation.setSelectedText(name);
-        testAnnotation.setSelectionContext(context);
-        testAnnotation.getDcType().add(type);
+        TextAnnotation textAnnotation = factory.getProxy(
+                new 
UriRef("urn:iks-project:enhancer:test:text-annotation:"+randomUUID()), 
TextAnnotation.class);
+        textAnnotation.setCreator(new 
UriRef("urn:iks-project:enhancer:test:dummyEngine"));
+        textAnnotation.setCreated(new Date());
+        textAnnotation.setSelectedText(name);
+        textAnnotation.setSelectionContext(context);
+        textAnnotation.getDcType().add(type);
         Integer start = content.indexOf(name);
         if (start < 0){ //if not found in the content
             //set some random numbers for start/end
             start = (int)Math.random()*100;
         }
-        testAnnotation.setStart(start);
-        testAnnotation.setEnd(start+name.length());
+        textAnnotation.setStart(start);
+        textAnnotation.setEnd(start+name.length());
     }
 
     @Test
@@ -164,16 +177,46 @@ public class TestEntityLinkingEnhancemen
         getTextAnnotation(ci, PERSON, CONTEXT, DBPEDIA_PERSON);
         getTextAnnotation(ci, ORGANISATION, CONTEXT, DBPEDIA_ORGANISATION);
         getTextAnnotation(ci, PLACE, CONTEXT, DBPEDIA_PLACE);
+        //add the language
+        ci.getMetadata().add(new TripleImpl(ci.getUri(), 
Properties.DC_LANGUAGE, new PlainLiteralImpl("en")));
         //perform the computation of the enhancements
         entityLinkingEngine.computeEnhancements(ci);
+        int entityAnnotationCount = validateAllEntityAnnotations(ci);
+        assertEquals(4, entityAnnotationCount);
+    }
+    
+    private static int validateAllEntityAnnotations(ContentItem ci){
         Map<UriRef,Resource> expectedValues = new HashMap<UriRef,Resource>();
         expectedValues.put(ENHANCER_EXTRACTED_FROM, ci.getUri());
         
expectedValues.put(DC_CREATOR,LiteralFactory.getInstance().createTypedLiteral(
             entityLinkingEngine.getClass().getName()));
-        int entityAnnotationCount = 
validateAllEntityAnnotations(ci.getMetadata(),expectedValues);
-        assertEquals(3, entityAnnotationCount);
+        Iterator<Triple> entityAnnotationIterator = 
ci.getMetadata().filter(null,
+                RDF_TYPE, ENHANCER_ENTITYANNOTATION);
+        int entityAnnotationCount = 0;
+        while (entityAnnotationIterator.hasNext()) {
+            UriRef entityAnnotation = (UriRef) 
entityAnnotationIterator.next().getSubject();
+            // test if selected Text is added
+            validateEntityAnnotation(ci.getMetadata(), entityAnnotation, 
expectedValues);
+            //validate also that the confidence is between [0..1]
+            Iterator<Triple> confidenceIterator = 
ci.getMetadata().filter(entityAnnotation, ENHANCER_CONFIDENCE, null);
+            //NOTE: the fact that fise:confidence values are TypedLiterals of 
type xsd:double
+            //      is already validated at this point
+            //      Also that there are only [0..1] confidence values
+            assertTrue("Expected fise:confidence value is missing 
(entityAnnotation "
+                    +entityAnnotation+")",confidenceIterator.hasNext());
+            Double confidence = 
LiteralFactory.getInstance().createObject(Double.class,
+                (TypedLiteral)confidenceIterator.next().getObject());
+            assertTrue("fise:confidence MUST BE <= 1 (value= '"+confidence
+                    + "',entityAnnotation " +entityAnnotation+")",
+                    1.0 >= confidence.doubleValue());
+            assertTrue("fise:confidence MUST BE >= 0 (value= '"+confidence
+                    +"',entityAnnotation "+entityAnnotation+")",
+                    0.0 <= confidence.doubleValue());
+            entityAnnotationCount++;
+        }
+        return entityAnnotationCount;
+        
     }
 
 
-
 }

Modified: 
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/test/resources/META-INF/services/org.apache.stanbol.commons.stanboltools.datafileprovider.DataFileProvider
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/entitytagging/src/test/resources/META-INF/services/org.apache.stanbol.commons.stanboltools.datafileprovider.DataFileProvider?rev=1341438&r1=1341437&r2=1341438&view=diff
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/test/resources/META-INF/services/org.apache.stanbol.commons.stanboltools.datafileprovider.DataFileProvider
 (original)
+++ 
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/test/resources/META-INF/services/org.apache.stanbol.commons.stanboltools.datafileprovider.DataFileProvider
 Tue May 22 12:17:12 2012
@@ -13,4 +13,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-org.apache.stanbol.enhancer.engines.entitytagging.impl.TestDataFileProvider
\ No newline at end of file
+org.apache.stanbol.enhancer.engines.entitytagging.impl.DbPediaDataFileProvider
\ No newline at end of file

Modified: 
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java?rev=1341438&r1=1341437&r2=1341438&view=diff
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java
 (original)
+++ 
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java
 Tue May 22 12:17:12 2012
@@ -82,6 +82,7 @@ import org.apache.stanbol.entityhub.serv
 import org.apache.stanbol.entityhub.servicesapi.defaults.NamespaceEnum;
 import org.apache.stanbol.entityhub.servicesapi.model.Reference;
 import org.apache.stanbol.entityhub.servicesapi.model.Text;
+import org.apache.stanbol.entityhub.servicesapi.model.rdf.RdfResourceEnum;
 import org.apache.stanbol.entityhub.servicesapi.site.ReferencedSite;
 import org.osgi.framework.Constants;
 import org.osgi.service.cm.ConfigurationException;
@@ -239,6 +240,7 @@ public class KeywordLinkingEngine 
         unbind = "disableOfflineMode", 
         strategy = ReferenceStrategy.EVENT)
     private OfflineMode offlineMode;
+    private String referencedSiteName;
 
     /**
      * Called by the ConfigurationAdmin to bind the {@link #offlineMode} if 
the service becomes available
@@ -452,6 +454,10 @@ public class KeywordLinkingEngine 
                     metadata.add(new TripleImpl(entityAnnotation, 
                         Properties.DC_RELATION, textAnnotation));
                 }
+                //add the name of the ReferencedSite providing this suggestion
+                metadata.add(new TripleImpl(entityAnnotation, 
+                    new UriRef(RdfResourceEnum.site.getUri()), 
+                    new PlainLiteralImpl(referencedSiteName)));
                 //in case dereferencing of Entities is enabled we need also to
                 //add the RDF data for entities
                 if(dereferenceEntitiesState){
@@ -827,16 +833,16 @@ public class KeywordLinkingEngine 
             throw new ConfigurationException(REFERENCED_SITE_ID,
                     "The ID of the Referenced Site is a required Parameter and 
MUST NOT be NULL!");
         }
-        String refSiteId = value.toString();
-        if (refSiteId.isEmpty()) {
+        referencedSiteName = value.toString();
+        if (referencedSiteName.isEmpty()) {
             throw new ConfigurationException(REFERENCED_SITE_ID,
                     "The ID of the Referenced Site is a required Parameter and 
MUST NOT be an empty String!");
         }
         //TODO: make limit configurable!
-        if(Entityhub.ENTITYHUB_IDS.contains(refSiteId.toLowerCase())){
+        if(Entityhub.ENTITYHUB_IDS.contains(referencedSiteName.toLowerCase())){
             entitySearcher = new 
EntityhubSearcher(context.getBundleContext(),10);
         } else {
-            entitySearcher = new 
ReferencedSiteSearcher(context.getBundleContext(),refSiteId,10);
+            entitySearcher = new 
ReferencedSiteSearcher(context.getBundleContext(),referencedSiteName,10);
         }
     }
     /**
@@ -891,5 +897,6 @@ public class KeywordLinkingEngine 
             ((TrackingEntitySearcher<?>)entitySearcher).close();
         }
         entitySearcher = null;
+        referencedSiteName = null;
     }
 }


Reply via email to