Author: rwesten
Date: Tue May 22 12:17:12 2012
New Revision: 1341438
URL: http://svn.apache.org/viewvc?rev=1341438&view=rev
Log:
* implementation of STANBOL-624 and STANBOL-625 for the Keywordextraction and
NamedEntityLinking engines
Added:
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/Suggestion.java
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/test/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/DbPediaDataFileProvider.java
(contents, props changed)
- copied, changed from r1340995,
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/test/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/TestDataFileProvider.java
Removed:
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/test/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/TestDataFileProvider.java
Modified:
incubator/stanbol/trunk/enhancer/engines/entitytagging/pom.xml
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/EnhancementRDFUtils.java
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/NamedEntityTaggingEngine.java
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/test/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/MockEntityhub.java
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/test/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/TestEntityLinkingEnhancementEngine.java
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/test/resources/META-INF/services/org.apache.stanbol.commons.stanboltools.datafileprovider.DataFileProvider
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java
Modified: incubator/stanbol/trunk/enhancer/engines/entitytagging/pom.xml
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/entitytagging/pom.xml?rev=1341438&r1=1341437&r2=1341438&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/entitytagging/pom.xml (original)
+++ incubator/stanbol/trunk/enhancer/engines/entitytagging/pom.xml Tue May 22
12:17:12 2012
@@ -93,7 +93,7 @@
<dependency>
<groupId>org.apache.stanbol</groupId>
<artifactId>org.apache.stanbol.entityhub.servicesapi</artifactId>
- <version>0.9.0-incubating</version>
+ <version>0.10.0-incubating-SNAPSHOT</version>
<scope>compile</scope>
</dependency>
<dependency>
@@ -107,6 +107,10 @@
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
</dependency>
+ <dependency>
+ <groupId>commons-lang</groupId>
+ <artifactId>commons-lang</artifactId>
+ </dependency>
<dependency>
<groupId>org.apache.felix</groupId>
@@ -137,13 +141,13 @@
<dependency>
<groupId>org.apache.stanbol</groupId>
<artifactId>org.apache.stanbol.entityhub.core</artifactId>
- <version>0.9.0-incubating</version>
+ <version>0.10.0-incubating-SNAPSHOT</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.stanbol</groupId>
<artifactId>org.apache.stanbol.entityhub.yard.solr</artifactId>
- <version>0.9.0-incubating</version>
+ <version>0.10.0-incubating-SNAPSHOT</version>
<scope>test</scope>
</dependency>
<dependency>
Modified:
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/EnhancementRDFUtils.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/EnhancementRDFUtils.java?rev=1341438&r1=1341437&r2=1341438&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/EnhancementRDFUtils.java
(original)
+++
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/EnhancementRDFUtils.java
Tue May 22 12:17:12 2012
@@ -21,7 +21,6 @@ import static org.apache.stanbol.enhance
import static
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_LABEL;
import static
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_REFERENCE;
import static
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_TYPE;
-import static
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDFS_LABEL;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE;
import java.util.Collection;
@@ -60,37 +59,39 @@ public class EnhancementRDFUtils {
* the contentItemId the enhancement is extracted from
* @param relatedEnhancements
* enhancements this textAnnotation is related to
- * @param entity
- * the related entity
+ * @param suggestion
+ * the entity suggestion
* @param nameField the field used to extract the name
- * @param lang the preferred language to include
+ * @param lang the preferred language to include or <code>null</code> if
none
*/
public static UriRef writeEntityAnnotation(EnhancementEngine engine,
LiteralFactory literalFactory,
MGraph graph,
UriRef contentItemId,
Collection<NonLiteral>
relatedEnhancements,
- Representation rep,
+ Suggestion suggestion,
String nameField,
String lang) {
- // 1. check if the returned Entity does has a label -> if not return
null
- // add labels (set only a single label. Use "en" if available!
- Text label = null;
- Iterator<Text> labels = rep.getText(nameField);
- while (labels.hasNext()) {
- Text actLabel = labels.next();
- if (label == null) {
- label = actLabel;
- } else {
- //use startWith to match also en-GB and en-US ...
- if (actLabel.getLanguage() != null &&
actLabel.getLanguage().startsWith(lang)) {
+ Representation rep = suggestion.getEntity().getRepresentation();
+ // 1. extract the "best label"
+ //Start with the matched one
+ Text label = suggestion.getMatchedLabel();
+ //if the matched label is not in the requested language
+ boolean langMatch = (lang == null && label.getLanguage() == null) ||
+ (label.getLanguage() != null &&
label.getLanguage().startsWith(lang));
+ //search if a better label is available for this Entity
+ if(!langMatch){
+ Iterator<Text> labels = rep.getText(nameField);
+ while (labels.hasNext() && !langMatch) {
+ Text actLabel = labels.next();
+ langMatch = (lang == null && actLabel.getLanguage() == null) ||
+ (actLabel.getLanguage() != null &&
actLabel.getLanguage().startsWith(lang));
+ if(langMatch){ //if the language matches ->
+ //override the matched label
label = actLabel;
}
}
- }
- if (label == null) {
- return null;
- }
+ } //else the matched label will be the best to use
Literal literal;
if (label.getLanguage() == null) {
literal = new PlainLiteralImpl(label.getText());
@@ -109,31 +110,23 @@ public class EnhancementRDFUtils {
graph.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_REFERENCE,
entityUri));
// add the label parsed above
graph.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_LABEL,
literal));
- // TODO: add real confidence values!
- // -> in case of SolrYards this will be a Lucene score and not within
the range [0..1]
- // -> in case of SPARQL there will be no score information at all.
- Object score = rep.getFirst(RdfResourceEnum.resultScore.getUri());
- Double scoreValue = new Double(-1); // use -1 if no score is available!
- if (score != null) {
- try {
- scoreValue = Double.valueOf(score.toString());
- } catch (NumberFormatException e) {
- // ignore
- }
+ if (suggestion.getScore() != null) {
+ graph.add(new TripleImpl(entityAnnotation, ENHANCER_CONFIDENCE,
literalFactory
+ .createTypedLiteral(suggestion.getScore())));
}
- graph.add(new TripleImpl(entityAnnotation, ENHANCER_CONFIDENCE,
literalFactory
- .createTypedLiteral(scoreValue)));
Iterator<Reference> types =
rep.getReferences(RDF_TYPE.getUnicodeString());
while (types.hasNext()) {
graph.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_TYPE,
new UriRef(types.next()
.getReference())));
}
- // TODO: for now add the information about this entity to the graph
- // -> this might be replaced by some additional engine at the end
- // RdfValueFactory rdfValueFactory = RdfValueFactory.getInstance();
- // RdfRepresentation representation =
rdfValueFactory.toRdfRepresentation(entity.getRepresentation());
- // graph.addAll(representation.getRdfGraph());
+ //add the name of the ReferencedSite that manages the Entity
+ if(suggestion.getEntity().getSite() != null){
+ graph.add(new TripleImpl(entityAnnotation,
+ new UriRef(RdfResourceEnum.site.getUri()),
+ new PlainLiteralImpl(suggestion.getEntity().getSite())));
+ }
+
return entityAnnotation;
}
Modified:
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/NamedEntityTaggingEngine.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/NamedEntityTaggingEngine.java?rev=1341438&r1=1341437&r2=1341438&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/NamedEntityTaggingEngine.java
(original)
+++
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/NamedEntityTaggingEngine.java
Tue May 22 12:17:12 2012
@@ -16,6 +16,7 @@
*/
package org.apache.stanbol.enhancer.engines.entitytagging.impl;
+import static org.apache.commons.lang.StringUtils.getLevenshteinDistance;
import static
org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses.DBPEDIA_ORGANISATION;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE;
@@ -33,6 +34,7 @@ import org.apache.clerezza.rdf.core.MGra
import org.apache.clerezza.rdf.core.NonLiteral;
import org.apache.clerezza.rdf.core.Triple;
import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.commons.lang.StringUtils;
import org.apache.felix.scr.annotations.Activate;
import org.apache.felix.scr.annotations.Component;
import org.apache.felix.scr.annotations.ConfigurationPolicy;
@@ -353,10 +355,10 @@ public class NamedEntityTaggingEngine
ci.getLock().readLock().unlock();
}
//search the suggestions
- Map<NamedEntity,List<Entity>> suggestions = new
HashMap<NamedEntity,List<Entity>>(textAnnotations.size());
+ Map<NamedEntity,List<Suggestion>> suggestions = new
HashMap<NamedEntity,List<Suggestion>>(textAnnotations.size());
for (Entry<NamedEntity,List<UriRef>> entry :
textAnnotations.entrySet()) {
try {
- List<Entity> entitySuggestions = computeEntityRecommentations(
+ List<Suggestion> entitySuggestions =
computeEntityRecommentations(
site, entry.getKey(),entry.getValue(),contentLangauge);
if(entitySuggestions != null && !entitySuggestions.isEmpty()){
suggestions.put(entry.getKey(), entitySuggestions);
@@ -370,19 +372,19 @@ public class NamedEntityTaggingEngine
try {
RdfValueFactory factory = RdfValueFactory.getInstance();
Map<String, Representation> entityData = new
HashMap<String,Representation>();
- for(Entry<NamedEntity,List<Entity>> entitySuggestions :
suggestions.entrySet()){
+ for(Entry<NamedEntity,List<Suggestion>> entitySuggestions :
suggestions.entrySet()){
List<UriRef> subsumed =
textAnnotations.get(entitySuggestions.getKey());
List<NonLiteral> annotationsToRelate = new
ArrayList<NonLiteral>(subsumed);
annotationsToRelate.add(entitySuggestions.getKey().getEntity());
- for(Entity suggestion : entitySuggestions.getValue()){
- log.debug("Add Suggestion {} for {}", suggestion.getId(),
entitySuggestions.getKey());
+ for(Suggestion suggestion : entitySuggestions.getValue()){
+ log.debug("Add Suggestion {} for {}",
suggestion.getEntity().getId(), entitySuggestions.getKey());
EnhancementRDFUtils.writeEntityAnnotation(this,
literalFactory, graph, ci.getUri(),
- annotationsToRelate, suggestion.getRepresentation(),
nameField,
+ annotationsToRelate, suggestion, nameField,
//TODO: maybe we want labels in a different language
than the
// language of the content (e.g. Accept-Language
header)?!
contentLangauge == null ? DEFAULT_LANGUAGE :
contentLangauge);
if (dereferenceEntities) {
- entityData.put(suggestion.getId(),
suggestion.getRepresentation());
+ entityData.put(suggestion.getEntity().getId(),
suggestion.getEntity().getRepresentation());
}
}
}
@@ -405,19 +407,19 @@ public class NamedEntityTaggingEngine
* @param contentItemId the id of the contentItem
* @param textAnnotation the text annotation to enhance
* @param subsumedAnnotations other text annotations for the same entity
- * @param language the language of the analyzed text or <code>null</code>
+ * @param language the language of the analysed text or <code>null</code>
* if not available.
- * @return the suggested {@link Entity entities}
+ * @return the suggestions for the parsed {@link NamedEntity}
* @throws EntityhubException On any Error while looking up Entities via
* the Entityhub
*/
- protected final List<Entity> computeEntityRecommentations(ReferencedSite
site,
+ protected final List<Suggestion>
computeEntityRecommentations(ReferencedSite site,
NamedEntity namedEntity,
List<UriRef> subsumedAnnotations, String language) throws
EntityhubException {
// First get the required properties for the parsed textAnnotation
// ... and check the values
- log.debug("Process {}", namedEntity);
+ log.info("Process {}", namedEntity);
FieldQuery query = site == null ? //if site is NULL use the Entityhub
entityhub.getQueryFactory().createFieldQuery() :
site.getQueryFactory().createFieldQuery();
@@ -425,11 +427,12 @@ public class NamedEntityTaggingEngine
Constraint labelConstraint;
//TODO: make case sensitivity configurable
boolean casesensitive = false;
+ String namedEntityLabel = casesensitive ? namedEntity.getName() :
namedEntity.getName().toLowerCase();
if(language != null){
//search labels in the language and without language
- labelConstraint = new
TextConstraint(namedEntity.getName(),casesensitive,language,null);
+ labelConstraint = new
TextConstraint(namedEntityLabel,casesensitive,language,null);
} else {
- labelConstraint = new
TextConstraint(namedEntity.getName(),casesensitive);
+ labelConstraint = new
TextConstraint(namedEntityLabel,casesensitive);
}
query.setConstraint(nameField, labelConstraint);
if (OntologicalClasses.DBPEDIA_PERSON.equals(namedEntity.getType())) {
@@ -467,55 +470,68 @@ public class NamedEntityTaggingEngine
QueryResultList<Entity> results = site == null? //if site is NULL
entityhub.findEntities(query) : //use the Entityhub
site.findEntities(query); //else the referenced site
- log.debug("{} results returned by query {}", results.size(), query);
-
+ log.info(" - {} results returned by query {}", results.size(),
results.getQuery());
+ if(results.isEmpty()){ //no results nothing to do
+ return Collections.emptyList();
+ }
+ //we need to normalise the confidence values from [0..1]
+ // * levenshtein distance as absolute (1.0 for exact match)
+ // * Solr scores * levenshtein to rank entities relative to each other
Float maxScore = null;
- int exactCount = 0;
- List<Entity> matches = new ArrayList<Entity>(numSuggestions);
- for (Iterator<Entity> guesses = results.iterator();guesses.hasNext()
&& exactCount<numSuggestions;) {
- Entity guess = guesses.next();
- Representation rep = guess.getRepresentation();
+ Float maxExactScore = null;
+ List<Suggestion> matches = new ArrayList<Suggestion>(numSuggestions);
+ //assumes entities are sorted by score
+ for (Iterator<Entity> guesses = results.iterator();guesses.hasNext();)
{
+ Suggestion match = new Suggestion(guesses.next());
+ Representation rep = match.getEntity().getRepresentation();
+ Float score =
rep.getFirst(RdfResourceEnum.resultScore.getUri(),Float.class);
if(maxScore == null){
- maxScore =
rep.getFirst(RdfResourceEnum.resultScore.getUri(),Float.class);
+ maxScore = score;
}
Iterator<Text> labels = rep.getText(nameField);
- boolean found = false;
- while(labels.hasNext() && !found){
+ while(labels.hasNext() && match.getLevenshtein() < 1.0){
Text label = labels.next();
- if(label.getLanguage() == null || (language != null &&
label.getLanguage().startsWith(language))){
-
if(label.getText().equalsIgnoreCase(namedEntity.getName())){
- found = true;
+ if(language == null || //if the content language is unknown ->
accept all labels
+ label.getLanguage() == null || //accept labels with
no language
+ //and labels in the same language as the content
+ (language != null &&
label.getLanguage().startsWith(language))){
+ double actMatch = levenshtein(
+ casesensitive ? label.getText().toLowerCase() :
label.getText(),
+ namedEntityLabel);
+ if(actMatch > match.getLevenshtein()){
+ match.setLevenshtein(actMatch);
+ match.setMatchedLabel(label);
}
}
}
- if(found){
- matches.add(exactCount,guess);
- exactCount++;
- } else if(matches.size()<numSuggestions){
- matches.add(guess);
- }
- }
- //now write the results
- for(int i=0;i<matches.size();i++){
- Representation rep = matches.get(i).getRepresentation();
- if(i<exactCount){ //and boost the scores of the exact matches
- if(maxScore == null){
- rep.set(RdfResourceEnum.resultScore.getUri(), 1.0f);
+ if(match.getMatchedLabel() != null){
+ if(match.getLevenshtein() == 1.0){
+ if(maxExactScore == null){
+ maxExactScore = score;
+ }
+ //normalise exact matches against the best exact score
+
match.setScore(score.doubleValue()/maxExactScore.doubleValue());
} else {
- Float score =
rep.getFirst(RdfResourceEnum.resultScore.getUri(), Float.class);
- rep.set(RdfResourceEnum.resultScore.getUri(),
- maxScore.doubleValue()+(score !=
null?score.doubleValue():0));
+ //normalise partial matches against the best match and the
+ //Levenshtein similarity with the label
+
match.setScore(score.doubleValue()*match.getLevenshtein()/maxScore.doubleValue());
}
+ matches.add(match);
+ } else {
+ log.info("No value of {} for Entity
{}!",nameField,match.getEntity().getId());
}
}
- return matches;
+ //now sort the results
+ Collections.sort(matches);
+ return matches.subList(0, Math.min(matches.size(),numSuggestions));
}
+ /**
+ * This EnhancementEngine can enhance any ContentItem as it does consume
+ * existing TextAnnotations with the configured dc:type's
+ * @see
org.apache.stanbol.enhancer.servicesapi.EnhancementEngine#canEnhance(org.apache.stanbol.enhancer.servicesapi.ContentItem)
+ */
public int canEnhance(ContentItem ci) {
- /*
- * This engine consumes existing enhancements because of that it can
enhance any type of ci! TODO: It
- * would also be possible to check here if there is an TextAnnotation
and use that as result!
- */
return ENHANCE_ASYNC; //Entity tagging now supports asyc processing
}
@@ -524,5 +540,23 @@ public class NamedEntityTaggingEngine
return
Collections.unmodifiableMap(Collections.singletonMap(ENHANCEMENT_ENGINE_ORDERING,
(Object) defaultOrder));
}
-
+ /**
+ * Compares two strings (after {@link StringUtils#trim(String) trimming})
+ * by using the Levenshtein's Edit Distance of the two
+ * strings. Does not return the {@link Integer} number of changes but
+ * <code>1-(changes/maxStringSizeAfterTrim)</code><p>
+ * @param s1 the first string
+ * @param s2 the second string
+ * @return the distance
+ * @throws IllegalArgumentException if any of the two parsed strings is
NULL
+ */
+ private static double levenshtein(String s1, String s2) {
+ if(s1 == null || s2 == null){
+ throw new IllegalArgumentException("NONE of the parsed String MUST
BE NULL!");
+ }
+ s1 = StringUtils.trim(s1);
+ s2 = StringUtils.trim(s2);
+ return s1.isEmpty() || s2.isEmpty() ? 0 :
+ 1.0 - (((double)getLevenshteinDistance(s1, s2)) /
((double)(Math.max(s1.length(), s2.length()))));
+ }
}
Added:
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/Suggestion.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/Suggestion.java?rev=1341438&view=auto
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/Suggestion.java
(added)
+++
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/Suggestion.java
Tue May 22 12:17:12 2012
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.engines.entitytagging.impl;
+
+import org.apache.stanbol.entityhub.servicesapi.model.Entity;
+import org.apache.stanbol.entityhub.servicesapi.model.Text;
+
+/**
+ * A suggestion of an {@link Entity} for a fise:TextAnnotation processed
+ * by the NamedEntityTaggingEngine
+ * @author Rupert Westenthaler
+ */
+public class Suggestion implements Comparable<Suggestion>{
+ private final Entity entity;
+ private double levenshtein = -1;
+ private Double score;
+ private Text matchedLabel;
+
+ protected Suggestion(Entity entity){
+ this.entity = entity;
+ }
+
+
+ /**
+ * @return the levenshtein
+ */
+ public final double getLevenshtein() {
+ return levenshtein;
+ }
+
+
+ /**
+ * @param levenshtein the levenshtein to set
+ */
+ protected final void setLevenshtein(double levenshtein) {
+ this.levenshtein = levenshtein;
+ }
+
+
+ /**
+ * @return the score
+ */
+ public final Double getScore() {
+ return score;
+ }
+
+
+ /**
+ * @param score the score to set
+ */
+ protected final void setScore(Double score) {
+ this.score = score;
+ }
+
+
+ /**
+ * @return the matchedLabel
+ */
+ public final Text getMatchedLabel() {
+ return matchedLabel;
+ }
+
+
+ /**
+ * @param matchedLabel the matchedLabel to set
+ */
+ protected final void setMatchedLabel(Text matchedLabel) {
+ this.matchedLabel = matchedLabel;
+ }
+
+
+ /**
+ * @return the entity
+ */
+ public final Entity getEntity() {
+ return entity;
+ }
+
+
+ @Override
+ public int compareTo(Suggestion other) {
+ return other.score.compareTo(score);
+ }
+
+
+}
\ No newline at end of file
Copied:
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/test/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/DbPediaDataFileProvider.java
(from r1340995,
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/test/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/TestDataFileProvider.java)
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/entitytagging/src/test/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/DbPediaDataFileProvider.java?p2=incubator/stanbol/trunk/enhancer/engines/entitytagging/src/test/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/DbPediaDataFileProvider.java&p1=incubator/stanbol/trunk/enhancer/engines/entitytagging/src/test/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/TestDataFileProvider.java&r1=1340995&r2=1341438&rev=1341438&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/test/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/TestDataFileProvider.java
(original)
+++
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/test/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/DbPediaDataFileProvider.java
Tue May 22 12:17:12 2012
@@ -34,7 +34,7 @@ import org.apache.stanbol.commons.stanbo
* @author Rupert Westenthaler
*
*/
-public class TestDataFileProvider implements DataFileProvider {
+public class DbPediaDataFileProvider implements DataFileProvider {
private static String DBPEDIA_PREFIX =
"org/apache/stanbol/data/site/dbpedia/default/index/";
@@ -58,7 +58,7 @@ public class TestDataFileProvider implem
* @return
*/
private URL lookupResource(String resource) {
- ClassLoader cl = TestDataFileProvider.class.getClassLoader();
+ ClassLoader cl = DbPediaDataFileProvider.class.getClassLoader();
URL resourceUri = cl.getResource(resource);
if(resourceUri == null){
cl = Thread.currentThread().getContextClassLoader();
Propchange:
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/test/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/DbPediaDataFileProvider.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Modified:
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/test/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/MockEntityhub.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/entitytagging/src/test/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/MockEntityhub.java?rev=1341438&r1=1341437&r2=1341438&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/test/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/MockEntityhub.java
(original)
+++
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/test/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/MockEntityhub.java
Tue May 22 12:17:12 2012
@@ -37,6 +37,8 @@ import org.apache.stanbol.entityhub.serv
import org.apache.stanbol.entityhub.servicesapi.yard.YardException;
import org.apache.stanbol.entityhub.yard.solr.impl.SolrYard;
import org.apache.stanbol.entityhub.yard.solr.impl.SolrYardConfig;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
/**
* Mocks an Entityhub for the {@link NamedEntityTaggingEngine} for Unit
Testing<p>
@@ -46,12 +48,18 @@ import org.apache.stanbol.entityhub.yard
*/
class MockEntityhub implements Entityhub {
+ private static final Logger log =
LoggerFactory.getLogger(MockEntityhub.class);
+
protected SolrYard yard;
protected MockEntityhub(){
SolrYardConfig config = new SolrYardConfig("dbpedia", "dbpedia_43k");
try {
yard = new SolrYard(config);
+ Representation paris =
yard.getRepresentation("http://dbpedia.org/resource/Paris");
+ if(paris == null){
+ throw new IllegalStateException("Initialised Yard does not
contain the expected resource dbpedia:Paris!");
+ }
} catch (YardException e) {
throw new IllegalStateException("Unable to init Yard!",e);
}
@@ -67,9 +75,12 @@ class MockEntityhub implements Entityhub
}
@Override
public QueryResultList<Entity> findEntities(FieldQuery query) throws
EntityhubException {
+ log.info("Performing Query: {}",query);
QueryResultList<Representation> results =
yard.findRepresentation(query);
+ log.info(" ... {} results",results.size());
Collection<Entity> entities = new ArrayList<Entity>(results.size());
for(Representation r : results){
+ log.info(" > {}",r.getId());
entities.add(new EntityImpl("dbpedia", r, null));
}
return new
QueryResultListImpl<Entity>(results.getQuery(),entities,Entity.class);
Modified:
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/test/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/TestEntityLinkingEnhancementEngine.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/entitytagging/src/test/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/TestEntityLinkingEnhancementEngine.java?rev=1341438&r1=1341437&r2=1341438&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/test/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/TestEntityLinkingEnhancementEngine.java
(original)
+++
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/test/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/TestEntityLinkingEnhancementEngine.java
Tue May 22 12:17:12 2012
@@ -16,11 +16,13 @@
*/
package org.apache.stanbol.enhancer.engines.entitytagging.impl;
+import static
org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper.randomUUID;
import static
org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses.DBPEDIA_ORGANISATION;
import static
org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses.DBPEDIA_PERSON;
import static
org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses.DBPEDIA_PLACE;
import static
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_CREATOR;
import static
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_RELATION;
+import static
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_CONFIDENCE;
import static
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_LABEL;
import static
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_REFERENCE;
import static
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_EXTRACTED_FROM;
@@ -28,12 +30,14 @@ import static org.apache.stanbol.enhance
import static
org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses.ENHANCER_ENTITYANNOTATION;
import static
org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses.ENHANCER_TEXTANNOTATION;
import static
org.apache.stanbol.enhancer.test.helper.EnhancementStructureHelper.validateAllEntityAnnotations;
+import static
org.apache.stanbol.enhancer.test.helper.EnhancementStructureHelper.validateEntityAnnotation;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import java.io.File;
import java.io.IOException;
+import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
@@ -43,13 +47,17 @@ import org.apache.clerezza.rdf.core.Lite
import org.apache.clerezza.rdf.core.MGraph;
import org.apache.clerezza.rdf.core.Resource;
import org.apache.clerezza.rdf.core.Triple;
+import org.apache.clerezza.rdf.core.TypedLiteral;
import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
+import org.apache.clerezza.rdf.core.impl.TripleImpl;
import org.apache.commons.io.IOUtils;
import
org.apache.stanbol.enhancer.contentitem.inmemory.InMemoryContentItemFactory;
import org.apache.stanbol.enhancer.rdfentities.RdfEntityFactory;
import org.apache.stanbol.enhancer.rdfentities.fise.TextAnnotation;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.ContentItemFactory;
+import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
import org.apache.stanbol.enhancer.servicesapi.impl.StringSource;
import org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses;
import org.apache.stanbol.enhancer.servicesapi.rdf.Properties;
@@ -60,10 +68,14 @@ import org.junit.Assert;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
public class TestEntityLinkingEnhancementEngine {
-
+
+ private static final Logger log =
LoggerFactory.getLogger(TestEntityLinkingEnhancementEngine.class);
+
/**
* The context for the tests (same as in TestOpenNLPEnhancementEngine)
*/
@@ -97,8 +109,9 @@ public class TestEntityLinkingEnhancemen
throw new IOException("Unable to create directory for test
files "+testFiles);
}
}
-
- System.getProperties().setProperty("user.dir",
testFiles.getCanonicalPath());
+ String testRootDir = testFiles.getCanonicalPath();
+ log.info("Test 'user.dir' folder {}",testRootDir);
+ System.getProperties().setProperty("user.dir", testRootDir);
entityLinkingEngine = new NamedEntityTaggingEngine();
//instead of calling activate we directly set the required fields
//we need a data source for linking
@@ -140,20 +153,20 @@ public class TestEntityLinkingEnhancemen
content = "";
}
RdfEntityFactory factory =
RdfEntityFactory.createInstance(ci.getMetadata());
- TextAnnotation testAnnotation = factory.getProxy(
- new
UriRef("urn:iks-project:enhancer:test:text-annotation:person"),
TextAnnotation.class);
- testAnnotation.setCreator(new
UriRef("urn:iks-project:enhancer:test:dummyEngine"));
- testAnnotation.setCreated(new Date());
- testAnnotation.setSelectedText(name);
- testAnnotation.setSelectionContext(context);
- testAnnotation.getDcType().add(type);
+ TextAnnotation textAnnotation = factory.getProxy(
+ new
UriRef("urn:iks-project:enhancer:test:text-annotation:"+randomUUID()),
TextAnnotation.class);
+ textAnnotation.setCreator(new
UriRef("urn:iks-project:enhancer:test:dummyEngine"));
+ textAnnotation.setCreated(new Date());
+ textAnnotation.setSelectedText(name);
+ textAnnotation.setSelectionContext(context);
+ textAnnotation.getDcType().add(type);
Integer start = content.indexOf(name);
if (start < 0){ //if not found in the content
//set some random numbers for start/end
start = (int)Math.random()*100;
}
- testAnnotation.setStart(start);
- testAnnotation.setEnd(start+name.length());
+ textAnnotation.setStart(start);
+ textAnnotation.setEnd(start+name.length());
}
@Test
@@ -164,16 +177,46 @@ public class TestEntityLinkingEnhancemen
getTextAnnotation(ci, PERSON, CONTEXT, DBPEDIA_PERSON);
getTextAnnotation(ci, ORGANISATION, CONTEXT, DBPEDIA_ORGANISATION);
getTextAnnotation(ci, PLACE, CONTEXT, DBPEDIA_PLACE);
+ //add the language
+ ci.getMetadata().add(new TripleImpl(ci.getUri(),
Properties.DC_LANGUAGE, new PlainLiteralImpl("en")));
//perform the computation of the enhancements
entityLinkingEngine.computeEnhancements(ci);
+ int entityAnnotationCount = validateAllEntityAnnotations(ci);
+ assertEquals(4, entityAnnotationCount);
+ }
+
+ private static int validateAllEntityAnnotations(ContentItem ci){
Map<UriRef,Resource> expectedValues = new HashMap<UriRef,Resource>();
expectedValues.put(ENHANCER_EXTRACTED_FROM, ci.getUri());
expectedValues.put(DC_CREATOR,LiteralFactory.getInstance().createTypedLiteral(
entityLinkingEngine.getClass().getName()));
- int entityAnnotationCount =
validateAllEntityAnnotations(ci.getMetadata(),expectedValues);
- assertEquals(3, entityAnnotationCount);
+ Iterator<Triple> entityAnnotationIterator =
ci.getMetadata().filter(null,
+ RDF_TYPE, ENHANCER_ENTITYANNOTATION);
+ int entityAnnotationCount = 0;
+ while (entityAnnotationIterator.hasNext()) {
+ UriRef entityAnnotation = (UriRef)
entityAnnotationIterator.next().getSubject();
+ // test if selected Text is added
+ validateEntityAnnotation(ci.getMetadata(), entityAnnotation,
expectedValues);
+ //validate also that the confidence is between [0..1]
+ Iterator<Triple> confidenceIterator =
ci.getMetadata().filter(entityAnnotation, ENHANCER_CONFIDENCE, null);
+ //NOTE: the fact that fise:confidence values are TypedLiterals of
type xsd:double
+ // is already validated at this point
+ // Also that there are only [0..1] confidence values
+ assertTrue("Expected fise:confidence value is missing
(entityAnnotation "
+ +entityAnnotation+")",confidenceIterator.hasNext());
+ Double confidence =
LiteralFactory.getInstance().createObject(Double.class,
+ (TypedLiteral)confidenceIterator.next().getObject());
+ assertTrue("fise:confidence MUST BE <= 1 (value= '"+confidence
+ + "',entityAnnotation " +entityAnnotation+")",
+ 1.0 >= confidence.doubleValue());
+ assertTrue("fise:confidence MUST BE >= 0 (value= '"+confidence
+ +"',entityAnnotation "+entityAnnotation+")",
+ 0.0 <= confidence.doubleValue());
+ entityAnnotationCount++;
+ }
+ return entityAnnotationCount;
+
}
-
}
Modified:
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/test/resources/META-INF/services/org.apache.stanbol.commons.stanboltools.datafileprovider.DataFileProvider
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/entitytagging/src/test/resources/META-INF/services/org.apache.stanbol.commons.stanboltools.datafileprovider.DataFileProvider?rev=1341438&r1=1341437&r2=1341438&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/test/resources/META-INF/services/org.apache.stanbol.commons.stanboltools.datafileprovider.DataFileProvider
(original)
+++
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/test/resources/META-INF/services/org.apache.stanbol.commons.stanboltools.datafileprovider.DataFileProvider
Tue May 22 12:17:12 2012
@@ -13,4 +13,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-org.apache.stanbol.enhancer.engines.entitytagging.impl.TestDataFileProvider
\ No newline at end of file
+org.apache.stanbol.enhancer.engines.entitytagging.impl.DbPediaDataFileProvider
\ No newline at end of file
Modified:
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java?rev=1341438&r1=1341437&r2=1341438&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java
(original)
+++
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java
Tue May 22 12:17:12 2012
@@ -82,6 +82,7 @@ import org.apache.stanbol.entityhub.serv
import org.apache.stanbol.entityhub.servicesapi.defaults.NamespaceEnum;
import org.apache.stanbol.entityhub.servicesapi.model.Reference;
import org.apache.stanbol.entityhub.servicesapi.model.Text;
+import org.apache.stanbol.entityhub.servicesapi.model.rdf.RdfResourceEnum;
import org.apache.stanbol.entityhub.servicesapi.site.ReferencedSite;
import org.osgi.framework.Constants;
import org.osgi.service.cm.ConfigurationException;
@@ -239,6 +240,7 @@ public class KeywordLinkingEngine
unbind = "disableOfflineMode",
strategy = ReferenceStrategy.EVENT)
private OfflineMode offlineMode;
+ private String referencedSiteName;
/**
* Called by the ConfigurationAdmin to bind the {@link #offlineMode} if
the service becomes available
@@ -452,6 +454,10 @@ public class KeywordLinkingEngine
metadata.add(new TripleImpl(entityAnnotation,
Properties.DC_RELATION, textAnnotation));
}
+ //add the name of the ReferencedSite providing this suggestion
+ metadata.add(new TripleImpl(entityAnnotation,
+ new UriRef(RdfResourceEnum.site.getUri()),
+ new PlainLiteralImpl(referencedSiteName)));
//in case dereferencing of Entities is enabled we need also to
//add the RDF data for entities
if(dereferenceEntitiesState){
@@ -827,16 +833,16 @@ public class KeywordLinkingEngine
throw new ConfigurationException(REFERENCED_SITE_ID,
"The ID of the Referenced Site is a required Parameter and
MUST NOT be NULL!");
}
- String refSiteId = value.toString();
- if (refSiteId.isEmpty()) {
+ referencedSiteName = value.toString();
+ if (referencedSiteName.isEmpty()) {
throw new ConfigurationException(REFERENCED_SITE_ID,
"The ID of the Referenced Site is a required Parameter and
MUST NOT be an empty String!");
}
//TODO: make limit configurable!
- if(Entityhub.ENTITYHUB_IDS.contains(refSiteId.toLowerCase())){
+ if(Entityhub.ENTITYHUB_IDS.contains(referencedSiteName.toLowerCase())){
entitySearcher = new
EntityhubSearcher(context.getBundleContext(),10);
} else {
- entitySearcher = new
ReferencedSiteSearcher(context.getBundleContext(),refSiteId,10);
+ entitySearcher = new
ReferencedSiteSearcher(context.getBundleContext(),referencedSiteName,10);
}
}
/**
@@ -891,5 +897,6 @@ public class KeywordLinkingEngine
((TrackingEntitySearcher<?>)entitySearcher).close();
}
entitySearcher = null;
+ referencedSiteName = null;
}
}