Author: ogrisel
Date: Wed Mar 28 12:45:25 2012
New Revision: 1306292
URL: http://svn.apache.org/viewvc?rev=1306292&view=rev
Log:
STANBOL-197: add entity labels from the entityhub
Modified:
incubator/stanbol/trunk/enhancer/engines/topic/pom.xml
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
Modified: incubator/stanbol/trunk/enhancer/engines/topic/pom.xml
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/pom.xml?rev=1306292&r1=1306291&r2=1306292&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/topic/pom.xml (original)
+++ incubator/stanbol/trunk/enhancer/engines/topic/pom.xml Wed Mar 28 12:45:25 2012
@@ -245,6 +245,10 @@
<!-- Normal build dependencies -->
<dependency>
<groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.entityhub.servicesapi</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.stanbol</groupId>
<artifactId>org.apache.stanbol.enhancer.servicesapi</artifactId>
</dependency>
<dependency>
Modified:
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java?rev=1306292&r1=1306291&r2=1306292&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java Wed Mar 28 12:45:25 2012
@@ -35,10 +35,12 @@ import java.util.Set;
import java.util.UUID;
import org.apache.clerezza.rdf.core.Graph;
+import org.apache.clerezza.rdf.core.LiteralFactory;
import org.apache.clerezza.rdf.core.MGraph;
import org.apache.clerezza.rdf.core.Resource;
import org.apache.clerezza.rdf.core.Triple;
import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
import org.apache.clerezza.rdf.core.impl.TripleImpl;
import org.apache.clerezza.rdf.utils.GraphNode;
import org.apache.commons.io.FileUtils;
@@ -78,6 +80,7 @@ import org.apache.stanbol.enhancer.servi
import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
+import org.apache.stanbol.enhancer.servicesapi.rdf.NamespaceEnum;
import org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses;
import org.apache.stanbol.enhancer.topic.Batch;
import org.apache.stanbol.enhancer.topic.BatchProcessor;
@@ -92,6 +95,12 @@ import org.apache.stanbol.enhancer.topic
import org.apache.stanbol.enhancer.topic.training.SolrTrainingSet;
import org.apache.stanbol.enhancer.topic.training.TrainingSet;
import org.apache.stanbol.enhancer.topic.training.TrainingSetException;
+import org.apache.stanbol.entityhub.servicesapi.Entityhub;
+import org.apache.stanbol.entityhub.servicesapi.EntityhubException;
+import org.apache.stanbol.entityhub.servicesapi.model.Entity;
+import org.apache.stanbol.entityhub.servicesapi.model.Representation;
+import org.apache.stanbol.entityhub.servicesapi.model.Text;
+import org.apache.stanbol.entityhub.servicesapi.site.ReferencedSiteManager;
import org.osgi.framework.BundleContext;
import org.osgi.framework.Constants;
import org.osgi.framework.InvalidSyntaxException;
@@ -203,14 +212,21 @@ public class TopicClassificationEngine e
/**
* The "text/plain" mime type
*/
- protected static final String PLAIN_TEXT_MIMETYPE = "text/plain";
+ public static final String PLAIN_TEXT_MIMETYPE = "text/plain";
+
/**
* Contains the only supported mime type {@link #PLAIN_TEXT_MIMETYPE}
*/
- protected static final Set<String> SUPPORTED_MIMETYPES =
Collections.singleton(PLAIN_TEXT_MIMETYPE);
+ public static final Set<String> SUPPORTED_MIMETYPES =
Collections.singleton(PLAIN_TEXT_MIMETYPE);
public static final String SOLR_NON_EMPTY_FIELD = "[\"\" TO *]";
+ @Reference
+ protected Entityhub entityhub;
+
+ @Reference
+ protected ReferencedSiteManager referencedSiteManager;
+
// TODO: make the following fields configurable
private int MAX_COLLECTED_EXAMPLES = 1000;
@@ -390,6 +406,11 @@ public class TopicClassificationEngine e
} catch (ClassifierException e) {
throw new EngineException(e);
}
+ UriRef precision = new UriRef(NamespaceEnum.fise +
"classifier/precision");
+ UriRef recall = new UriRef(NamespaceEnum.fise + "classifier/recall");
+ UriRef f1 = new UriRef(NamespaceEnum.fise + "classifier/f1");
+
+ LiteralFactory lf = LiteralFactory.getInstance();
ci.getLock().writeLock().lock();
try {
for (TopicSuggestion topic : topics) {
@@ -402,9 +423,46 @@ public class TopicClassificationEngine e
metadata.add(new TripleImpl(enhancement,
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_REFERENCE,
new UriRef(topic.conceptUri)));
- // TODO: make it possible to dereference and the path to the
root the entities according to a
- // configuration parameter
+
+ // add confidence information
+ metadata.add(new TripleImpl(enhancement,
+
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_CONFIDENCE, lf
+
.createTypedLiteral(Double.valueOf(topic.score))));
+
+ ClassificationReport perf =
getPerformanceEstimates(topic.conceptUri);
+ if (perf.uptodate) {
+ metadata.add(new TripleImpl(enhancement, precision,
lf.createTypedLiteral(Double
+ .valueOf(perf.precision))));
+ metadata.add(new TripleImpl(enhancement, recall,
lf.createTypedLiteral(Double
+ .valueOf(perf.recall))));
+ metadata.add(new TripleImpl(enhancement, f1,
lf.createTypedLiteral(Double
+ .valueOf(perf.f1))));
+ }
+ Entity entity = entityhub.getEntity(topic.conceptUri);
+ if (entity == null) {
+ entity = referencedSiteManager.getEntity(topic.conceptUri);
+ }
+ if (entity != null) {
+ Representation representation = entity.getRepresentation();
+ // TODO: extract all languages based on some configuration instead of hardcoding English
+ Text label = representation.getFirst(NamespaceEnum.skos +
"prefLabel", "en", "en-US",
+ "en-GB");
+ if (label == null) {
+ label = representation.getFirst(NamespaceEnum.rdfs +
"label", "en", "en-US", "en-GB");
+ }
+ if (label != null) {
+ metadata.add(new TripleImpl(enhancement,
+
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_LABEL,
+ new PlainLiteralImpl(label.getText())));
+ }
+ }
}
+ } catch (ClassifierException e) {
+ throw new EngineException(e);
+ } catch (IllegalArgumentException e) {
+ throw new EngineException(e);
+ } catch (EntityhubException e) {
+ throw new EngineException(e);
} finally {
ci.getLock().writeLock().unlock();
}