Author: ogrisel
Date: Tue Jan 3 17:11:32 2012
New Revision: 1226872
URL: http://svn.apache.org/viewvc?rev=1226872&view=rev
Log:
STANBOL-197: Make it possible to programmatically define a topic classification
model
Added:
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/Batch.java
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/ClassifierException.java
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TopicClassifier.java
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TopicSuggestion.java
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TrainingSet.java
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TrainingSetException.java
Removed:
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicSuggestion.java
Modified:
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/TopicEngineTest.java
incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/test_schema.xml
incubator/stanbol/trunk/enhancer/generic/servicesapi/src/main/java/org/apache/stanbol/enhancer/servicesapi/rdf/TechnicalClasses.java
Modified:
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java?rev=1226872&r1=1226871&r2=1226872&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
(original)
+++
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
Tue Jan 3 17:11:32 2012
@@ -21,14 +21,19 @@ import static org.apache.stanbol.enhance
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
+import java.util.Collection;
import java.util.Collections;
import java.util.Dictionary;
import java.util.Iterator;
+import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
+import java.util.Set;
+import org.apache.clerezza.rdf.core.MGraph;
import org.apache.clerezza.rdf.core.Triple;
import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.clerezza.rdf.core.impl.TripleImpl;
import org.apache.commons.io.IOUtils;
import org.apache.felix.scr.annotations.Activate;
import org.apache.felix.scr.annotations.Component;
@@ -43,6 +48,7 @@ import org.apache.solr.client.solrj.Solr
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
+import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.MoreLikeThisParams;
import org.apache.stanbol.commons.solr.IndexReference;
@@ -53,6 +59,13 @@ import org.apache.stanbol.enhancer.servi
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
import org.apache.stanbol.enhancer.servicesapi.InvalidContentException;
import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
+import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
+import org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses;
+import org.apache.stanbol.enhancer.topic.ClassifierException;
+import org.apache.stanbol.enhancer.topic.TopicClassifier;
+import org.apache.stanbol.enhancer.topic.TopicSuggestion;
+import org.apache.stanbol.enhancer.topic.TrainingSet;
+import org.apache.stanbol.enhancer.topic.TrainingSetException;
import org.osgi.framework.InvalidSyntaxException;
import org.osgi.service.cm.ConfigurationException;
import org.osgi.service.component.ComponentContext;
@@ -72,17 +85,19 @@ import org.slf4j.LoggerFactory;
@Properties(value = {@Property(name = TopicClassificationEngine.ENGINE_ID),
@Property(name = TopicClassificationEngine.ORDER,
intValue = 100),
@Property(name = TopicClassificationEngine.SOLR_CORE),
- @Property(name = TopicClassificationEngine.LANGUAGE),
+ @Property(name = TopicClassificationEngine.LANGUAGES),
@Property(name =
TopicClassificationEngine.SIMILARTITY_FIELD),
@Property(name =
TopicClassificationEngine.TOPIC_URI_FIELD),
- @Property(name =
TopicClassificationEngine.MATERIALIZED_PATH_FIELD)})
-public class TopicClassificationEngine implements EnhancementEngine,
ServiceProperties {
+ @Property(name = TopicClassificationEngine.BROADER_FIELD),
+ @Property(name =
TopicClassificationEngine.MATERIALIZED_PATH_FIELD),
+ @Property(name =
TopicClassificationEngine.MODEL_UPDATE_DATE_FIELD)})
+public class TopicClassificationEngine implements EnhancementEngine,
ServiceProperties, TopicClassifier {
public static final String ENGINE_ID =
"org.apache.stanbol.enhancer.engine.id";
public static final String SOLR_CORE =
"org.apache.stanbol.enhancer.engine.topic.solrCore";
- public static final String LANGUAGE =
"org.apache.stanbol.enhancer.engine.topic.language";
+ public static final String LANGUAGES =
"org.apache.stanbol.enhancer.engine.topic.languages";
public static final String ORDER =
"org.apache.stanbol.enhancer.engine.topic.order";
@@ -90,8 +105,12 @@ public class TopicClassificationEngine i
public static final String TOPIC_URI_FIELD =
"org.apache.stanbol.enhancer.engine.topic.uriField";
+ public static final String BROADER_FIELD =
"org.apache.stanbol.enhancer.engine.topic.broaderField";
+
public static final String MATERIALIZED_PATH_FIELD =
"org.apache.stanbol.enhancer.engine.topic.materializedPathField";
+ public static final String MODEL_UPDATE_DATE_FIELD =
"org.apache.stanbol.enhancer.engine.topic.modelUpdateField";
+
private static final Logger log =
LoggerFactory.getLogger(TopicClassificationEngine.class);
protected String engineId;
@@ -111,12 +130,18 @@ public class TopicClassificationEngine i
protected String topicUriField;
+ protected String modelUpdateDateField;
+
+ protected String broaderField;
+
protected String materializedPathField;
protected ComponentContext context;
protected int numTopics = 10;
+ protected TrainingSet trainingSet;
+
@Activate
protected void activate(ComponentContext context) throws
ConfigurationException, InvalidSyntaxException {
@SuppressWarnings("unchecked")
@@ -136,7 +161,7 @@ public class TopicClassificationEngine i
engineId = getRequiredStringParam(config, ENGINE_ID);
similarityField = getRequiredStringParam(config, SIMILARTITY_FIELD);
topicUriField = getRequiredStringParam(config, TOPIC_URI_FIELD);
- acceptedLanguages = getStringListParan(config, LANGUAGE);
+ acceptedLanguages = getStringListParan(config, LANGUAGES);
if (config.get(SOLR_CORE) instanceof SolrServer) {
// Bind a fixed Solr server client instead of doing dynamic OSGi
lookup using the service tracker.
// This can be useful both for unit-testing .
@@ -159,8 +184,10 @@ public class TopicClassificationEngine i
throw new ConfigurationException(SOLR_CORE, e.getMessage(), e);
}
}
- // optional field, can be null
+ // optional fields, can be null
+ broaderField = (String) config.get(BROADER_FIELD);
materializedPathField = (String) config.get(TOPIC_URI_FIELD);
+ modelUpdateDateField = (String) config.get(MODEL_UPDATE_DATE_FIELD);
Object orderParamValue = config.get(ORDER);
if (orderParamValue != null) {
order = (Integer) orderParamValue;
@@ -218,12 +245,79 @@ public class TopicClassificationEngine i
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
String text = getTextFromContentItem(ci);
- suggestTopics(text);
+ MGraph metadata = ci.getMetadata();
+ List<TopicSuggestion> topics;
+ try {
+ topics = suggestTopics(text);
+ } catch (ClassifierException e) {
+ throw new EngineException(e);
+ }
+ for (TopicSuggestion topic : topics) {
+ UriRef enhancement =
EnhancementEngineHelper.createEntityEnhancement(ci, this);
+ metadata.add(new TripleImpl(enhancement,
+
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE,
+ TechnicalClasses.ENHANCER_TOPICANNOTATION));
+
+ // add link to entity
+ metadata.add(new TripleImpl(enhancement,
+
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_REFERENCE,
+ new UriRef(topic.uri)));
+ // TODO: make it possible to dereference and the path to the root
the entities according to a
+ // configuration parameter
+ }
+ }
- // TODO: express the results as RDF.
+    /**
+     * Select the Solr client used by this engine.
+     *
+     * @return the manually bound {@code solrServer} when one is set, otherwise the service currently
+     *         provided by the OSGi index tracker.
+     */
+    protected SolrServer getActiveSolrServer() {
+        if (solrServer != null) {
+            return solrServer;
+        }
+        return indexTracker.getService();
     }
- public List<TopicSuggestion> suggestTopics(String text) throws
EngineException {
+    /**
+     * @return a read-only, single-entry map publishing the configured {@code order} under the
+     *         {@code ENHANCEMENT_ENGINE_ORDERING} key.
+     */
+    @Override
+    public Map<String,Object> getServiceProperties() {
+        Map<String,Object> ordering = Collections.singletonMap(ENHANCEMENT_ENGINE_ORDERING, (Object) order);
+        return Collections.unmodifiableMap(ordering);
+    }
+
+    /**
+     * Collect the text to classify from a content item.
+     *
+     * @param ci
+     *            the content item to read
+     * @return the decoded stream for {@code text/plain} items, otherwise the concatenation of the objects
+     *         of all {@code NIE_PLAINTEXTCONTENT} triples attached to the item (empty if there are none)
+     * @throws InvalidContentException
+     *             if reading the content stream fails
+     */
+    protected String getTextFromContentItem(ContentItem ci) throws InvalidContentException {
+        // Refactor the following using an adapter.
+        if (ci.getMimeType().startsWith("text/plain")) {
+            try {
+                // TODO: handle explicit charsets if any and fallback to UTF-8 if missing
+                return IOUtils.toString(ci.getStream(), "UTF-8");
+            } catch (IOException e) {
+                throw new InvalidContentException(this, ci, e);
+            }
+        }
+        // not plain text: rely on text previously extracted into the metadata graph
+        StringBuilder extracted = new StringBuilder();
+        Iterator<Triple> triples = ci.getMetadata().filter(ci.getUri(), NIE_PLAINTEXTCONTENT, null);
+        while (triples.hasNext()) {
+            extracted.append(triples.next().getObject());
+        }
+        return extracted.toString();
+    }
+
+    /**
+     * Factory method: create a new engine instance and configure it from the given parameters.
+     *
+     * @param config
+     *            the configuration entries handed to {@code configure}
+     * @return the configured engine
+     * @throws ConfigurationException
+     *             if {@code configure} rejects the parameters
+     */
+    public static TopicClassificationEngine fromParameters(Dictionary<String,Object> config)
+            throws ConfigurationException {
+        TopicClassificationEngine configured = new TopicClassificationEngine();
+        configured.configure(config);
+        return configured;
+    }
+
+ // classifier API
+
+    /**
+     * @return the configured engine id, which doubles as the scheme id of this classifier.
+     */
+    @Override
+    public String getSchemeId() {
+        return this.engineId;
+    }
+
+    /**
+     * @return the list of accepted languages held by this engine (the internal list itself, not a copy).
+     */
+    @Override
+    public List<String> getAcceptedLanguages() {
+        return this.acceptedLanguages;
+    }
+
+ public List<TopicSuggestion> suggestTopics(String text) throws
ClassifierException {
List<TopicSuggestion> suggestedTopics = new
ArrayList<TopicSuggestion>(numTopics);
SolrServer solrServer = getActiveSolrServer();
SolrQuery query = new SolrQuery();
@@ -232,7 +326,7 @@ public class TopicClassificationEngine i
query.set(MoreLikeThisParams.MIN_DOC_FREQ, 1);
query.set(MoreLikeThisParams.MIN_TERM_FREQ, 1);
// TODO: find a way to parse the interesting terms and report them
- // for debugging / explanation in dedicated RDF datastucture.
+ // for debugging / explanation in dedicated RDF data structure.
// query.set(MoreLikeThisParams.INTERESTING_TERMS, "details");
query.set(MoreLikeThisParams.SIMILARITY_FIELDS, similarityField);
query.set(CommonParams.STREAM_BODY, text);
@@ -244,8 +338,8 @@ public class TopicClassificationEngine i
for (SolrDocument result : results.toArray(new SolrDocument[0])) {
String uri = (String) result.getFirstValue(topicUriField);
if (uri == null) {
- throw new EngineException(String.format("Solr Core '%s' is
missing required field '%s'.",
- solrCoreId, topicUriField));
+ throw new ClassifierException(String.format(
+ "Solr Core '%s' is missing required field '%s'.",
solrCoreId, topicUriField));
}
suggestedTopics.add(new TopicSuggestion(uri, 0.0));
}
@@ -254,50 +348,144 @@ public class TopicClassificationEngine i
String message = String.format("SolrServer with id '%s' for
topic engine '%s' lacks"
+ " configuration for the
MoreLikeThisHandler", solrCoreId,
engineId);
- throw new EngineException(message, e);
+ throw new ClassifierException(message, e);
} else {
- throw new EngineException(e);
+ throw new ClassifierException(e);
}
}
return suggestedTopics;
}
- /**
- * @return the manually bound solrServer instance or the one tracked by
the OSGi service tracker.
- */
- protected SolrServer getActiveSolrServer() {
- return solrServer != null ? solrServer : indexTracker.getService();
+    /**
+     * Query the index for the topics that reference {@code broadTopicId} in their broader field.
+     *
+     * @param broadTopicId
+     *            id of the topic whose direct children are looked up
+     * @return the ids found, in the ascending id order returned by Solr; empty when no broader field is
+     *         configured
+     * @throws ClassifierException
+     *             if the Solr query fails
+     */
+    @Override
+    public Set<String> getNarrowerTopics(String broadTopicId) throws ClassifierException {
+        LinkedHashSet<String> narrower = new LinkedHashSet<String>();
+        if (broaderField != null) {
+            SolrServer server = getActiveSolrServer();
+            SolrQuery query = new SolrQuery("*:*");
+            // use a filter query to avoid string escaping issues with special solr chars
+            String broaderFilter = "{!field f=" + broaderField + "}" + broadTopicId;
+            query.addFilterQuery(broaderFilter);
+            query.addField(topicUriField);
+            query.addSortField(topicUriField, SolrQuery.ORDER.asc);
+            try {
+                SolrDocumentList matches = server.query(query).getResults();
+                for (SolrDocument match : matches) {
+                    narrower.add(match.getFirstValue(topicUriField).toString());
+                }
+            } catch (SolrServerException e) {
+                throw new ClassifierException(String.format(
+                    "Error while fetching narrower topics of '%s' on Solr Core '%s'.", broadTopicId,
+                    solrCoreId), e);
+            }
+        }
+        return narrower;
     }
@Override
- public Map<String,Object> getServiceProperties() {
- return
Collections.unmodifiableMap(Collections.singletonMap(ENHANCEMENT_ENGINE_ORDERING,
- (Object) order));
+    public Set<String> getBroaderTopics(String id) throws ClassifierException {
+        LinkedHashSet<String> broader = new LinkedHashSet<String>();
+        if (broaderField == null) {
+            // no hierarchy information is stored in the index
+            return broader;
+        }
+        SolrServer server = getActiveSolrServer();
+        SolrQuery query = new SolrQuery("*:*");
+        // use a filter query to avoid string escaping issues with special solr chars
+        String topicFilter = "{!field f=" + topicUriField + "}" + id;
+        query.addFilterQuery(topicFilter);
+        query.addField(broaderField);
+        try {
+            // there should be only one result
+            for (SolrDocument topicDoc : server.query(query).getResults()) {
+                Collection<Object> values = topicDoc.getFieldValues(broaderField);
+                if (values != null) {
+                    for (Object broaderId : values) {
+                        broader.add(broaderId.toString());
+                    }
+                }
+            }
+        } catch (SolrServerException e) {
+            throw new ClassifierException(String.format(
+                "Error while fetching broader topics of '%s' on Solr Core '%s'.", id, solrCoreId), e);
+        }
+        return broader;
     }
- protected String getTextFromContentItem(ContentItem ci) throws
InvalidContentException {
- // Refactor the following using an adapter.
- String text = "";
- if (ci.getMimeType().startsWith("text/plain")) {
- try {
- // TODO: handle explicit charsets if any and fallback to UTF-8
if missing
- text = IOUtils.toString(ci.getStream(), "UTF-8");
- } catch (IOException e) {
- throw new InvalidContentException(this, ci, e);
- }
+ @Override
+ public Set<String> getTopicRoots() throws ClassifierException {
+ // TODO: this can be very big on flat thesauri: should we enable a
paging API instead?
+ LinkedHashSet<String> rootTopics = new LinkedHashSet<String>();
+ SolrServer solrServer = getActiveSolrServer();
+ SolrQuery query = new SolrQuery();
+ if (broaderField != null) {
+ // find any topic with an empty broaderField
+ query.setParam("q", "-" + broaderField + ":[\"\" TO *]");
} else {
- Iterator<Triple> it = ci.getMetadata().filter(ci.getUri(),
NIE_PLAINTEXTCONTENT, null);
- while (it.hasNext()) {
- text += it.next().getObject();
+ // find any topic
+ query.setQuery("*:*");
+ }
+ try {
+ for (SolrDocument result : solrServer.query(query).getResults()) {
+ rootTopics.add(result.getFirstValue(topicUriField).toString());
}
+ } catch (SolrServerException e) {
+ String msg = String.format("Error while fetching root topics on
Solr Core '%s'.", solrCoreId);
+ throw new ClassifierException(msg, e);
}
- return text;
+ return rootTopics;
}
- public static TopicClassificationEngine
fromParameters(Dictionary<String,Object> config) throws ConfigurationException {
- TopicClassificationEngine engine = new TopicClassificationEngine();
- engine.configure(config);
- return engine;
+    /**
+     * Index a document for the given topic and commit it.
+     *
+     * @param id
+     *            value stored in the topic URI field
+     * @param broaderTopics
+     *            values stored in the broader field; ignored when null or when no broader field is
+     *            configured
+     * @throws ClassifierException
+     *             if adding the document or committing fails
+     */
+    @Override
+    public void addTopic(String id, Collection<String> broaderTopics) throws ClassifierException {
+        SolrInputDocument topicDoc = new SolrInputDocument();
+        topicDoc.addField(topicUriField, id);
+        boolean storeBroader = broaderTopics != null && broaderField != null;
+        if (storeBroader) {
+            topicDoc.addField(broaderField, broaderTopics);
+        }
+        SolrServer server = getActiveSolrServer();
+        try {
+            server.add(topicDoc);
+            server.commit();
+        } catch (Exception e) {
+            throw new ClassifierException(String.format(
+                "Error adding topic with id '%s' on Solr Core '%s'", id, solrCoreId), e);
+        }
+    }
+
+    /**
+     * Delete the topic with the given id from the index and commit the change.
+     *
+     * @param id
+     *            value of the topic URI field identifying the topic to delete
+     * @throws ClassifierException
+     *             if the delete or the commit fails
+     */
+    @Override
+    public void removeTopic(String id) throws ClassifierException {
+        SolrServer solrServer = getActiveSolrServer();
+        try {
+            // The id is embedded in a Lucene query string: escape query syntax characters such as ':'
+            // so that the id is matched literally instead of being interpreted as query operators.
+            String escapedId = org.apache.solr.client.solrj.util.ClientUtils.escapeQueryChars(id);
+            solrServer.deleteByQuery(topicUriField + ":" + escapedId);
+            solrServer.commit();
+        } catch (Exception e) {
+            String msg = String.format("Error removing topic with id '%s' on Solr Core '%s'", id, solrCoreId);
+            throw new ClassifierException(msg, e);
+        }
     }
+    /**
+     * @return true when a training set has been registered on this engine.
+     */
+    @Override
+    public boolean isUpdatable() {
+        return this.trainingSet != null;
+    }
+
+    /**
+     * Replace the training set referenced by this engine.
+     *
+     * @param trainingSet
+     *            the training set to keep; stored as given
+     */
+    @Override
+    public void setTrainingSet(TrainingSet trainingSet) {
+        this.trainingSet = trainingSet;
+    }
+
+    /**
+     * Placeholder for the model update: currently only runs the training set check.
+     *
+     * @throws TrainingSetException
+     *             when raised by {@code checkTrainingSet}
+     */
+    @Override
+    public void updateModel() throws TrainingSetException {
+        checkTrainingSet();
+        // TODO:
+        // perform a first query to iterate over all the registered topics sorted by id (to allow for paging)
+        // for each topic find the last update date of the union of the topic and its narrower topics
+    }
+
+    /**
+     * Guard used by operations that need a training set.
+     *
+     * @throws TrainingSetException
+     *             if no training set is registered on this engine
+     */
+    protected void checkTrainingSet() throws TrainingSetException {
+        // fail only when the training set is missing; a registered training set must pass the check
+        if (trainingSet == null) {
+            throw new TrainingSetException(String.format(
+                "TopicClassificationEngine %s has no registered training set hence cannot be updated.",
+                engineId));
+        }
+    }
}
Added:
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/Batch.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/Batch.java?rev=1226872&view=auto
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/Batch.java
(added)
+++
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/Batch.java
Tue Jan 3 17:11:32 2012
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.topic;
+
+import java.io.Serializable;
+import java.util.List;
+
+/**
+ * Data transfer object carrying one batch of a larger query result, along with the markers needed to
+ * request the following batch.
+ *
+ * @param <T>
+ *            the type of the items to batch over.
+ */
+public class Batch<T> implements Serializable {
+
+    private static final long serialVersionUID = 1L;
+
+    /**
+     * The items of this batch, referenced as passed to the constructor.
+     */
+    public final List<T> items;
+
+    /**
+     * Boolean marker set by the backend to tell the caller that it can expect new items by issuing the same
+     * query again with the companion offset marker.
+     */
+    public final boolean hasMore;
+
+    /**
+     * Marker value that the caller can pass to the dataset to fetch the next batch and perform efficient
+     * server side batching.
+     *
+     * This value should refer to an indexed field with unique values such as a primary key or a random uuid
+     * (good for shuffling the examples in arbitrary order). The samples returned in the batches should be
+     * sorted according to this field so that the server can perform efficient range queries that are
+     * guaranteed to return no duplicate results across batches.
+     */
+    public final Object nextOffset;
+
+    public Batch(List<T> items, boolean hasMore, Object nextOffset) {
+        this.nextOffset = nextOffset;
+        this.hasMore = hasMore;
+        this.items = items;
+    }
+}
Added:
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/ClassifierException.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/ClassifierException.java?rev=1226872&view=auto
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/ClassifierException.java
(added)
+++
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/ClassifierException.java
Tue Jan 3 17:11:32 2012
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.topic;
+
+
+/**
+ * Checked exception signalling a failure while classifying a document or while updating the statistical
+ * model of a classifier.
+ */
+public class ClassifierException extends Exception {
+
+    private static final long serialVersionUID = 1L;
+
+    /**
+     * @param cause
+     *            the underlying failure
+     */
+    public ClassifierException(Throwable cause) {
+        super(cause);
+    }
+
+    /**
+     * @param message
+     *            description of the failure
+     */
+    public ClassifierException(String message) {
+        super(message);
+    }
+
+    /**
+     * @param message
+     *            description of the failure
+     * @param cause
+     *            the underlying failure
+     */
+    public ClassifierException(String message, Throwable cause) {
+        super(message, cause);
+    }
+
+}
Added:
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TopicClassifier.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TopicClassifier.java?rev=1226872&view=auto
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TopicClassifier.java
(added)
+++
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TopicClassifier.java
Tue Jan 3 17:11:32 2012
@@ -0,0 +1,109 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.topic;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.stanbol.enhancer.servicesapi.EngineException;
+
+/**
+ * Service interface for suggesting hierarchical topics from a specific scheme (a.k.a. taxonomy, thesaurus or
+ * topics hierarchy) from the text content of a document or part of a document.
+ */
+public interface TopicClassifier {
+
+    /**
+     * @return the short id identifying this classifier / scheme: can be used as URL path component to publish
+     *         the service.
+     */
+    String getSchemeId();
+
+    /**
+     * @return list of language codes for text that can be automatically classified by the service.
+     */
+    List<String> getAcceptedLanguages();
+
+    /**
+     * Perform automated text categorization based on statistical occurrences of words in the given text.
+     *
+     * @param text
+     *            the text content to analyze
+     * @return the most likely topics related to the text
+     * @throws ClassifierException
+     *             if the text could not be classified
+     */
+    List<TopicSuggestion> suggestTopics(String text) throws ClassifierException;
+
+    /**
+     * @param broadTopicId
+     *            id of the topic to look up
+     * @return the set of ids of topics directly narrower than {@code broadTopicId}
+     */
+    Set<String> getNarrowerTopics(String broadTopicId) throws ClassifierException;
+
+    /**
+     * @param id
+     *            id of the topic to look up
+     * @return the set of ids of topics directly broader than {@code id}
+     */
+    Set<String> getBroaderTopics(String id) throws ClassifierException;
+
+    /**
+     * @return the set of ids of topics without broader topics.
+     */
+    Set<String> getTopicRoots() throws ClassifierException;
+
+    /**
+     * @return true if the classifier model can be updated with the {@code addTopic} / {@code removeTopic} /
+     *         {@code updateModel} methods.
+     */
+    boolean isUpdatable();
+
+    /**
+     * Register a topic and set its ancestors in the taxonomy. Warning: re-adding an already existing topic
+     * can delete the underlying statistical model. Calling {@code updateModel} is necessary to rebuild the
+     * statistical model based on the hierarchical structure of the topics and the registered training set.
+     *
+     * @param id
+     *            the new topic id
+     * @param broaderTopics
+     *            list of directly broader topics in the thesaurus
+     */
+    void addTopic(String id, Collection<String> broaderTopics) throws ClassifierException;
+
+    /**
+     * Remove a topic from the thesaurus. WARNING: it is the caller's responsibility to recursively remove or
+     * update any narrower topic that might hold a reference on this topic. Once the tree is updated,
+     * {@code updateModel} should be called to re-align the statistical model to match the new hierarchy by
+     * drawing examples from the dataset.
+     *
+     * @param id
+     *            id of the topic to remove from the model
+     */
+    void removeTopic(String id) throws ClassifierException;
+
+    /**
+     * Register a training set to use to build the statistical model of the classifier.
+     */
+    void setTrainingSet(TrainingSet trainingSet);
+
+    /**
+     * Incrementally update the statistical model of the classifier. Note: depending on the size of the
+     * dataset and the number of topics to update, this process can take a long time and should probably be
+     * wrapped in a dedicated thread if called by the user interface layer.
+     */
+    void updateModel() throws TrainingSetException;
+}
Added:
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TopicSuggestion.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TopicSuggestion.java?rev=1226872&view=auto
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TopicSuggestion.java
(added)
+++
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TopicSuggestion.java
Tue Jan 3 17:11:32 2012
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.topic;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Value holder for a single topic classification result: the topic URI, optional paths and a score.
+ */
+public class TopicSuggestion {
+
+    public final String uri;
+
+    public final List<String> paths;
+
+    public final double score;
+
+    /**
+     * @param uri
+     *            the URI of the suggested topic
+     * @param paths
+     *            paths copied into a new list owned by this instance; null yields an empty list
+     * @param score
+     *            the score of the suggestion
+     */
+    public TopicSuggestion(String uri, List<String> paths, double score) {
+        this.uri = uri;
+        this.paths = new ArrayList<String>();
+        if (paths != null) {
+            this.paths.addAll(paths);
+        }
+        this.score = score;
+    }
+
+    /**
+     * Create a suggestion without any path.
+     */
+    public TopicSuggestion(String uri, double score) {
+        this(uri, null, score);
+    }
+
+}
Added:
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TrainingSet.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TrainingSet.java?rev=1226872&view=auto
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TrainingSet.java
(added)
+++
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TrainingSet.java
Tue Jan 3 17:11:32 2012
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.topic;
+
+import java.util.Calendar;
+import java.util.List;
+import java.util.Set;
+
+/**
+ * Source of categorized text documents that can be used to build the
+ * statistical model of a TopicClassifier.
+ */
+public interface TrainingSet {
+
+ /**
+ * @return true if the training set can be updated using the {@code registerExample} API. If false that
+ * means that the component is a view on a remote datasource that has its own API for updates
+ * (e.g. the document repository of a CMS).
+ */
+ boolean isUpdatable() throws TrainingSetException;
+
+ /**
+ * Register some text content to be used as an example of a document that should be positively
+ * classified in the given topics by the model.
+ *
+ * @param exampleId
+ * Unique identifier of the example to create or override. If null, a new example with a
+ * generated id will be created.
+ * @param text
+ * Text content of the example. If null the example with the matching id will be deleted.
+ *
+ * @param topics
+ * The list of all the topics the example should be classified as.
+ * @return the id of the registered example (can be automatically generated)
+ */
+ String registerExample(String exampleId, String text, List<String> topics)
throws TrainingSetException;
+
+ /**
+ * @param lastModificationDate
+ * typically the date of the last classifier model update or null to find the list of all
+ * topics registered in the dataset.
+ * @return the set of topic ids that received some modifications (e.g. new or updated examples) since
+ * {@code lastModificationDate}.
+ */
+ Set<String> getUpdatedTopics(Calendar lastModificationDate) throws
TrainingSetException;
+
+ /**
+ * Fetch examples representative of the set of topics passed as argument so as to be able to build a
+ * statistical model.
+ *
+ * @param topics
+ * list of admissible topics to search examples for: each example in the batch will be
+ * classified in at least one of the requested topics. This list would typically comprise a
+ * topic along with its direct narrower descendants (and maybe level 2 descendants too).
+ * @param offset
+ * marker value to fetch the next batch. Pass null to fetch the first batch.
+ * @return a batch of examples suitable for training a classifier model for the requested topics.
+ */
+ Batch<String> getPositiveExamples(List<String> topics, Object offset)
throws TrainingSetException;
+
+ /**
+ * Fetch examples representative of any document not specifically classified in one of the passed topics.
+ * This can be useful to train a statistical model for a classifier of those topics to negatively weight
+ * generic features (term occurrences) and limit the number of false positives in the classification. It
+ * is up to the classifier model to decide to use such negative examples or not at training time.
+ *
+ * @param topics
+ * list of non-admissible topics to search examples for: each example in the batch must not be
+ * classified in any of the passed topics.
+ * @param offset
+ * marker value to fetch the next batch. Pass null to fetch the first batch.
+ * @return a batch of examples suitable for training (negative-refinement) a classifier model for the
+ * requested topics.
+ */
+ Batch<String> getNegativeExamples(List<String> topics, Object offset)
throws TrainingSetException;
+
+ /**
+ * Number of examples to fetch at once.
+ *
+ * NOTE(review): this setter declares no parameter, so a caller has no way to pass the desired
+ * number of examples. The signature was presumably meant to be {@code setBatchSize(int batchSize)};
+ * confirm and fix it together with the implementations of this interface.
+ */
+ public void setBatchSize();
+
+}
Added:
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TrainingSetException.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TrainingSetException.java?rev=1226872&view=auto
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TrainingSetException.java
(added)
+++
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TrainingSetException.java
Tue Jan 3 17:11:32 2012
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.topic;
+
+import java.io.IOException;
+
+/**
+ * Unexpected error while performing read or write access to a topic
+ * classifier training set.
+ */
+public class TrainingSetException extends IOException {
+
+    private static final long serialVersionUID = 1L;
+
+    /**
+     * @param message
+     *            description of the failed training set access
+     */
+    public TrainingSetException(String message) {
+        super(message);
+    }
+
+    /**
+     * Variant that keeps the underlying failure attached, so that wrapping a
+     * lower level error does not lose its stack trace.
+     *
+     * @param message
+     *            description of the failed training set access
+     * @param cause
+     *            the error that triggered this exception; may be null
+     */
+    public TrainingSetException(String message, Throwable cause) {
+        super(message, cause);
+    }
+
+}
Modified:
incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/TopicEngineTest.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/TopicEngineTest.java?rev=1226872&r1=1226871&r2=1226872&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/TopicEngineTest.java
(original)
+++
incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/TopicEngineTest.java
Tue Jan 3 17:11:32 2012
@@ -42,6 +42,7 @@ import org.apache.solr.client.solrj.resp
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.core.CoreContainer;
import org.apache.stanbol.commons.solr.utils.StreamQueryRequest;
+import org.apache.stanbol.enhancer.topic.TopicSuggestion;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
@@ -128,6 +129,7 @@ public class TopicEngineTest {
config.put(TopicClassificationEngine.SOLR_CORE, solrServer);
config.put(TopicClassificationEngine.TOPIC_URI_FIELD, "topic");
config.put(TopicClassificationEngine.SIMILARTITY_FIELD, "text");
+ config.put(TopicClassificationEngine.BROADER_FIELD, "broader");
return config;
}
@@ -162,13 +164,53 @@ public class TopicEngineTest {
// check accept language optional param
Hashtable<String,Object> configWithAcceptLangage = new
Hashtable<String,Object>();
configWithAcceptLangage.putAll(config);
- configWithAcceptLangage.put(TopicClassificationEngine.LANGUAGE, "en,
fr");
+ configWithAcceptLangage.put(TopicClassificationEngine.LANGUAGES, "en,
fr");
engine =
TopicClassificationEngine.fromParameters(configWithAcceptLangage);
assertNotNull(engine);
assertEquals(engine.acceptedLanguages, Arrays.asList("en", "fr"));
}
@Test
+    public void testProgrammaticThesaurusConstruction() throws Exception {
+        // URIs of the six topics that make up the taxonomy built by this test.
+        final String root1 = "http://example.com/topics/root1";
+        final String root2 = "http://example.com/topics/root2";
+        final String root3 = "http://example.com/topics/root3";
+        final String node1 = "http://example.com/topics/node1";
+        final String node2 = "http://example.com/topics/node2";
+        final String node3 = "http://example.com/topics/node3";
+        final String[] roots = {root1, root2, root3};
+
+        TopicClassificationEngine engine = TopicClassificationEngine.fromParameters(getDefaultConfigParams());
+
+        // Top level topics are declared without broader topics, using either
+        // null or an empty list.
+        engine.addTopic(root1, null);
+        engine.addTopic(root2, null);
+        engine.addTopic(root3, new ArrayList<String>());
+        for (String root : roots) {
+            assertEquals(0, engine.getBroaderTopics(root).size());
+        }
+        assertEquals(3, engine.getTopicRoots().size());
+
+        // Deeper topics: node1 has two roots as broader topics, node2 has one
+        // and node3 is attached below both node1 and node2.
+        engine.addTopic(node1, Arrays.asList(root1, root2));
+        engine.addTopic(node2, Arrays.asList(root3));
+        engine.addTopic(node3, Arrays.asList(node1, node2));
+
+        // The roots were not impacted by the registration of narrower topics.
+        for (String root : roots) {
+            assertEquals(0, engine.getBroaderTopics(root).size());
+        }
+        assertEquals(3, engine.getTopicRoots().size());
+
+        // Each non root topic reports as many broader topics as it was
+        // registered with.
+        assertEquals(2, engine.getBroaderTopics(node1).size());
+        assertEquals(1, engine.getBroaderTopics(node2).size());
+        assertEquals(2, engine.getBroaderTopics(node3).size());
+
+        // Narrower relationships induced by the broader declarations above:
+        // one narrower topic for everything except the leaf node3.
+        for (String parent : new String[] {root1, root2, root3, node1, node2}) {
+            assertEquals(1, engine.getNarrowerTopics(parent).size());
+        }
+        assertEquals(0, engine.getNarrowerTopics(node3).size());
+    }
+
+ @Test
public void testEmptyIndexTopicClassification() throws Exception {
TopicClassificationEngine engine =
TopicClassificationEngine.fromParameters(getDefaultConfigParams());
List<TopicSuggestion> suggestedTopics = engine.suggestTopics("This is
a test.");
Modified:
incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/test_schema.xml
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/test_schema.xml?rev=1226872&r1=1226871&r2=1226872&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/test_schema.xml
(original)
+++
incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/test_schema.xml
Tue Jan 3 17:11:32 2012
@@ -59,7 +59,8 @@
<fields>
<field name="topic" type="string" indexed="true" stored="true"
required="true" />
<field name="type" type="string" indexed="true" stored="true"
multiValued="true" />
- <field name="paths" type="string" indexed="true" stored="true"
multiValued="true" />
+ <field name="paths" type="string" indexed="true" stored="true"
multiValued="true" />
+ <field name="broader" type="string" indexed="true" stored="true"
multiValued="true" />
<field name="text" type="text" indexed="true" stored="false"
termVectors="true" termPositions="false" termOffsets="false" />
<field name="popularity" type="int" indexed="true" stored="true" />
Modified:
incubator/stanbol/trunk/enhancer/generic/servicesapi/src/main/java/org/apache/stanbol/enhancer/servicesapi/rdf/TechnicalClasses.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/generic/servicesapi/src/main/java/org/apache/stanbol/enhancer/servicesapi/rdf/TechnicalClasses.java?rev=1226872&r1=1226871&r2=1226872&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/generic/servicesapi/src/main/java/org/apache/stanbol/enhancer/servicesapi/rdf/TechnicalClasses.java
(original)
+++
incubator/stanbol/trunk/enhancer/generic/servicesapi/src/main/java/org/apache/stanbol/enhancer/servicesapi/rdf/TechnicalClasses.java
Tue Jan 3 17:11:32 2012
@@ -45,6 +45,20 @@ public class TechnicalClasses {
*/
public static final UriRef ENHANCER_ENTITYANNOTATION = new UriRef(
NamespaceEnum.enhancer+"EntityAnnotation");
+
+ /**
+ * Type used for topic annotations on documents. This type is intended
+ * to be used in combination with ENHANCER_ENHANCEMENT and
+ * ENHANCER_ENTITYANNOTATION as a complementary marker to suggest
+ * that the referenced entity or concept is one of the primary topics
+ * of the whole document or of a specific section specified by a linked
+ * TextAnnotation.
+ *
+ * The entity or concept is not necessarily explicitly mentioned
+ * in the document (as a traditional entity occurrence would be).
+ */
+ public static final UriRef ENHANCER_TOPICANNOTATION = new UriRef(
+ NamespaceEnum.enhancer+"TopicAnnotation");
/**
* To be used as a type for any semantic knowledge extraction