Author: ogrisel
Date: Mon Jan 9 14:28:33 2012
New Revision: 1229168
URL: http://svn.apache.org/viewvc?rev=1229168&view=rev
Log:
STANBOL-197: minimalist implementation of the model training algorithm
Modified:
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/Batch.java
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/SolrTrainingSet.java
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TopicClassifier.java
incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/TopicEngineTest.java
Modified:
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java?rev=1229168&r1=1229167&r2=1229168&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
(original)
+++
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
Mon Jan 9 14:28:33 2012
@@ -22,6 +22,7 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
+import java.util.Date;
import java.util.Dictionary;
import java.util.Iterator;
import java.util.LinkedHashSet;
@@ -58,6 +59,7 @@ import org.apache.stanbol.enhancer.servi
import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
import org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses;
+import org.apache.stanbol.enhancer.topic.Batch;
import org.apache.stanbol.enhancer.topic.ClassifierException;
import org.apache.stanbol.enhancer.topic.ConfiguredSolrCoreTracker;
import org.apache.stanbol.enhancer.topic.TopicClassifier;
@@ -112,6 +114,12 @@ public class TopicClassificationEngine e
private static final Logger log =
LoggerFactory.getLogger(TopicClassificationEngine.class);
+ // TODO: make the following bounds configurable
+
+ public int MAX_CHARS_PER_TOPIC = 100000;
+
+ public Integer MAX_ROOTS = 1000;
+
protected String engineId;
protected List<String> acceptedLanguages;
@@ -156,7 +164,7 @@ public class TopicClassificationEngine e
// optional fields, can be null
broaderField = (String) config.get(BROADER_FIELD);
- materializedPathField = (String) config.get(TOPIC_URI_FIELD);
+ materializedPathField = (String) config.get(MATERIALIZED_PATH_FIELD);
modelUpdateDateField = (String) config.get(MODEL_UPDATE_DATE_FIELD);
Object orderParamValue = config.get(ORDER);
if (orderParamValue != null) {
@@ -340,10 +348,11 @@ public class TopicClassificationEngine e
@Override
public Set<String> getTopicRoots() throws ClassifierException {
- // TODO: this can be very big on flat thesauri: should we enable a
paging API instead?
LinkedHashSet<String> rootTopics = new LinkedHashSet<String>();
SolrServer solrServer = getActiveSolrServer();
SolrQuery query = new SolrQuery();
+ // TODO: this can be very big on flat thesauri: should we enable a
paging API instead?
+ query.setRows(MAX_ROOTS);
if (broaderField != null) {
// find any topic with an empty broaderField
query.setParam("q", "-" + broaderField + ":[\"\" TO *]");
@@ -352,7 +361,12 @@ public class TopicClassificationEngine e
query.setQuery("*:*");
}
try {
- for (SolrDocument result : solrServer.query(query).getResults()) {
+ QueryResponse response = solrServer.query(query);
+ if (response.getResults().size() >= MAX_ROOTS) {
+ log.warn(String.format("TopicClassifier '%s' has more than %d
registered topic roots."
+ + " Some roots might be ignored.",
engineId, MAX_ROOTS));
+ }
+ for (SolrDocument result : response.getResults()) {
rootTopics.add(result.getFirstValue(topicUriField).toString());
}
} catch (SolrServerException e) {
@@ -402,16 +416,99 @@ public class TopicClassificationEngine e
}
@Override
- public int updateModel(boolean incremental) throws TrainingSetException {
+ public int updateModel(boolean incremental) throws TrainingSetException,
ClassifierException {
checkTrainingSet();
- // TODO:
- // perform a first query to iterate over all the registered topics
sorted by id (to allow for paging)
- // for each topic find the last update date of the union of the topic
and it's narrower topic
- return 0;
+ if (incremental && modelUpdateDateField == null) {
+ log.warn(MODEL_UPDATE_DATE_FIELD + " field is not configured:
switching to batch update mode.");
+ incremental = false;
+ }
+ // TODO: implement incremental update by using the date information
+ int updatedTopics = 0;
+ SolrServer solrServer = getActiveSolrServer();
+ SolrQuery query = new SolrQuery();
+ String q = "*:*";
+ query.setFields(topicUriField, broaderField);
+ String offset = null;
+ boolean done = false;
+ int batchSize = 1000;
+ query.addSortField(topicUriField, SolrQuery.ORDER.asc);
+ query.setRows(batchSize + 1);
+ while (!done) {
+ // batch over all the indexed topics
+ try {
+ if (offset != null) {
+ q += " AND " + topicUriField + ":[" + offset.toString() +
" TO *]";
+ }
+ query.setQuery(q);
+ QueryResponse response = solrServer.query(query);
+ int count = 0;
+ for (SolrDocument result : response.getResults()) {
+ String topicId =
result.getFirstValue(topicUriField).toString();
+ if (count == batchSize) {
+ offset = topicId;
+ } else {
+ count++;
+ updateTopic(topicId,
result.getFieldValues(broaderField));
+ updatedTopics++;
+ }
+ }
+ if (count < batchSize) {
+ done = true;
+ }
+ } catch (SolrServerException e) {
+ String msg = String.format("Error while updating topics on
Solr Core '%s'.", solrCoreId);
+ throw new TrainingSetException(msg, e);
+ }
+ }
+ return updatedTopics;
+ }
+
+ /**
+ * @param topicId
+ * @throws TrainingSetException
+ * @throws ClassifierException
+ */
+ public void updateTopic(String topicId, Collection<Object>
broaderTopicIds) throws TrainingSetException,
+
ClassifierException {
+ ArrayList<String> impactedTopics = new ArrayList<String>();
+ impactedTopics.add(topicId);
+ impactedTopics.addAll(getNarrowerTopics(topicId));
+ Batch<String> examples = Batch.emtpyBatch(String.class);
+ StringBuffer sb = new StringBuffer();
+ do {
+ examples = trainingSet.getPositiveExamples(impactedTopics,
examples.nextOffset);
+ for (String example : examples.items) {
+ sb.append(example);
+ sb.append("\n\n");
+ }
+ } while (sb.length() < MAX_CHARS_PER_TOPIC && examples.hasMore);
+
+ // reindex the topic with the new text data collected from the examples
+ SolrInputDocument doc = new SolrInputDocument();
+ doc.addField(topicUriField, topicId);
+ if (broaderTopicIds != null && broaderField != null) {
+ doc.addField(broaderField, broaderTopicIds);
+ }
+ if (sb.length() > 0) {
+ doc.addField(similarityField, sb);
+ }
+ if (modelUpdateDateField != null) {
+ // TODO: force UTC timezone here
+ doc.addField(modelUpdateDateField, new Date());
+ }
+ SolrServer solrServer = getActiveSolrServer();
+ try {
+ solrServer.add(doc);
+ solrServer.commit();
+ } catch (Exception e) {
+ String msg = String.format("Error updating topic with id '%s' on
Solr Core '%s'", topicId,
+ solrCoreId);
+ throw new ClassifierException(msg, e);
+ }
}
protected void checkTrainingSet() throws TrainingSetException {
- if (trainingSet != null) {
+ if (trainingSet == null) {
throw new TrainingSetException(
String.format("TopicClassificationEngine %s has no
registered"
+ " training set hence cannot be updated.",
engineId));
Modified:
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/Batch.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/Batch.java?rev=1229168&r1=1229167&r2=1229168&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/Batch.java
(original)
+++
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/Batch.java
Mon Jan 9 14:28:33 2012
@@ -17,6 +17,7 @@
package org.apache.stanbol.enhancer.topic;
import java.io.Serializable;
+import java.util.ArrayList;
import java.util.List;
/**
@@ -53,4 +54,11 @@ public class Batch<T> implements Seriali
this.hasMore = hasMore;
this.nextOffset = nextOffset;
}
+
+ /**
+ * Helper method to return a first empty batch to bootstrap an iteration
loop.
+ */
+ public static <T2> Batch<T2> emtpyBatch(Class<T2> clazz) {
+ return new Batch<T2>(new ArrayList<T2>(), true, null);
+ }
}
Modified:
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/SolrTrainingSet.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/SolrTrainingSet.java?rev=1229168&r1=1229167&r2=1229168&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/SolrTrainingSet.java
(original)
+++
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/SolrTrainingSet.java
Mon Jan 9 14:28:33 2012
@@ -193,8 +193,8 @@ public class SolrTrainingSet extends Con
String offset = null;
boolean done = false;
query.addSortField(exampleIdField, SolrQuery.ORDER.asc);
- query.set("rows", batchSize + 1);
- query.set("fl", exampleIdField + "," + topicUrisField);
+ query.setRows(batchSize + 1);
+ query.setFields(exampleIdField, topicUrisField);
while (!done) {
try {
if (offset != null) {
Modified:
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TopicClassifier.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TopicClassifier.java?rev=1229168&r1=1229167&r2=1229168&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TopicClassifier.java
(original)
+++
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TopicClassifier.java
Mon Jan 9 14:28:33 2012
@@ -107,5 +107,5 @@ public interface TopicClassifier {
*
* @return the number of updated topics
*/
- int updateModel(boolean incremental) throws TrainingSetException;
+ int updateModel(boolean incremental) throws TrainingSetException,
ClassifierException;
}
Modified:
incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/TopicEngineTest.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/TopicEngineTest.java?rev=1229168&r1=1229167&r2=1229168&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/TopicEngineTest.java
(original)
+++
incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/TopicEngineTest.java
Mon Jan 9 14:28:33 2012
@@ -18,6 +18,7 @@ package org.apache.stanbol.enhancer.engi
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
import java.io.File;
@@ -202,8 +203,8 @@ public class TopicEngineTest extends Bas
assertEquals(bestSuggestion.uri, "Category:American_films");
}
- //@Test
- public void testTrainClassifierFromExamples() throws Exception {
+ @Test
+ public void testBatchTrainClassifierFromExamples() throws Exception {
// mini taxonomy for news articles
String business = "urn:topics/business";
@@ -212,25 +213,28 @@ public class TopicEngineTest extends Bas
String sport = "urn:topics/sport";
String football = "urn:topics/football";
String wordcup = "urn:topics/wordcup";
+ String music = "urn:topics/music";
classifier.addTopic(business, null);
classifier.addTopic(technology, null);
classifier.addTopic(sport, null);
+ classifier.addTopic(music, null);
classifier.addTopic(apple, Arrays.asList(business, technology));
classifier.addTopic(football, Arrays.asList(sport));
classifier.addTopic(wordcup, Arrays.asList(football));
// train the classifier on an empty dataset
classifier.setTrainingSet(trainingSet);
- assertEquals(6, classifier.updateModel(true));
+ assertEquals(7, classifier.updateModel(false));
// the model is updated but does not predict anything
List<TopicSuggestion> suggestions = classifier
.suggestTopics("I like the sound of vuvuzula in the morning!");
assertEquals(0, suggestions.size());
- // further update of the model leave do not change any topic
- assertEquals(0, classifier.updateModel(true));
+ // further updates of the model do not change any topic, but the topics
are re-indexed anyway because
+ // incremental update is disabled.
+ assertEquals(7, classifier.updateModel(false));
// lets register some examples
trainingSet.registerExample(null, "Money, money, money is the root of
all evil.",
@@ -243,15 +247,16 @@ public class TopicEngineTest extends Bas
Arrays.asList(football));
trainingSet.registerExample(null, "Vuvuzela made the soundtrack of the"
+ " football wordcup of 2010 in
South Africa.",
- Arrays.asList(football, wordcup));
+ Arrays.asList(football, wordcup, music));
- // retrain the model: all 6 topics are impacted by the new examples
- assertEquals(6, classifier.updateModel(true));
+ // retrain the model: all topics are recomputed
+ assertEquals(7, classifier.updateModel(false));
suggestions = classifier.suggestTopics("I like the sound of vuvuzula
in the morning!");
- assertEquals(3, suggestions.size());
- assertEquals(wordcup, suggestions.get(0).uri);
- assertEquals(football, suggestions.get(1).uri);
- assertEquals(sport, suggestions.get(2).uri);
+ assertTrue(suggestions.size() >= 4);
+ assertEquals(music, suggestions.get(0).uri);
+ assertEquals(wordcup, suggestions.get(1).uri);
+ assertEquals(football, suggestions.get(2).uri);
+ assertEquals(sport, suggestions.get(3).uri);
}
protected Hashtable<String,Object> getDefaultClassifierConfigParams() {