Author: ogrisel
Date: Tue Jan 10 16:33:20 2012
New Revision: 1229628
URL: http://svn.apache.org/viewvc?rev=1229628&view=rev
Log:
STANBOL-197: implement incremental statistical model building
Added:
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/UTCTimeStamper.java
Modified:
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/SolrTrainingSet.java
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TrainingSet.java
incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/TopicEngineTest.java
incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/TrainingSetTest.java
Modified:
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java?rev=1229628&r1=1229627&r2=1229628&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java Tue Jan 10 16:33:20 2012
@@ -66,6 +66,7 @@ import org.apache.stanbol.enhancer.topic
import org.apache.stanbol.enhancer.topic.TopicSuggestion;
import org.apache.stanbol.enhancer.topic.TrainingSet;
import org.apache.stanbol.enhancer.topic.TrainingSetException;
+import org.apache.stanbol.enhancer.topic.UTCTimeStamper;
import org.osgi.framework.InvalidSyntaxException;
import org.osgi.service.cm.ConfigurationException;
import org.osgi.service.component.ComponentContext;
@@ -429,9 +430,9 @@ public class TopicClassificationEngine e
SolrQuery query = new SolrQuery();
String q = "*:*";
if (modelUpdateDateField != null) {
- query.setFields(topicUriField, broaderField);
- } else {
query.setFields(topicUriField, broaderField, modelUpdateDateField);
+ } else {
+ query.setFields(topicUriField, broaderField);
}
String offset = null;
boolean done = false;
@@ -492,7 +493,6 @@ public class TopicClassificationEngine e
public void updateTopic(String topicId, List<String> impactedTopics,
Collection<Object> broaderTopics) throws TrainingSetException,
ClassifierException {
long start = System.currentTimeMillis();
-
Batch<String> examples = Batch.emtpyBatch(String.class);
StringBuffer sb = new StringBuffer();
do {
@@ -513,8 +513,7 @@ public class TopicClassificationEngine e
doc.addField(similarityField, sb);
}
if (modelUpdateDateField != null) {
- // TODO: force UTC timezone here
- doc.addField(modelUpdateDateField, new Date());
+ doc.addField(modelUpdateDateField, UTCTimeStamper.nowUtcDate());
}
SolrServer solrServer = getActiveSolrServer();
try {
Modified:
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/SolrTrainingSet.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/SolrTrainingSet.java?rev=1229628&r1=1229627&r2=1229628&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/SolrTrainingSet.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/SolrTrainingSet.java Tue Jan 10 16:33:20 2012
@@ -16,18 +16,11 @@
*/
package org.apache.stanbol.enhancer.topic;
-import java.text.DateFormat;
-import java.text.SimpleDateFormat;
import java.util.ArrayList;
-import java.util.Calendar;
import java.util.Collection;
import java.util.Date;
import java.util.Dictionary;
-import java.util.GregorianCalendar;
import java.util.List;
-import java.util.Set;
-import java.util.TimeZone;
-import java.util.TreeSet;
import java.util.UUID;
import org.apache.commons.lang.StringUtils;
@@ -64,8 +57,6 @@ import org.slf4j.LoggerFactory;
@Property(name =
SolrTrainingSet.MODIFICATION_DATE_FIELD)})
public class SolrTrainingSet extends ConfiguredSolrCoreTracker implements
TrainingSet {
- protected static final TimeZone UTC = TimeZone.getTimeZone("UTC");
-
public static final String TRAINING_SET_ID =
"org.apache.stanbol.enhancer.topic.trainingset.id";
public static final String SOLR_CORE =
"org.apache.stanbol.enhancer.engine.topic.solrCore";
@@ -130,18 +121,6 @@ public class SolrTrainingSet extends Con
return true;
}
- protected String utcIsoString(Date date) {
- DateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss");
- return df.format(date);
- }
-
- protected String utcIsoString(Calendar calendar) {
- if (!calendar.getTimeZone().equals(UTC)) {
- calendar.setTimeZone(UTC);
- }
- return utcIsoString(calendar.getTime());
- }
-
@Override
public String registerExample(String exampleId, String text, List<String>
topics) throws TrainingSetException {
if (text == null) {
@@ -170,8 +149,7 @@ public class SolrTrainingSet extends Con
if (topics != null) {
doc.addField(topicUrisField, topics);
}
- String utcIsoDate = utcIsoString(new GregorianCalendar(UTC));
- doc.addField(modificationDateField, utcIsoDate + "Z");
+ doc.addField(modificationDateField, UTCTimeStamper.nowUtcDate());
SolrServer server = getActiveSolrServer();
try {
server.add(doc);
@@ -184,57 +162,35 @@ public class SolrTrainingSet extends Con
return exampleId;
}
-
@Override
- public boolean hasChangedSince(List<String> topics, Date referenceDate) {
- // TODO
- return true;
- }
-
- @Deprecated
- public Set<String> getUpdatedTopics(Calendar lastModificationDate) throws
TrainingSetException {
- TreeSet<String> collectedTopics = new TreeSet<String>();
- SolrQuery query = new SolrQuery();
- String utcIsoDate = utcIsoString(lastModificationDate);
- String q = modificationDateField + ":[" + utcIsoDate + "Z TO *]";
- String offset = null;
- boolean done = false;
- query.addSortField(exampleIdField, SolrQuery.ORDER.asc);
- query.setRows(batchSize + 1);
- query.setFields(exampleIdField, topicUrisField);
- while (!done) {
- try {
- if (offset != null) {
- q += " AND " + exampleIdField + ":[" + offset.toString() +
" TO *]";
- }
- query.setQuery(q);
- QueryResponse response = solrServer.query(query);
- int count = 0;
- for (SolrDocument result : response.getResults()) {
- if (count == batchSize) {
- offset =
result.getFirstValue(exampleIdField).toString();
- } else {
- count++;
- Collection<Object> values =
result.getFieldValues(topicUrisField);
- if (values == null) {
- continue;
- }
- for (Object value : values) {
- collectedTopics.add(value.toString());
- }
- }
- }
- if (count < batchSize) {
- done = true;
- }
- } catch (SolrServerException e) {
- String msg = String.format(
- "Error while fetching topics for examples modified after
'%s' on Solr Core '%s'.",
- utcIsoDate, solrCoreId);
- throw new TrainingSetException(msg, e);
+ public boolean hasChangedSince(List<String> topics, Date referenceDate)
throws TrainingSetException {
+ String utcIsoDate = UTCTimeStamper.utcIsoString(referenceDate);
+ StringBuffer sb = new StringBuffer();
+ sb.append(modificationDateField);
+ sb.append(":[");
+ sb.append(utcIsoDate);
+ sb.append(" TO *]");
+ if (topics != null && topics.size() > 0) {
+ sb.append(" AND (");
+ List<String> parts = new ArrayList<String>();
+ for (String topic : topics) {
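+ // use a nested {!field} query so that special Solr query characters in the topic need no escaping (NOTE(review): a double quote in the topic would presumably still end the quoted nested query - confirm)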
+ parts.add("_query_:\"{!field f=" + topicUrisField + "}" +
topic + "\"");
}
+ sb.append(StringUtils.join(parts, " OR "));
+ sb.append(")");
+ }
+ SolrQuery query = new SolrQuery(sb.toString());
+ query.setRows(1);
+ query.setFields(exampleIdField);
+ try {
+ return solrServer.query(query).getResults().size() > 0;
+ } catch (SolrServerException e) {
+ String msg = String.format(
+ "Error while fetching topics for examples modified after '%s'
on Solr Core '%s'.",
+ utcIsoDate, solrCoreId);
+ throw new TrainingSetException(msg, e);
}
- return collectedTopics;
}
@Override
Modified:
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TrainingSet.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TrainingSet.java?rev=1229628&r1=1229627&r2=1229628&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TrainingSet.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TrainingSet.java Tue Jan 10 16:33:20 2012
@@ -93,6 +93,6 @@ public interface TrainingSet {
* look for changes after that date
* @return true if one of the passed topics has changed since the last date
*/
- boolean hasChangedSince(List<String> topics, Date referenceDate);
+ boolean hasChangedSince(List<String> topics, Date referenceDate) throws
TrainingSetException;
}
Added:
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/UTCTimeStamper.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/UTCTimeStamper.java?rev=1229628&view=auto
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/UTCTimeStamper.java (added)
+++ incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/UTCTimeStamper.java Tue Jan 10 16:33:20 2012
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */package org.apache.stanbol.enhancer.topic;
+
+import java.text.DateFormat;
+import java.text.SimpleDateFormat;
+import java.util.Date;
+import java.util.GregorianCalendar;
+import java.util.TimeZone;
+
+/**
+ * Utilities to compute UTC timestamps so that Solr date queries do not depend on the time zone or locale of the server.
+ */
+public class UTCTimeStamper {
+
+    protected static final TimeZone UTC = TimeZone.getTimeZone("UTC");
+
+    /**
+     * Formats with {@code Locale.ROOT} so that a default locale using another calendar (e.g. th_TH) cannot alter the output.
+     * @return the given date as an ISO 8601 string in the UTC time zone, with millisecond precision and a trailing "Z".
+     */
+    public static String utcIsoString(Date date) {
+        DateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS", java.util.Locale.ROOT);
+        df.setTimeZone(UTC);
+        return df.format(date) + "Z";
+    }
+
+    /**
+     * @return ISO 8601 serialization of the current instant in the UTC time zone, suitable for range queries on Solr.
+     */
+    public static String nowUtcIsoString() {
+        return utcIsoString(nowUtcDate());
+    }
+
+    /**
+     * @return the current instant (a {@code Date} carries no time zone), suitable for storage in a Solr date field.
+     */
+    public static Date nowUtcDate() {
+        return (new GregorianCalendar(UTC)).getTime();
+    }
+}
Modified:
incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/TopicEngineTest.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/TopicEngineTest.java?rev=1229628&r1=1229627&r2=1229628&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/TopicEngineTest.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/TopicEngineTest.java Tue Jan 10 16:33:20 2012
@@ -204,7 +204,7 @@ public class TopicEngineTest extends Bas
}
@Test
- public void testBatchTrainClassifierFromExamples() throws Exception {
+ public void testTrainClassifierFromExamples() throws Exception {
// mini taxonomy for news articles
String business = "urn:topics/business";
@@ -214,6 +214,7 @@ public class TopicEngineTest extends Bas
String football = "urn:topics/football";
String worldcup = "urn:topics/worldcup";
String music = "urn:topics/music";
+ String law = "urn:topics/law";
classifier.addTopic(business, null);
classifier.addTopic(technology, null);
@@ -225,13 +226,16 @@ public class TopicEngineTest extends Bas
// train the classifier on an empty dataset
classifier.setTrainingSet(trainingSet);
- assertEquals(7, classifier.updateModel(false));
+ assertEquals(7, classifier.updateModel(true));
// the model is updated but does not predict anything
List<TopicSuggestion> suggestions = classifier
.suggestTopics("I like the sound of vuvuzula in the morning!");
assertEquals(0, suggestions.size());
+ // check that updating the model incrementally without changing the
dataset won't change anything.
+ assertEquals(0, classifier.updateModel(true));
+
// lets register some examples
trainingSet.registerExample(null, "Money, money, money is the root of
all evil.",
Arrays.asList(business));
@@ -254,7 +258,7 @@ public class TopicEngineTest extends Bas
trainingSet.registerExample(null, "Amon Tobin will be live in Paris
soon.", Arrays.asList(music));
// retrain the model: all topics are recomputed
- assertEquals(7, classifier.updateModel(false));
+ assertEquals(7, classifier.updateModel(true));
// test the trained classifier
suggestions = classifier.suggestTopics("I like the sound of vuvuzula
in the morning!");
@@ -276,6 +280,39 @@ public class TopicEngineTest extends Bas
assertEquals(worldcup, suggestions.get(1).uri);
assertEquals(technology, suggestions.get(2).uri);
assertEquals(football, suggestions.get(3).uri);
+
+ // test incremental update of a single root node
+ Thread.sleep(10);
+ trainingSet.registerExample(null, "Dubstep is broken beat as are
Hip-Hop, Dancehall"
+ + " or Drum & Bass",
Arrays.asList(music));
+ assertEquals(1, classifier.updateModel(true));
+ suggestions = classifier.suggestTopics("Glory box is best mixed as
dubstep.");
+ assertTrue(suggestions.size() >= 1);
+ assertEquals(music, suggestions.get(0).uri);
+ assertEquals(0, classifier.updateModel(true));
+
+ // test incremental update of a leaf node (the parent topic needs
re-indexing too)
+ Thread.sleep(10);
+ trainingSet.registerExample(null, "The Brazil team has won the cup so
many times.",
+ Arrays.asList(worldcup));
+ assertEquals(2, classifier.updateModel(true));
+ assertEquals(0, classifier.updateModel(true));
+
+ // it's always possible to rebuild all models from scratch
+ assertEquals(7, classifier.updateModel(false));
+
+ // it's also possible to define new topics on an existing model and leverage incremental indexing for
+ // them as long as they are effectively registered on the classifier
+ trainingSet.registerExample(null,
+ "Under Belgian law, judges and prosecutors are judicial officers
with equal rank and pay.",
+ Arrays.asList(law));
+ trainingSet.registerExample(null, "Prosecutors are typically lawyers
who possess a law degree,"
+ + " and are recognized as legal
professionals by the court"
+ + " in which they intend to
represent the state.",
+ Arrays.asList(law));
+ assertEquals(0, classifier.updateModel(true));
+ classifier.addTopic(law, null);
+ assertEquals(1, classifier.updateModel(true));
}
protected Hashtable<String,Object> getDefaultClassifierConfigParams() {
Modified:
incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/TrainingSetTest.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/TrainingSetTest.java?rev=1229628&r1=1229627&r2=1229628&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/TrainingSetTest.java
(original)
+++
incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/TrainingSetTest.java
Tue Jan 10 16:33:20 2012
@@ -16,17 +16,21 @@
*/
package org.apache.stanbol.enhancer.engine.topic;
-import static org.junit.Assert.*;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Calendar;
+import java.util.Date;
import java.util.GregorianCalendar;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Set;
+import java.util.TimeZone;
import javax.xml.parsers.ParserConfigurationException;
@@ -35,6 +39,7 @@ import org.apache.solr.client.solrj.embe
import org.apache.stanbol.enhancer.topic.Batch;
import org.apache.stanbol.enhancer.topic.SolrTrainingSet;
import org.apache.stanbol.enhancer.topic.TrainingSetException;
+import org.apache.stanbol.enhancer.topic.UTCTimeStamper;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
@@ -76,6 +81,19 @@ public class TrainingSetTest extends Bas
}
@Test
+    public void testDateSerialization() throws Exception { // the same wall-clock time in two zones must serialize to UTC
+        GregorianCalendar timeUtc = new GregorianCalendar(TimeZone.getTimeZone("UTC"));
+        timeUtc.set(2013, Calendar.DECEMBER, 12, 6, 43, 0); // Calendar months are 0-based: use the named constant
+        timeUtc.set(Calendar.MILLISECOND, 0);
+        assertEquals("2013-12-12T06:43:00.000Z", UTCTimeStamper.utcIsoString(timeUtc.getTime()));
+
+        GregorianCalendar timeCet = new GregorianCalendar(TimeZone.getTimeZone("CET"));
+        timeCet.set(2013, Calendar.DECEMBER, 12, 6, 43, 0); // CET is one hour ahead of UTC in December
+        timeCet.set(Calendar.MILLISECOND, 0);
+        assertEquals("2013-12-12T05:43:00.000Z", UTCTimeStamper.utcIsoString(timeCet.getTime()));
+    }
+
+ @Test
public void testEmptyTrainingSet() throws TrainingSetException {
Batch<String> examples = trainingSet.getPositiveExamples(new
ArrayList<String>(), null);
assertEquals(examples.items.size(), 0);
@@ -122,7 +140,7 @@ public class TrainingSetTest extends Bas
assertEquals(2, examples.items.size());
assertEquals(examples.items, Arrays.asList("Text of example1.", "Text
of example2."));
assertFalse(examples.hasMore);
-
+
// Test example removal
trainingSet.registerExample("example1", null, Arrays.asList(TOPIC_1,
TOPIC_3));
examples = trainingSet.getPositiveExamples(Arrays.asList(TOPIC_1,
TOPIC_3), null);
@@ -188,38 +206,33 @@ public class TrainingSetTest extends Bas
}
@Test
- public void testIncrementalQueries() throws Exception {
- Calendar date0 = new GregorianCalendar();
- Set<String> updatedTopics = trainingSet.getUpdatedTopics(date0);
- assertEquals(0, updatedTopics.size());
+ public void testHasChangedSince() throws Exception {
+ Date date0 = new Date();
+ assertFalse(trainingSet.hasChangedSince(Arrays.asList(TOPIC_1),
date0));
+ assertFalse(trainingSet.hasChangedSince(Arrays.asList(TOPIC_2),
date0));
+ assertFalse(trainingSet.hasChangedSince(Arrays.asList(TOPIC_3),
date0));
+ assertFalse(trainingSet.hasChangedSince(Arrays.asList(TOPIC_1,
TOPIC_2), date0));
+ assertFalse(trainingSet.hasChangedSince(Arrays.asList(TOPIC_1,
TOPIC_3), date0));
trainingSet.registerExample("example1", "Text of example1.",
Arrays.asList(TOPIC_1));
trainingSet.registerExample("example2", "Text of example2.",
Arrays.asList(TOPIC_1, TOPIC_2));
- updatedTopics = trainingSet.getUpdatedTopics(date0);
- assertEquals(2, updatedTopics.size());
- assertTrue(updatedTopics.contains(TOPIC_1));
- assertTrue(updatedTopics.contains(TOPIC_2));
-
- // check that the new registration look as compared to a new date:
- Thread.sleep(1000);
-
- Calendar date1 = new GregorianCalendar();
- updatedTopics = trainingSet.getUpdatedTopics(date1);
- assertEquals(0, updatedTopics.size());
-
- // check that incremental query works with batching
- trainingSet.setBatchSize(3);
-
- Set<String> expectedTopics = new HashSet<String>();
- for (int i = 0; i < 11; i++) {
- String topic = "http://example.org/new-topics/" + i;
- String text = "Text of example" + i + ".";
- trainingSet.registerExample(null, text, Arrays.asList(topic));
- expectedTopics.add(topic);
- }
- Set<String> newlyUpdatedTopics = trainingSet.getUpdatedTopics(date1);
- assertEquals(expectedTopics, newlyUpdatedTopics);
+ assertTrue(trainingSet.hasChangedSince(Arrays.asList(TOPIC_1), date0));
+ assertTrue(trainingSet.hasChangedSince(Arrays.asList(TOPIC_2), date0));
+ assertFalse(trainingSet.hasChangedSince(Arrays.asList(TOPIC_3),
date0));
+ assertTrue(trainingSet.hasChangedSince(Arrays.asList(TOPIC_1,
TOPIC_2), date0));
+ assertTrue(trainingSet.hasChangedSince(Arrays.asList(TOPIC_1,
TOPIC_3), date0));
+
+ // check that the registrations above are not reported as changed when compared to a later reference
+ // date (modification dates are stored up to the millisecond precision):
+ Thread.sleep(10);
+
+ Date date1 = new Date();
+ assertFalse(trainingSet.hasChangedSince(Arrays.asList(TOPIC_1),
date1));
+ assertFalse(trainingSet.hasChangedSince(Arrays.asList(TOPIC_2),
date1));
+ assertFalse(trainingSet.hasChangedSince(Arrays.asList(TOPIC_3),
date1));
+ assertFalse(trainingSet.hasChangedSince(Arrays.asList(TOPIC_1,
TOPIC_2), date1));
+ assertFalse(trainingSet.hasChangedSince(Arrays.asList(TOPIC_1,
TOPIC_3), date1));
}
protected Hashtable<String,Object> getDefaultConfigParams() {