Author: ogrisel
Date: Thu Jan 5 18:13:12 2012
New Revision: 1227729
URL: http://svn.apache.org/viewvc?rev=1227729&view=rev
Log:
STANBOL-197: implement incremental queries on SolrTrainingSet
Modified:
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/SolrTrainingSet.java
incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/TrainingSetTest.java
Modified:
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/SolrTrainingSet.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/SolrTrainingSet.java?rev=1227729&r1=1227728&r2=1227729&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/SolrTrainingSet.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/SolrTrainingSet.java Thu Jan 5 18:13:12 2012
@@ -16,14 +16,18 @@
*/
package org.apache.stanbol.enhancer.topic;
+import java.text.DateFormat;
+import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collection;
-import java.util.Collections;
import java.util.Date;
import java.util.Dictionary;
+import java.util.GregorianCalendar;
import java.util.List;
import java.util.Set;
+import java.util.TimeZone;
+import java.util.TreeSet;
import java.util.UUID;
import org.apache.commons.lang.StringUtils;
@@ -60,6 +64,8 @@ import org.slf4j.LoggerFactory;
@Property(name =
SolrTrainingSet.MODIFICATION_DATE_FIELD)})
public class SolrTrainingSet extends ConfiguredSolrCoreTracker implements
TrainingSet {
+ protected static final TimeZone UTC = TimeZone.getTimeZone("UTC");
+
public static final String TRAINING_SET_ID =
"org.apache.stanbol.enhancer.topic.trainingset.id";
public static final String SOLR_CORE =
"org.apache.stanbol.enhancer.engine.topic.solrCore";
@@ -124,6 +130,18 @@ public class SolrTrainingSet extends Con
return true;
}
+ protected String utcIsoString(Date date) {
+ DateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss");
+ return df.format(date);
+ }
+
+ protected String utcIsoString(Calendar calendar) {
+ if (!calendar.getTimeZone().equals(UTC)) {
+ calendar.setTimeZone(UTC);
+ }
+ return utcIsoString(calendar.getTime());
+ }
+
@Override
public String registerExample(String exampleId, String text, List<String>
topics) throws TrainingSetException {
if (exampleId == null || exampleId.isEmpty()) {
@@ -135,7 +153,8 @@ public class SolrTrainingSet extends Con
if (topics != null) {
doc.addField(topicUrisField, topics);
}
- doc.addField(modificationDateField, new Date());
+ String utcIsoDate = utcIsoString(new GregorianCalendar(UTC));
+ doc.addField(modificationDateField, utcIsoDate + "Z");
SolrServer server = getActiveSolrServer();
try {
server.add(doc);
@@ -150,8 +169,48 @@ public class SolrTrainingSet extends Con
@Override
public Set<String> getUpdatedTopics(Calendar lastModificationDate) throws
TrainingSetException {
- // TODO
- return Collections.emptySet();
+ TreeSet<String> collectedTopics = new TreeSet<String>();
+ SolrQuery query = new SolrQuery();
+ String utcIsoDate = utcIsoString(lastModificationDate);
+ String q = modificationDateField + ":[" + utcIsoDate + "Z TO *]";
+ String offset = null;
+ boolean done = false;
+ query.addSortField(exampleIdField, SolrQuery.ORDER.asc);
+ query.set("rows", batchSize + 1);
+ query.set("fl", exampleIdField + "," + topicUrisField);
+ while (!done) {
+ try {
+ if (offset != null) {
+ q += " AND " + exampleIdField + ":[" + offset.toString() + " TO *]";
+ }
+ query.setQuery(q);
+ QueryResponse response = solrServer.query(query);
+ int count = 0;
+ for (SolrDocument result : response.getResults()) {
+ if (count == batchSize) {
+ offset = result.getFirstValue(exampleIdField).toString();
+ } else {
+ count++;
+ Collection<Object> values = result.getFieldValues(topicUrisField);
+ if (values == null) {
+ continue;
+ }
+ for (Object value : values) {
+ collectedTopics.add(value.toString());
+ }
+ }
+ }
+ if (count < batchSize) {
+ done = true;
+ }
+ } catch (SolrServerException e) {
+ String msg = String.format(
+ "Error while fetching topics for examples modified after '%s' on Solr Core '%s'.",
+ utcIsoDate, solrCoreId);
+ throw new TrainingSetException(msg, e);
+ }
+ }
+ return collectedTopics;
}
@Override
Modified:
incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/TrainingSetTest.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/TrainingSetTest.java?rev=1227729&r1=1227728&r2=1227729&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/TrainingSetTest.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/TrainingSetTest.java Thu Jan 5 18:13:12 2012
@@ -22,6 +22,8 @@ import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
+import java.util.Calendar;
+import java.util.GregorianCalendar;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Set;
@@ -173,6 +175,28 @@ public class TrainingSetTest extends Bas
assertEquals(expectedCollectedText, collectedText);
}
+ @Test
+ public void testIncrementalQueries() throws Exception {
+ Calendar date0 = new GregorianCalendar();
+ Set<String> updatedTopics = trainingSet.getUpdatedTopics(date0);
+ assertEquals(0, updatedTopics.size());
+
+ trainingSet.registerExample("example1", "Text of example1.", Arrays.asList(TOPIC_1));
+ trainingSet.registerExample("example2", "Text of example2.", Arrays.asList(TOPIC_1, TOPIC_2));
+
+ updatedTopics = trainingSet.getUpdatedTopics(date0);
+ assertEquals(2, updatedTopics.size());
+ assertTrue(updatedTopics.contains(TOPIC_1));
+ assertTrue(updatedTopics.contains(TOPIC_2));
+
+ // check that the new registration look as compared to a new date:
+ Thread.sleep(1000);
+
+ Calendar date1 = new GregorianCalendar();
+ updatedTopics = trainingSet.getUpdatedTopics(date1);
+ assertEquals(0, updatedTopics.size());
+ }
+
protected Hashtable<String,Object> getDefaultConfigParams() {
Hashtable<String,Object> config = new Hashtable<String,Object>();
config.put(SolrTrainingSet.SOLR_CORE, trainingsetSolrServer);