Author: ogrisel
Date: Mon Jan 16 17:03:37 2012
New Revision: 1232065

URL: http://svn.apache.org/viewvc?rev=1232065&view=rev
Log:
STANBOL-197: refactored scan over the topics to make it reusable for the 
evaluation part

Added:
    
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/BatchProcessor.java
Modified:
    
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
    
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TopicClassifier.java
    
incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/classifier/schema.xml

Added: 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/BatchProcessor.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/BatchProcessor.java?rev=1232065&view=auto
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/BatchProcessor.java
 (added)
+++ 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/BatchProcessor.java
 Mon Jan 16 17:03:37 2012
@@ -0,0 +1,12 @@
+package org.apache.stanbol.enhancer.engine.topic;
+
+import java.util.List;
+
+import org.apache.stanbol.enhancer.topic.ClassifierException;
+import org.apache.stanbol.enhancer.topic.TrainingSetException;
+
+public interface BatchProcessor<T> {
+
+    int process(List<T> batch) throws ClassifierException, 
TrainingSetException;
+
+}

Modified: 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java?rev=1232065&r1=1232064&r2=1232065&view=diff
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
 (original)
+++ 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
 Mon Jan 16 17:03:37 2012
@@ -18,6 +18,7 @@ package org.apache.stanbol.enhancer.engi
 
 import static 
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.NIE_PLAINTEXTCONTENT;
 
+import java.io.File;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
@@ -36,6 +37,7 @@ import org.apache.clerezza.rdf.core.MGra
 import org.apache.clerezza.rdf.core.Triple;
 import org.apache.clerezza.rdf.core.UriRef;
 import org.apache.clerezza.rdf.core.impl.TripleImpl;
+import org.apache.commons.io.FileUtils;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang.StringUtils;
 import org.apache.felix.scr.annotations.Activate;
@@ -48,6 +50,7 @@ import org.apache.felix.scr.annotations.
 import org.apache.solr.client.solrj.SolrQuery;
 import org.apache.solr.client.solrj.SolrServer;
 import org.apache.solr.client.solrj.SolrServerException;
+import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
 import org.apache.solr.client.solrj.request.UpdateRequest;
 import org.apache.solr.client.solrj.response.QueryResponse;
 import org.apache.solr.client.solrj.util.ClientUtils;
@@ -210,6 +213,8 @@ public class TopicClassificationEngine e
 
     protected int cvFoldCount = 0;
 
+    protected File evaluationFolder;
+
     @Activate
     protected void activate(ComponentContext context) throws 
ConfigurationException, InvalidSyntaxException {
         @SuppressWarnings("unchecked")
@@ -553,25 +558,12 @@ public class TopicClassificationEngine e
         this.trainingSet = trainingSet;
     }
 
-    @Override
-    public int updateModel(boolean incremental) throws TrainingSetException, 
ClassifierException {
-        checkTrainingSet();
-        long start = System.currentTimeMillis();
-        if (incremental && modelUpdateDateField == null) {
-            log.warn(MODEL_UPDATE_DATE_FIELD + " field is not configured: 
switching to batch update mode.");
-            incremental = false;
-        }
+    protected int batchOverTopics(BatchProcessor<SolrDocument> processor) 
throws TrainingSetException {
         // TODO: implement incremental update by using the date informations
-        int updatedTopics = 0;
+        int processedCount = 0;
         SolrServer solrServer = getActiveSolrServer();
         SolrQuery query = new SolrQuery();
         String q = entryTypeField + ":" + METADATA_ENTRY;
-        if (modelUpdateDateField != null) {
-            query.setFields(topicUriField, entryIdField, modelEntryIdField, 
broaderField,
-                modelUpdateDateField);
-        } else {
-            query.setFields(topicUriField, entryIdField, modelEntryIdField, 
broaderField);
-        }
         String offset = null;
         boolean done = false;
         int batchSize = 1000;
@@ -587,29 +579,17 @@ public class TopicClassificationEngine e
                 query.setQuery(q);
                 QueryResponse response = solrServer.query(query);
                 int count = 0;
+                List<SolrDocument> batchDocuments = new 
ArrayList<SolrDocument>();
                 for (SolrDocument result : response.getResults()) {
                     String topicId = 
result.getFirstValue(topicUriField).toString();
                     if (count == batchSize) {
                         offset = topicId;
                     } else {
                         count++;
-                        List<String> impactedTopics = new ArrayList<String>();
-                        impactedTopics.add(topicId);
-                        impactedTopics.addAll(getNarrowerTopics(topicId));
-                        if (incremental) {
-                            Date lastModelUpdate = (Date) 
result.getFirstValue(modelUpdateDateField);
-                            if (lastModelUpdate != null
-                                && 
!trainingSet.hasChangedSince(impactedTopics, lastModelUpdate)) {
-                                continue;
-                            }
-                        }
-                        String metadataEntryId = 
result.getFirstValue(entryIdField).toString();
-                        String modelEntryId = 
result.getFirstValue(modelEntryIdField).toString();
-                        updateTopic(topicId, metadataEntryId, modelEntryId, 
impactedTopics,
-                            result.getFieldValues(broaderField));
-                        updatedTopics++;
+                        batchDocuments.add(result);
                     }
                 }
+                processedCount += processor.process(batchDocuments);
                 solrServer.commit();
                 if (count < batchSize) {
                     done = true;
@@ -620,6 +600,43 @@ public class TopicClassificationEngine e
             String msg = String.format("Error while updating topics on Solr 
Core '%s'.", solrCoreId);
             throw new TrainingSetException(msg, e);
         }
+        return processedCount;
+    }
+
+    @Override
+    public int updateModel(boolean incremental) throws TrainingSetException, 
ClassifierException {
+        checkTrainingSet();
+        long start = System.currentTimeMillis();
+        if (incremental && modelUpdateDateField == null) {
+            log.warn(MODEL_UPDATE_DATE_FIELD + " field is not configured: 
switching to batch update mode.");
+            incremental = false;
+        }
+        final boolean incr = incremental;
+        int updatedTopics = batchOverTopics(new BatchProcessor<SolrDocument>() 
{
+            @Override
+            public int process(List<SolrDocument> batch) throws 
ClassifierException, TrainingSetException {
+                int processed = 0;
+                for (SolrDocument result : batch) {
+                    String topicId = 
result.getFirstValue(topicUriField).toString();
+                    List<String> impactedTopics = new ArrayList<String>();
+                    impactedTopics.add(topicId);
+                    impactedTopics.addAll(getNarrowerTopics(topicId));
+                    if (incr) {
+                        Date lastModelUpdate = (Date) 
result.getFirstValue(modelUpdateDateField);
+                        if (lastModelUpdate != null
+                            && !trainingSet.hasChangedSince(impactedTopics, 
lastModelUpdate)) {
+                            continue;
+                        }
+                    }
+                    String metadataEntryId = 
result.getFirstValue(entryIdField).toString();
+                    String modelEntryId = 
result.getFirstValue(modelEntryIdField).toString();
+                    updateTopic(topicId, metadataEntryId, modelEntryId, 
impactedTopics,
+                        result.getFieldValues(broaderField));
+                    processed++;
+                }
+                return processed;
+            }
+        });
         long stop = System.currentTimeMillis();
         log.info("Sucessfully updated {} topics in {}s", updatedTopics, 
(double) (stop - start) / 1000.);
         return updatedTopics;
@@ -709,25 +726,86 @@ public class TopicClassificationEngine e
         cvFoldCount = foldCount;
     }
 
-    @Override
-    public TopicClassifier cloneWithEmdeddedModel() throws ClassifierException 
{
-        // TODO Auto-generated method stub
+    protected Dictionary<String,Object> 
getCanonicalConfiguration(EmbeddedSolrServer server) {
+        // TODO
         return null;
     }
 
-    @Override
-    public void destroyModel() throws ClassifierException {
-        // TODO Auto-generated method stub
+    protected EmbeddedSolrServer makeTopicClassifierSolrServer(File folder) {
+
+        // TODO
+        return null;
+    }
 
+    public boolean isEvaluationRunning() {
+        return evaluationFolder != null;
     }
 
     public int updatePerformanceEstimates(boolean incremental) throws 
ClassifierException,
                                                               
TrainingSetException {
+        if (evaluationFolder != null) {
+            throw new ClassifierException("Another evaluation is already 
running");
+        }
         int updatedTopics = 0;
-        // TODO
+        int cvFoldCount = 3; // 3-folds CV is hardcoded for now
+
+        TopicClassificationEngine classifier = new TopicClassificationEngine();
+        classifier.setTrainingSet(trainingSet);
+        try {
+            // TODO: make the temporary folder path configurable with a 
property
+            evaluationFolder = 
File.createTempFile("stanbol-classifier-evaluation-", "-solr");
+            for (int cvFoldIndex = 0; cvFoldIndex < cvFoldCount; 
cvFoldIndex++) {
+                performCVFold(classifier, cvFoldIndex, cvFoldCount);
+            }
+        } catch (ConfigurationException e) {
+            throw new ClassifierException(e);
+        } catch (IOException e) {
+            throw new ClassifierException(e);
+        } finally {
+            FileUtils.deleteQuietly(evaluationFolder);
+            evaluationFolder = null;
+        }
         return updatedTopics;
     }
 
+    protected void performCVFold(TopicClassificationEngine classifier, int 
cvFoldIndex, int cvFoldCount) throws ConfigurationException,
+                                                                               
                         TrainingSetException,
+                                                                               
                         ClassifierException {
+
+        log.info(String.format("Performing evaluation CV iteration %d/%d on 
classifier %s", cvFoldIndex + 1,
+            cvFoldCount, engineId));
+        long start = System.currentTimeMillis();
+        FileUtils.deleteQuietly(evaluationFolder);
+        evaluationFolder.mkdir();
+        EmbeddedSolrServer evaluationServer = 
makeTopicClassifierSolrServer(evaluationFolder);
+        classifier.configure(getCanonicalConfiguration(evaluationServer));
+
+        // iterate over all the topics to register them in the evaluation 
classifier
+        batchOverTopics(new BatchProcessor<SolrDocument>() {
+            @Override
+            public int process(List<SolrDocument> batch) {
+                return 0;
+            }
+        });
+
+        // build the model for the current train CV folds
+        classifier.setCrossValidationInfo(cvFoldIndex, cvFoldCount);
+        classifier.updateModel(false);
+
+        // iterate over the topics again to compute scores on the test fold
+        batchOverTopics(new BatchProcessor<SolrDocument>() {
+            @Override
+            public int process(List<SolrDocument> batch) {
+                return 0;
+            }
+        });
+
+        float averageF1 = 0.0f;
+        long stop = System.currentTimeMillis();
+        log.info(String.format("Finished CV iteration %d/%d on classifier %s 
in %fs. F1-score = %f",
+            cvFoldIndex + 1, cvFoldCount, engineId, (stop - start) / 1000.0, 
averageF1));
+    }
+
     @Override
     public ClassificationReport getPerformanceEstimates(String topic) throws 
ClassifierException {
 

Modified: 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TopicClassifier.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TopicClassifier.java?rev=1232065&r1=1232064&r2=1232065&view=diff
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TopicClassifier.java
 (original)
+++ 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TopicClassifier.java
 Mon Jan 16 17:03:37 2012
@@ -133,18 +133,6 @@ public interface TopicClassifier {
     void setCrossValidationInfo(int foldIndex, int foldCount);
 
     /**
-     * Clone the classifier to get a new independent instance with an empty 
embedded model to be trained on a
-     * subsample of the dataset in a cross validation setting for model 
evaluation.
-     */
-    TopicClassifier cloneWithEmdeddedModel() throws ClassifierException;
-
-    /**
-     * Free the backing resources of the model (e.g. indices persisted on the 
harddrive or a DB) once the
-     * cross validation process is completed.
-     */
-    void destroyModel() throws ClassifierException;
-
-    /**
      * Get a classification report with various accuracy metrics (precision, 
recall and f1-score) along with
      * the example ids of some mistakes (false positives or false negatives).
      */

Modified: 
incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/classifier/schema.xml
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/classifier/schema.xml?rev=1232065&r1=1232064&r2=1232065&view=diff
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/classifier/schema.xml
 (original)
+++ 
incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/classifier/schema.xml
 Mon Jan 16 17:03:37 2012
@@ -66,8 +66,8 @@
       required="true" />
 
     <!-- If entry_type can be model 'model' or 'metadata' -->
-  <field name="entry_type" type="string" indexed="true" stored="true"
-    required="true" />
+    <field name="entry_type" type="string" indexed="true" stored="true"
+      required="true" />
 
     <!-- Mandatory classifier model attribute when entry_type == 'model' -->
     <field name="classifier_features" type="text" indexed="true"
@@ -81,15 +81,21 @@
       multiValued="true" />
     <field name="last_update_dt" type="tdate" indexed="true"
       stored="true" />
-    <!-- Accuracy evaluation of the model -->
-    <field name="precision" type="tfloat" indexed="true" stored="true" />
-    <field name="recall" type="tfloat" indexed="true" stored="true" />
-    <field name="f1" type="tfloat" indexed="true" stored="true" />
+    <!-- Accuracy evaluation of the model (across CV folds) -->
+    <field name="precision" type="tfloat" indexed="true" stored="true"
+      multiValued="true" />
+    <field name="recall" type="tfloat" indexed="true" stored="true"
+      multiValued="true" />
+    <field name="f1" type="tfloat" indexed="true" stored="true"
+      multiValued="true" />
     <field name="last_evaluation_dt" type="tdate" indexed="true"
       stored="true" />
-    <field name="positive_support" type="tint" indexed="false" stored="true" />
-    <field name="negative_support" type="tint" indexed="false" stored="true" />
-    <!-- Store ids of some false positive and negative examples -->
+    <field name="positive_support" type="tint" indexed="false"
+      stored="true" multiValued="true" />
+    <field name="negative_support" type="tint" indexed="false"
+      stored="true" multiValued="true" />
+    <!-- Store ids of some false positive and negative examples (accumulated 
+      over several CV folds) -->
     <field name="false_positives" type="string" indexed="false"
       multiValued="true" stored="true" />
     <field name="negative_positives" type="string" indexed="false"


Reply via email to