Author: ogrisel
Date: Wed Mar 28 07:52:46 2012
New Revision: 1306206

URL: http://svn.apache.org/viewvc?rev=1306206&view=rev
Log:
STANBOL-197: basic REST API to train a classifier model + various OSGi fixes

Added:
    incubator/stanbol/trunk/enhancer/topic-web/tools/
    incubator/stanbol/trunk/enhancer/topic-web/tools/newsmlimporter.py
Modified:
    
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
    
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TopicClassifier.java
    
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/training/SolrTrainingSet.java
    
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/training/TrainingSet.java
    
incubator/stanbol/trunk/enhancer/topic-web/src/main/java/org/apache/stanbol/enhancer/web/topic/fragment/TopicClassifierFragment.java
    
incubator/stanbol/trunk/enhancer/topic-web/src/main/java/org/apache/stanbol/enhancer/web/topic/resource/TopicModelResource.java

Modified: 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java?rev=1306206&r1=1306205&r2=1306206&view=diff
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
 (original)
+++ 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
 Wed Mar 28 07:52:46 2012
@@ -117,6 +117,17 @@ import org.slf4j.LoggerFactory;
  * 
  * The Solr server is expected to be configured with the MoreLikeThisHandler 
and the matching fields from the
  * engine configuration.
+ * 
+ * This text classifier method sometimes goes by the name of "Rocchio 
classification" or "Nearest Centroid
+ * classification" in the IR and machine learning literature. It is often 
slightly less accurate than fitting a
+ * penalized linear model such as linear kernel Support Vector Machines or 
penalized Logistic Regression but
+ * has the advantage of scaling to large numbers of categories (e.g. more than 
tens of thousands) without having
+ * to load the full statistical model in memory thanks to the use of the 
inverted index datastructure that
+ * also provides the feature extraction and TF-IDF weighting for free.
+ * 
+ * Furthermore it could be refined by using a "Learning to Rank" approach by 
training a RankSVM, Gradient
+ * Boosted Trees or Random Forests on the raw output of the 
Rocchio classifier so as to re-rank
+ * candidate classes more finely. The Learning to Rank refinement is not 
implemented yet.
  */
 @Component(metatype = true, immediate = true, configurationFactory = true, 
policy = ConfigurationPolicy.REQUIRE)
 @Service
@@ -202,16 +213,18 @@ public class TopicClassificationEngine e
 
     // TODO: make the following fields configurable
 
-    private int MAX_COLLECTED_EXAMPLES = 100;
+    private int MAX_COLLECTED_EXAMPLES = 1000;
+
+    public int MAX_EVALUATION_SAMPLES = 200;
 
-    public int MAX_EVALUATION_SAMPLES = 1000;
+    public int MIN_EVALUATION_SAMPLES = 10;
 
     public int MAX_CHARS_PER_TOPIC = 100000;
 
     public Integer MAX_ROOTS = 1000;
 
     public int MAX_SUGGESTIONS = 5; // never suggest more than this: this is 
expected to be a reasonable
-                                    // estimate of the number of topics 
occuring in each documents
+                                    // estimate of the number of topics 
occurring in each document
 
     protected String engineName;
 
@@ -267,15 +280,20 @@ public class TopicClassificationEngine e
 
     protected int cvFoldCount = 0;
 
-    protected File evaluationFolder;
+    protected boolean evaluationRunning = false;
 
     @Reference(cardinality = ReferenceCardinality.OPTIONAL_UNARY, bind = 
"bindManagedSolrServer", unbind = "unbindManagedSolrServer", strategy = 
ReferenceStrategy.EVENT, policy = ReferencePolicy.DYNAMIC)
-    protected ManagedSolrServer managedSolrServer;
+    protected ManagedSolrServer managedSolrServerDummy; // trick to call the 
super class binders
 
     @Activate
     protected void activate(ComponentContext context) throws 
ConfigurationException, InvalidSyntaxException {
         @SuppressWarnings("unchecked")
         Dictionary<String,Object> config = context.getProperties();
+        activate(context, config);
+    }
+
+    protected void activate(ComponentContext context, 
Dictionary<String,Object> config) throws ConfigurationException,
+                                                                               
        InvalidSyntaxException {
         this.context = context;
         indexArchiveName = "default-topic-model";
         configure(config);
@@ -288,6 +306,9 @@ public class TopicClassificationEngine e
                     .createFilter(filter), null);
             trainingSetTracker.open();
         }
+        // TODO if training set is null, make it possible to programmatically 
create a SolrTrainingSet
+        // instance using the same managed solr server and register it under 
the same name as the engine
+        // it-self.
     }
 
     @Deactivate
@@ -659,14 +680,29 @@ public class TopicClassificationEngine e
     }
 
     @Override
+    public void removeAllConcepts() throws ClassifierException {
+        SolrServer solrServer = getActiveSolrServer();
+        try {
+            solrServer.deleteByQuery("*:*");
+            solrServer.commit();
+        } catch (Exception e) {
+            String msg = String.format("Error deleting concepts from Solr Core 
'%s'", solrCoreId);
+            throw new ClassifierException(msg, e);
+        }
+    }
+
+    @Override
     public void removeConcept(String conceptId) throws ClassifierException {
+        if (conceptId == null || conceptId.isEmpty()) {
+            throw new ClassifierException("conceptId must not be null or 
empty");
+        }
         SolrServer solrServer = getActiveSolrServer();
         try {
             solrServer.deleteByQuery(conceptUriField + ":" + 
ClientUtils.escapeQueryChars(conceptId));
             solrServer.commit();
         } catch (Exception e) {
-            String msg = String.format("Error removing topic with id '%s' on 
Solr Core '%s'", conceptId,
-                solrCoreId);
+            String msg = String
+                    .format("Error removing concept '%s' on Solr Core '%s'", 
conceptId, solrCoreId);
             throw new ClassifierException(msg, e);
         }
     }
@@ -878,7 +914,7 @@ public class TopicClassificationEngine e
         cvFoldCount = foldCount;
     }
 
-    protected Dictionary<String,Object> 
getCanonicalConfiguration(EmbeddedSolrServer server) {
+    protected Dictionary<String,Object> getCanonicalConfiguration(Object 
server) {
         Hashtable<String,Object> config = new Hashtable<String,Object>();
         config.put(EnhancementEngine.PROPERTY_NAME, engineName + 
"-evaluation");
         config.put(TopicClassificationEngine.ENTRY_ID_FIELD, "entry_id");
@@ -901,39 +937,50 @@ public class TopicClassificationEngine e
     }
 
     public boolean isEvaluationRunning() {
-        return evaluationFolder != null;
+        return evaluationRunning;
     }
 
-    public int updatePerformanceEstimates(boolean incremental) throws 
ClassifierException,
-                                                              
TrainingSetException {
-        if (evaluationFolder != null) {
+    synchronized public int updatePerformanceEstimates(boolean incremental) 
throws ClassifierException,
+                                                                           
TrainingSetException {
+        checkTrainingSet();
+        if (evaluationRunning) {
             throw new ClassifierException("Another evaluation is already 
running");
         }
         int updatedTopics = 0;
-        int cvFoldCount = 3; // 3-folds CV is hardcoded for now
-        int cvIterationCount = 1; // only one 3-folds CV iteration
-
-        TopicClassificationEngine classifier = new TopicClassificationEngine();
-        classifier.setTrainingSet(trainingSet);
+        File tmpfolder = null;
         try {
+            tmpfolder = File.createTempFile("stanbol-evaluation-folder-", 
".tmp");
+            tmpfolder.delete();
+            tmpfolder.mkdir();
+            evaluationRunning = true;
+            int cvFoldCount = 3; // 3-folds CV is hardcoded for now
+            int cvIterationCount = 1; // only one 3-folds CV iteration
+
+            // We will use the training set quite intensively, ensure that the 
index is packed and its
+            // statistics are up to date
+            getTrainingSet().optimize();
+
             // TODO: make the temporary folder path configurable with a 
property
-            evaluationFolder = 
File.createTempFile("stanbol-classifier-evaluation-", "-solr");
             for (int cvFoldIndex = 0; cvFoldIndex < cvIterationCount; 
cvFoldIndex++) {
-                updatedTopics = performCVFold(classifier, cvFoldIndex, 
cvFoldCount, cvIterationCount,
+                updatedTopics = performCVFold(tmpfolder, cvFoldIndex, 
cvFoldCount, cvIterationCount,
                     incremental);
             }
+            SolrServer solrServer = getActiveSolrServer();
+            solrServer.optimize();
         } catch (ConfigurationException e) {
             throw new ClassifierException(e);
         } catch (IOException e) {
             throw new ClassifierException(e);
+        } catch (SolrServerException e) {
+            throw new ClassifierException(e);
         } finally {
-            FileUtils.deleteQuietly(evaluationFolder);
-            evaluationFolder = null;
+            FileUtils.deleteQuietly(tmpfolder);
+            evaluationRunning = false;
         }
         return updatedTopics;
     }
 
-    protected int performCVFold(final TopicClassificationEngine classifier,
+    protected int performCVFold(File tmpfolder,
                                 int cvFoldIndex,
                                 int cvFoldCount,
                                 int cvIterations,
@@ -945,16 +992,27 @@ public class TopicClassificationEngine e
         log.info(String.format("Performing evaluation %d-fold CV iteration 
%d/%d on classifier %s",
             cvFoldCount, cvFoldIndex + 1, cvIterations, engineName));
         long start = System.currentTimeMillis();
-        FileUtils.deleteQuietly(evaluationFolder);
-        evaluationFolder.mkdir();
+        final TopicClassificationEngine classifier = new 
TopicClassificationEngine();
         try {
-            EmbeddedSolrServer evaluationServer = 
EmbeddedSolrHelper.makeEmbeddedSolrServer(evaluationFolder,
-                "evaluationclassifierserver", "default-topic-model", 
"default-topic-model");
-            classifier.configure(getCanonicalConfiguration(evaluationServer));
+            if (managedSolrServer != null) {
+                // OSGi setup: the evaluation server will be generated 
automatically using the
+                // managedSolrServer
+                classifier.bindManagedSolrServer(managedSolrServer);
+                classifier.activate(context, 
getCanonicalConfiguration(engineName + "-evaluation"));
+            } else {
+                // non-OSGi runtime, need to do the setup manually
+                EmbeddedSolrServer evaluationServer = 
EmbeddedSolrHelper.makeEmbeddedSolrServer(tmpfolder,
+                    "evaluationclassifierserver", "default-topic-model", 
"default-topic-model");
+                
classifier.configure(getCanonicalConfiguration(evaluationServer));
+            }
         } catch (Exception e) {
             throw new ClassifierException(e);
         }
 
+        // clean all previous concepts from the evaluation classifier in case 
we are reusing an existing solr
+        // index from OSGi.
+        classifier.removeAllConcepts();
+
         // iterate over all the topics to register them in the evaluation 
classifier
         batchOverTopics(new BatchProcessor<SolrDocument>() {
             @Override
@@ -978,6 +1036,8 @@ public class TopicClassificationEngine e
 
         // build the model on the for the current train CV folds
         classifier.setCrossValidationInfo(cvFoldIndex, cvFoldCount);
+        // bind our new classifier to the same training set as the parent
+        classifier.setTrainingSet(getTrainingSet());
         classifier.updateModel(false);
 
         final int foldCount = cvFoldCount;
@@ -989,6 +1049,7 @@ public class TopicClassificationEngine e
             @Override
             public int process(List<SolrDocument> batch) throws 
TrainingSetException, ClassifierException {
                 int offset;
+                int updated = 0;
                 for (SolrDocument topicMetadata : batch) {
                     String topic = 
topicMetadata.getFirstValue(conceptUriField).toString();
                     List<String> topics = Arrays.asList(topic);
@@ -998,8 +1059,15 @@ public class TopicClassificationEngine e
                     int positiveSupport = 0;
                     offset = 0;
                     Batch<Example> examples = Batch.emtpyBatch(Example.class);
+                    boolean skipTopic = false;
                     do {
                         examples = 
getTrainingSet().getPositiveExamples(topics, examples.nextOffset);
+                        if (offset == 0 && examples.items.size() < 
MIN_EVALUATION_SAMPLES) {
+                            // we need a minimum amount of examples otherwise 
it's really not
+                            // worth computing statistics
+                            skipTopic = true;
+                            break;
+                        }
                         for (Example example : examples.items) {
                             if (!(offset % foldCount == foldIndex)) {
                                 // this example is not part of the test fold, 
skip it
@@ -1025,7 +1093,7 @@ public class TopicClassificationEngine e
                                 }
                             }
                         }
-                    } while (examples.hasMore && offset < 
MAX_EVALUATION_SAMPLES);
+                    } while (!skipTopic && examples.hasMore && offset < 
MAX_EVALUATION_SAMPLES);
 
                     List<String> falsePositiveExamples = new 
ArrayList<String>();
                     int falsePositives = 0;
@@ -1033,6 +1101,9 @@ public class TopicClassificationEngine e
                     offset = 0;
                     examples = Batch.emtpyBatch(Example.class);
                     do {
+                        if (skipTopic) {
+                            break;
+                        }
                         examples = 
getTrainingSet().getNegativeExamples(topics, examples.nextOffset);
                         for (Example example : examples.items) {
                             if (!(offset % foldCount == foldIndex)) {
@@ -1057,31 +1128,39 @@ public class TopicClassificationEngine e
                         }
                     } while (examples.hasMore && offset < 
MAX_EVALUATION_SAMPLES);
 
-                    // compute precision, recall and f1 score for the current 
test fold and topic
-                    float precision = 0;
-                    if (truePositives != 0 || falsePositives != 0) {
-                        precision = truePositives / (float) (truePositives + 
falsePositives);
-                    }
-                    float recall = 0;
-                    if (truePositives != 0 || falseNegatives != 0) {
-                        recall = truePositives / (float) (truePositives + 
falseNegatives);
+                    if (skipTopic) {
+                        log.debug("Skipping evaluation of {} because too few 
positive examples.", topic);
+                    } else {
+                        // compute precision, recall and f1 score for the 
current test fold and topic
+                        float precision = 0;
+                        if (truePositives != 0 || falsePositives != 0) {
+                            precision = truePositives / (float) (truePositives 
+ falsePositives);
+                        }
+                        float recall = 0;
+                        if (truePositives != 0 || falseNegatives != 0) {
+                            recall = truePositives / (float) (truePositives + 
falseNegatives);
+                        }
+                        updatePerformanceMetadata(topic, precision, recall, 
positiveSupport, negativeSupport,
+                            falsePositiveExamples, falseNegativeExamples);
+                        updated += 1;
                     }
-                    updatePerformanceMetadata(topic, precision, recall, 
positiveSupport, negativeSupport,
-                        falsePositiveExamples, falseNegativeExamples);
                 }
                 try {
                     getActiveSolrServer().commit();
                 } catch (Exception e) {
                     throw new ClassifierException(e);
                 }
-                return batch.size();
+                return updated;
             }
         });
 
-        float averageF1 = 0.0f;
         long stop = System.currentTimeMillis();
-        log.info(String.format("Finished CV iteration %d/%d on classifier %s 
in %fs. F1-score = %f",
-            cvFoldIndex + 1, cvFoldCount, engineName, (stop - start) / 1000.0, 
averageF1));
+        log.info(String.format("Finished CV iteration %d/%d on classifier %s 
in %fs.", cvFoldIndex + 1,
+            cvFoldCount, engineName, (stop - start) / 1000.0));
+        if (context != null) {
+            // close open trackers
+            classifier.deactivate(context);
+        }
         return updatedTopics;
     }
 
@@ -1120,6 +1199,9 @@ public class TopicClassificationEngine e
                 newEntry.setField(modelEvaluationDateField, 
UTCTimeStamper.nowUtcDate());
                 solrServer.add(newEntry);
             }
+            log.info(String.format("Performance for concept '%s': 
precision=%f, recall=%f,"
+                                   + " positiveSupport=%d, 
negativeSupport=%d", conceptId, precision, recall,
+                positiveSupport, negativeSupport));
         } catch (Exception e) {
             String msg = String
                     .format("Error updating performance metadata for topic 
'%s' on Solr Core '%s'",

Modified: 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TopicClassifier.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TopicClassifier.java?rev=1306206&r1=1306205&r2=1306206&view=diff
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TopicClassifier.java
 (original)
+++ 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TopicClassifier.java
 Wed Mar 28 07:52:46 2012
@@ -111,11 +111,18 @@ public interface TopicClassifier {
      * drawing examples from the dataset.
      * 
      * @param conceptUri
-     *            if of the topic to remove from the model
+     *            id of the topic to remove from the model, must not be null
      */
     void removeConcept(String conceptUri) throws ClassifierException;
 
     /**
+     * Remove all the concepts from the current model, leaving an empty 
model.
+     * 
+     * @throws ClassifierException
+     */
+    void removeAllConcepts() throws ClassifierException;
+
+    /**
      * @return the training set registered for this classifier (either set 
explicitly using setTrainingSet or
      *         configured through OSGi properties).
      */
@@ -171,7 +178,7 @@ public interface TopicClassifier {
     List<String> getChainNames() throws InvalidSyntaxException, ChainException;
 
     /**
-     * Initialize the concept hierarch of the model using the provided RDF 
model (e.g. a SKOS taxonomy).
+     * Initialize the concept hierarchy of the model using the provided RDF 
model (e.g. a SKOS taxonomy).
      * 
      * @return the number of concepts successfully imported (including roots).
      */

Modified: 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/training/SolrTrainingSet.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/training/SolrTrainingSet.java?rev=1306206&r1=1306205&r2=1306206&view=diff
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/training/SolrTrainingSet.java
 (original)
+++ 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/training/SolrTrainingSet.java
 Wed Mar 28 07:52:46 2012
@@ -93,14 +93,14 @@ public class SolrTrainingSet extends Con
 
     // TODO: make me configurable using an OSGi property
     protected int batchSize = 100;
-    
+
     @Reference(cardinality = ReferenceCardinality.OPTIONAL_UNARY, bind = 
"bindManagedSolrServer", unbind = "unbindManagedSolrServer", strategy = 
ReferenceStrategy.EVENT, policy = ReferencePolicy.DYNAMIC)
     protected ManagedSolrServer managedSolrServer;
 
     public String getName() {
         return trainingSetId;
     }
-    
+
     @Activate
     protected void activate(ComponentContext context) throws 
ConfigurationException, InvalidSyntaxException {
         indexArchiveName = "default-topic-trainingset";
@@ -284,4 +284,13 @@ public class SolrTrainingSet extends Con
         this.batchSize = batchSize;
     }
 
+    @Override
+    public void optimize() throws TrainingSetException {
+        try {
+            getActiveSolrServer().optimize();
+        } catch (Exception e) {
+            throw new TrainingSetException("Error optimizing training dataset 
" + getName(), e);
+        }
+    }
+
 }

Modified: 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/training/TrainingSet.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/training/TrainingSet.java?rev=1306206&r1=1306205&r2=1306206&view=diff
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/training/TrainingSet.java
 (original)
+++ 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/training/TrainingSet.java
 Wed Mar 28 07:52:46 2012
@@ -103,4 +103,9 @@ public interface TrainingSet {
      */
     boolean hasChangedSince(List<String> topics, Date referenceDate) throws 
TrainingSetException;
 
+    /**
+     * Trigger optimization of the underlying index. 
+     */
+    void optimize() throws TrainingSetException;
+
 }

Modified: 
incubator/stanbol/trunk/enhancer/topic-web/src/main/java/org/apache/stanbol/enhancer/web/topic/fragment/TopicClassifierFragment.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/topic-web/src/main/java/org/apache/stanbol/enhancer/web/topic/fragment/TopicClassifierFragment.java?rev=1306206&r1=1306205&r2=1306206&view=diff
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/topic-web/src/main/java/org/apache/stanbol/enhancer/web/topic/fragment/TopicClassifierFragment.java
 (original)
+++ 
incubator/stanbol/trunk/enhancer/topic-web/src/main/java/org/apache/stanbol/enhancer/web/topic/fragment/TopicClassifierFragment.java
 Wed Mar 28 07:52:46 2012
@@ -29,6 +29,7 @@ import org.apache.stanbol.commons.web.ba
 import org.apache.stanbol.commons.web.base.NavigationLink;
 import org.apache.stanbol.commons.web.base.ScriptResource;
 import org.apache.stanbol.commons.web.base.WebFragment;
+import org.apache.stanbol.commons.web.base.readers.GraphReader;
 import 
org.apache.stanbol.enhancer.web.topic.resource.TopicClassifierRootResource;
 import org.apache.stanbol.enhancer.web.topic.resource.TopicModelResource;
 import org.osgi.framework.BundleContext;
@@ -64,6 +65,7 @@ public class TopicClassifierFragment imp
         Set<Class<?>> classes = new HashSet<Class<?>>();
         classes.add(TopicClassifierRootResource.class);
         classes.add(TopicModelResource.class);
+        classes.add(GraphReader.class);
         return classes;
     }
 

Modified: 
incubator/stanbol/trunk/enhancer/topic-web/src/main/java/org/apache/stanbol/enhancer/web/topic/resource/TopicModelResource.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/topic-web/src/main/java/org/apache/stanbol/enhancer/web/topic/resource/TopicModelResource.java?rev=1306206&r1=1306205&r2=1306206&view=diff
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/topic-web/src/main/java/org/apache/stanbol/enhancer/web/topic/resource/TopicModelResource.java
 (original)
+++ 
incubator/stanbol/trunk/enhancer/topic-web/src/main/java/org/apache/stanbol/enhancer/web/topic/resource/TopicModelResource.java
 Wed Mar 28 07:52:46 2012
@@ -18,23 +18,39 @@ package org.apache.stanbol.enhancer.web.
 
 import static javax.ws.rs.core.MediaType.TEXT_HTML;
 import static org.apache.stanbol.commons.web.base.CorsHelper.addCORSOrigin;
+import static org.apache.stanbol.commons.web.base.CorsHelper.enableCORS;
+
+import java.util.List;
 
 import javax.servlet.ServletContext;
+import javax.ws.rs.Consumes;
+import javax.ws.rs.DELETE;
 import javax.ws.rs.GET;
+import javax.ws.rs.OPTIONS;
+import javax.ws.rs.POST;
 import javax.ws.rs.Path;
 import javax.ws.rs.PathParam;
 import javax.ws.rs.Produces;
+import javax.ws.rs.QueryParam;
 import javax.ws.rs.WebApplicationException;
 import javax.ws.rs.core.Context;
 import javax.ws.rs.core.HttpHeaders;
+import javax.ws.rs.core.MediaType;
 import javax.ws.rs.core.Response;
-import javax.ws.rs.core.UriInfo;
 import javax.ws.rs.core.Response.ResponseBuilder;
+import javax.ws.rs.core.UriInfo;
 
+import org.apache.clerezza.rdf.core.Graph;
+import org.apache.clerezza.rdf.core.UriRef;
 import org.apache.stanbol.commons.web.base.ContextHelper;
 import org.apache.stanbol.commons.web.base.resource.BaseStanbolResource;
 import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses;
+import org.apache.stanbol.enhancer.servicesapi.rdf.Properties;
+import org.apache.stanbol.enhancer.topic.ClassifierException;
 import org.apache.stanbol.enhancer.topic.TopicClassifier;
+import org.apache.stanbol.enhancer.topic.training.TrainingSet;
+import org.apache.stanbol.enhancer.topic.training.TrainingSetException;
 import org.osgi.framework.BundleContext;
 import org.osgi.framework.InvalidSyntaxException;
 import org.osgi.framework.ServiceReference;
@@ -65,6 +81,10 @@ public final class TopicModelResource ex
         classifier = (TopicClassifier) bundleContext.getService(references[0]);
     }
 
+    public TopicClassifier getClassifier() {
+        return classifier;
+    }
+
     @GET
     @Produces(TEXT_HTML)
     public Response get(@Context HttpHeaders headers) {
@@ -74,7 +94,197 @@ public final class TopicModelResource ex
         return rb.build();
     }
 
-    public TopicClassifier getClassifier() {
-        return classifier;
+    @OPTIONS
+    @Path("concept")
+    public Response handleCorsPreflightOnConcept(@Context HttpHeaders headers) 
{
+        ResponseBuilder res = Response.ok();
+        enableCORS(servletContext, res, headers);
+        return res.build();
+    }
+
+    // TODO: make it possible to fetch concept descriptions (with broader and 
narrower links) using the GET
+    // verb
+
+    @POST
+    @Path("concept")
+    @Consumes(MediaType.WILDCARD)
+    public Response addConcept(@QueryParam(value = "id") String concept,
+                               @QueryParam(value = "broader") List<String> 
broaderConcepts,
+                               @Context HttpHeaders headers) throws 
ClassifierException {
+        classifier.addConcept(concept, broaderConcepts);
+        ResponseBuilder rb = Response.ok();
+        addCORSOrigin(servletContext, rb, headers);
+        return rb.build();
+    }
+
+    @DELETE
+    @Path("concept")
+    @Consumes(MediaType.WILDCARD)
+    public Response remoteConcept(@QueryParam(value = "id") String concept, 
@Context HttpHeaders headers) throws ClassifierException {
+        if (concept != null && !concept.isEmpty()) {
+            classifier.removeConcept(concept);
+        } else {
+            classifier.removeAllConcepts();
+        }
+        // TODO: count the number of deleted entries and return it as a text 
entity
+        ResponseBuilder rb = Response.ok();
+        addCORSOrigin(servletContext, rb, headers);
+        return rb.build();
+    }
+
+    @OPTIONS
+    @Path("performance")
+    public Response handleCorsPreflightOnPerformance(@Context HttpHeaders 
headers) {
+        ResponseBuilder res = Response.ok();
+        enableCORS(servletContext, res, headers);
+        return res.build();
+    }
+
+    // TODO: make it possible to fetch performance reports and evaluation 
running state using the GET verb
+
+    @POST
+    @Path("performance")
+    @Consumes(MediaType.WILDCARD)
+    public Response updatePerformance(@QueryParam(value = "incremental") 
Boolean incremental,
+                                      @Context HttpHeaders headers) throws 
TrainingSetException,
+                                                                   
ClassifierException {
+        if (incremental == null) {
+            incremental = Boolean.TRUE;
+        }
+        int updated = classifier.updatePerformanceEstimates(incremental);
+        ResponseBuilder rb = Response.ok(String.format(
+            "Successfully updated the performance estimates of %d 
concept(s).\n", updated));
+        addCORSOrigin(servletContext, rb, headers);
+        return rb.build();
+    }
+
+    @OPTIONS
+    @Path("trainer")
+    public Response handleCorsPreflightOnTrainer(@Context HttpHeaders headers) 
{
+        ResponseBuilder res = Response.ok();
+        enableCORS(servletContext, res, headers);
+        return res.build();
+    }
+
+    // TODO: make it possible to fetch training set statistics and training 
state using the GET verb
+
+    @POST
+    @Path("trainer")
+    @Consumes(MediaType.WILDCARD)
+    public Response updateModel(@QueryParam(value = "incremental") Boolean 
incremental,
+                                @Context HttpHeaders headers) throws 
TrainingSetException,
+                                                             
ClassifierException {
+        if (incremental == null) {
+            incremental = Boolean.TRUE;
+        }
+        int updated = classifier.updateModel(incremental);
+        ResponseBuilder rb = Response.ok(String.format(
+            "Successfully updated the statistical model(s) of %d 
concept(s).\n", updated));
+        addCORSOrigin(servletContext, rb, headers);
+        return rb.build();
+    }
+
+    public Response handleCorsPreflightOnTrainingSet(@Context HttpHeaders 
headers) {
+        ResponseBuilder res = Response.ok();
+        enableCORS(servletContext, res, headers);
+        return res.build();
+    }
+
+    // TODO: make it possible to browse the training set content on the GET verb 
using a subresource
+
+    @POST
+    @Path("trainingset")
+    @Consumes(MediaType.TEXT_PLAIN)
+    public Response registerExample(@QueryParam(value = "example_id") String 
exampleId,
+                                    @QueryParam(value = "concept") 
List<String> concepts,
+                                    String textContent,
+                                    @Context HttpHeaders headers) throws 
TrainingSetException,
+                                                                 
ClassifierException {
+        ResponseBuilder rb;
+        if (!classifier.isUpdatable()) {
+            rb = Response.status(Response.Status.BAD_REQUEST).entity(
+                String.format("Classifier %s is not updateble.\n", 
classifier.getName()));
+        } else {
+            TrainingSet trainingSet = classifier.getTrainingSet();
+            exampleId = trainingSet.registerExample(exampleId, textContent, 
concepts);
+            // TODO: make example GETable resources and return a 201 to it 
instead of a simple message.
+            rb = Response.ok(String.format(
+                "Successfully added or updated example '%s' in training set 
'%s'.\n", exampleId,
+                trainingSet.getName()));
+        }
+        addCORSOrigin(servletContext, rb, headers);
+        return rb.build();
+    }
+
+    // TODO make the following a DELETE method on the example sub-resources 
them-selves once we have a GET for
+    // them
+
+    @DELETE
+    @Path("trainingset")
+    @Consumes(MediaType.WILDCARD)
+    public Response removeExample(@QueryParam(value = "example_id") 
List<String> exampleIds,
+                                  @Context HttpHeaders headers) throws 
TrainingSetException,
+                                                               
ClassifierException {
+        ResponseBuilder rb;
+        if (!classifier.isUpdatable()) {
+            rb = Response.status(Response.Status.BAD_REQUEST).entity(
+                String.format("Classifier %s is not updateble.\n", 
classifier.getName()));
+        } else {
+            TrainingSet trainingSet = classifier.getTrainingSet();
+            if (exampleIds != null && !exampleIds.isEmpty()) {
+                for (String exampleId : exampleIds) {
+                    trainingSet.registerExample(exampleId, null, null);
+                }
+            } else {
+                // implement a way to cleanup a complete training set? or is 
it too dangerous and we should
+                // return an error instead?
+            }
+            rb = Response.ok(String.format("Successfully deleted examples in 
training set '%s'.\n",
+                trainingSet.getName()));
+        }
+        addCORSOrigin(servletContext, rb, headers);
+        return rb.build();
+    }
+
+    @OPTIONS
+    public Response handleCorsPreflight(@Context HttpHeaders headers) {
+        ResponseBuilder res = Response.ok();
+        enableCORS(servletContext, res, headers);
+        return res.build();
+    }
+
+    /**
+     * Simple RDF / SKOS importer that loads the complete model in memory for 
easy parsing and then does graph
+     * introspection to find the concepts to load into the model.
+     * 
+     * If a scalable implementation is required, one should probably use a 
transient triple store and pass it
+     * the raw RDF stream instead of using the naive GraphReader JAX-RS 
provider.
+     */
+    @POST
+    @Consumes(MediaType.WILDCARD)
+    public Response importConceptsFromRDF(@QueryParam(value = "concept_class") 
String conceptClassUri,
+                                          @QueryParam(value = 
"broader_property") String broaderPropertyUri,
+                                          Graph graph,
+                                          @Context HttpHeaders headers) throws 
ClassifierException {
+        UriRef conceptClass = OntologicalClasses.SKOS_CONCEPT;
+        UriRef broaderProperty = Properties.SKOS_BROADER;
+        if (conceptClassUri != null && !conceptClassUri.isEmpty()) {
+            conceptClass = new UriRef(conceptClassUri);
+        }
+        if (broaderPropertyUri != null && !broaderPropertyUri.isEmpty()) {
+            broaderProperty = new UriRef(broaderPropertyUri);
+        }
+        int imported = classifier.importConceptsFromGraph(graph, conceptClass, 
broaderProperty);
+        ResponseBuilder rb;
+        if (imported == 0) {
+            rb = Response.status(Response.Status.BAD_REQUEST).entity(
+                String.format("Could not find any instances of '%s' in 
payload.\n",
+                    conceptClass.getUnicodeString()));
+        } else {
+            rb = Response.ok(String.format("Imported %d instance of '%s'.\n", 
imported,
+                conceptClass.getUnicodeString()));
+        }
+        addCORSOrigin(servletContext, rb, headers);
+        return rb.build();
     }
 }

Added: incubator/stanbol/trunk/enhancer/topic-web/tools/newsmlimporter.py
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/topic-web/tools/newsmlimporter.py?rev=1306206&view=auto
==============================================================================
--- incubator/stanbol/trunk/enhancer/topic-web/tools/newsmlimporter.py (added)
+++ incubator/stanbol/trunk/enhancer/topic-web/tools/newsmlimporter.py Wed Mar 
28 07:52:46 2012
@@ -0,0 +1,108 @@
+#!/usr/bin/env python
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Basic python script to load NewsML documents as training set
+
+Need Python 2.7 and lxml.
+
+TODO: port to Python 3 as well if not working by default.
+"""
+from __future__ import print_function
+
+import os
+from lxml import html
+from lxml import etree
+from urllib import quote
+import urllib2
+from hashlib import sha1
+
+
+IPTC_SUBJECT_PREFIX = "http://cv.iptc.org/newscodes/subjectcode/";
+
+
def find_text_and_subjects(newsml_content,
                           subject_tags=('SubjectMatter', 'SubjectDetail'),
                           text_tags=('HeadLine',),
                           html_tags=('body.content',)):
    """Extract plain text and IPTC subject codes from a NewsML document.

    Returns a ``(text, subjects)`` tuple: ``text`` is the headline and body
    content joined by blank lines, ``subjects`` is the list of IPTC subject
    code URIs found in the document.
    """
    # First parse of the document as XML for the structured attributes.
    xtree = etree.ElementTree(etree.fromstring(newsml_content))
    # Guard with "if e.text": elements without text have e.text set to None
    # and calling .strip() on them would raise AttributeError.
    text_items = [e.text.strip()
                  for tag in text_tags
                  for e in xtree.findall('//' + tag)
                  if e.text]
    subjects = [IPTC_SUBJECT_PREFIX + e.get('FormalName')
                for tag in subject_tags
                for e in xtree.findall('//' + tag)
                if e.get('FormalName')]

    # Then use an HTML parser for the parts that look like HTML, hence we can
    # leverage the text_content method.
    htree = etree.ElementTree(html.document_fromstring(newsml_content))

    text_items += [e.text_content().strip()
                   for tag in html_tags
                   for e in htree.findall('//' + tag)]
    text = "\n\n".join(text_items)
    return text, subjects
+
+
def register_newsml_document(text, codes, url):
    """POST a training example to the Stanbol trainingset REST endpoint.

    The example id is derived from the SHA1 of the UTF-8 encoded text so that
    re-importing the same document updates the same example.
    """
    # Hash the UTF-8 bytes: hashing the unicode string directly raises
    # UnicodeEncodeError for non-ASCII content (implicit ASCII encode).
    # Also renamed the local from 'id' to avoid shadowing the builtin.
    example_id = sha1(text.encode('utf-8')).hexdigest()
    url += "?example_id=%s" % example_id
    for code in codes:
        url += "&concept=%s" % quote(code)
    print("Calling:", url)
    request = urllib2.Request(url, data=text.encode('utf-8'))
    request.add_header('Content-Type', 'text/plain')
    opener = urllib2.build_opener()
    print(opener.open(request).read())
+
+
def print_newsml_summary(text, codes, server_url=None):
    """Debug handler: print the first paragraph and subject codes of a document."""
    first_paragraph = text.split('\n\n')[0]
    print(first_paragraph)
    for code in codes:
        print('code: ' + code)
    print()
+
+
if __name__ == "__main__":
    import sys

    # TODO: use argparse and a debug switch to select print_newsml_summary
    # instead of the default handler.
    topfolder = sys.argv[1]
    # Renamed from 'max' to avoid shadowing the builtin.
    max_documents = int(sys.argv[2])
    server_url = sys.argv[3]
    handle_news = register_newsml_document

    count = 0
    for dirpath, dirnames, filenames in os.walk(topfolder):
        if count >= max_documents:
            break

        # Skip Subversion metadata folders in-place so os.walk does not recurse.
        if '.svn' in dirnames:
            dirnames.remove('.svn')

        for filename in filenames:
            if count >= max_documents:
                break
            if not filename.endswith('.xml'):
                continue
            # os.walk already yields dirpath including topfolder: joining
            # topfolder again duplicated the prefix for relative paths
            # (e.g. 'top/top/sub/file.xml'), breaking the open() below.
            full_path = os.path.join(dirpath, filename)
            # Close the file handle deterministically.
            with open(full_path, 'rb') as newsml_file:
                newsml_content = newsml_file.read()
            text, codes = find_text_and_subjects(newsml_content)
            if len(codes) == 0:
                # ignore documents without subject info
                continue
            handle_news(text, codes, server_url)
            count += 1


Reply via email to