Author: ogrisel
Date: Wed Mar 28 07:52:46 2012
New Revision: 1306206
URL: http://svn.apache.org/viewvc?rev=1306206&view=rev
Log:
STANBOL-197: basic REST API to train a classifier model + various OSGi fixes
Added:
incubator/stanbol/trunk/enhancer/topic-web/tools/
incubator/stanbol/trunk/enhancer/topic-web/tools/newsmlimporter.py
Modified:
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TopicClassifier.java
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/training/SolrTrainingSet.java
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/training/TrainingSet.java
incubator/stanbol/trunk/enhancer/topic-web/src/main/java/org/apache/stanbol/enhancer/web/topic/fragment/TopicClassifierFragment.java
incubator/stanbol/trunk/enhancer/topic-web/src/main/java/org/apache/stanbol/enhancer/web/topic/resource/TopicModelResource.java
Modified:
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java?rev=1306206&r1=1306205&r2=1306206&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java Wed Mar 28 07:52:46 2012
@@ -117,6 +117,17 @@ import org.slf4j.LoggerFactory;
*
 * The Solr server is expected to be configured with the MoreLikeThisHandler and the matching fields from the
 * engine configuration.
+ *
+ * This text classification method is often called "Rocchio classification" or "Nearest Centroid
+ * classification" in the IR and machine learning literature. It is usually slightly less accurate than fitting
+ * a penalized linear model such as linear kernel Support Vector Machines or penalized Logistic Regression, but
+ * it has the advantage of scaling to a large number of categories (e.g. more than tens of thousands) without
+ * having to load the full statistical model in memory, thanks to the inverted index data structure, which also
+ * provides the feature extraction and TF-IDF weighting for free.
+ *
+ * Furthermore it could be refined with a "Learning to Rank" approach: training a RankSVM, Gradient Boosted
+ * Trees or Random Forests on the raw output of the Rocchio classifier so as to re-rank the candidate classes
+ * more finely. The Learning to Rank refinement is not implemented yet.
*/
@Component(metatype = true, immediate = true, configurationFactory = true,
policy = ConfigurationPolicy.REQUIRE)
@Service
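
A quick illustration of the approach described in the Javadoc above: "Nearest Centroid" / Rocchio
classification builds one centroid vector per class by averaging the TF-IDF vectors of that class'
examples and ranks classes by similarity to the input document. The sketch below is illustrative
Python only; it is not part of the patch, none of these names exist in the Stanbol code base, and the
engine itself delegates the TF-IDF weighting and centroid-like matching to the Solr inverted index
through the MoreLikeThisHandler.

    import math
    from collections import Counter, defaultdict

    def tf_idf_vectors(tokenized_docs):
        """Turn tokenized documents into simple TF-IDF dictionaries."""
        df = Counter()
        for tokens in tokenized_docs:
            df.update(set(tokens))
        n = len(tokenized_docs)
        vectors = []
        for tokens in tokenized_docs:
            tf = Counter(tokens)
            vectors.append({t: tf[t] * math.log(float(n) / df[t]) for t in tf})
        return vectors

    def train_centroids(vectors, labels):
        """Average the TF-IDF vectors of each class into one centroid per class."""
        sums, counts = defaultdict(Counter), Counter()
        for vector, label in zip(vectors, labels):
            sums[label].update(vector)
            counts[label] += 1
        return {label: {t: w / counts[label] for t, w in acc.items()}
                for label, acc in sums.items()}

    def cosine(a, b):
        dot = sum(w * b.get(t, 0.0) for t, w in a.items())
        norm = math.sqrt(sum(w * w for w in a.values())) * math.sqrt(sum(w * w for w in b.values()))
        return dot / norm if norm else 0.0

    def suggest(vector, centroids, max_suggestions=5):
        """Rank classes by cosine similarity to their centroid (cf. MAX_SUGGESTIONS)."""
        ranked = sorted(((cosine(vector, c), label) for label, c in centroids.items()), reverse=True)
        return ranked[:max_suggestions]

    docs = [["madrid", "rain", "weather"], ["rain", "forecast"], ["election", "vote"]]
    labels = ["weather", "weather", "politics"]
    centroids = train_centroids(tf_idf_vectors(docs), labels)
    print(suggest(Counter(["rain", "madrid"]), centroids))  # 'weather' ranked first

The advantage stated in the Javadoc is precisely that the per-class centroids never have to be
materialized in memory at once: the inverted index performs the equivalent matching, which is what
lets the engine scale to tens of thousands of categories.
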
@@ -202,16 +213,18 @@ public class TopicClassificationEngine e
// TODO: make the following fields configurable
- private int MAX_COLLECTED_EXAMPLES = 100;
+ private int MAX_COLLECTED_EXAMPLES = 1000;
+
+ public int MAX_EVALUATION_SAMPLES = 200;
- public int MAX_EVALUATION_SAMPLES = 1000;
+ public int MIN_EVALUATION_SAMPLES = 10;
public int MAX_CHARS_PER_TOPIC = 100000;
public Integer MAX_ROOTS = 1000;
     public int MAX_SUGGESTIONS = 5; // never suggest more than this: this is expected to be a reasonable
-                                    // estimate of the number of topics occuring in each documents
+                                    // estimate of the number of topics occurring in each document
protected String engineName;
@@ -267,15 +280,20 @@ public class TopicClassificationEngine e
protected int cvFoldCount = 0;
- protected File evaluationFolder;
+ protected boolean evaluationRunning = false;
     @Reference(cardinality = ReferenceCardinality.OPTIONAL_UNARY, bind = "bindManagedSolrServer", unbind = "unbindManagedSolrServer", strategy = ReferenceStrategy.EVENT, policy = ReferencePolicy.DYNAMIC)
-    protected ManagedSolrServer managedSolrServer;
+    protected ManagedSolrServer managedSolrServerDummy; // trick to call the super class binders
@Activate
     protected void activate(ComponentContext context) throws ConfigurationException, InvalidSyntaxException {
         @SuppressWarnings("unchecked")
         Dictionary<String,Object> config = context.getProperties();
+        activate(context, config);
+    }
+
+    protected void activate(ComponentContext context, Dictionary<String,Object> config) throws ConfigurationException,
+                                                                                        InvalidSyntaxException {
this.context = context;
indexArchiveName = "default-topic-model";
configure(config);
@@ -288,6 +306,9 @@ public class TopicClassificationEngine e
.createFilter(filter), null);
trainingSetTracker.open();
}
+        // TODO: if the training set is null, make it possible to programmatically create a SolrTrainingSet
+        // instance using the same managed Solr server and register it under the same name as the engine
+        // itself.
}
@Deactivate
@@ -659,14 +680,29 @@ public class TopicClassificationEngine e
}
@Override
+ public void removeAllConcepts() throws ClassifierException {
+ SolrServer solrServer = getActiveSolrServer();
+ try {
+ solrServer.deleteByQuery("*:*");
+ solrServer.commit();
+ } catch (Exception e) {
+            String msg = String.format("Error deleting concepts from Solr Core '%s'", solrCoreId);
+ throw new ClassifierException(msg, e);
+ }
+ }
+
+ @Override
public void removeConcept(String conceptId) throws ClassifierException {
+ if (conceptId == null || conceptId.isEmpty()) {
+            throw new ClassifierException("conceptId must not be null or empty");
+ }
SolrServer solrServer = getActiveSolrServer();
try {
             solrServer.deleteByQuery(conceptUriField + ":" + ClientUtils.escapeQueryChars(conceptId));
             solrServer.commit();
         } catch (Exception e) {
-            String msg = String.format("Error removing topic with id '%s' on Solr Core '%s'", conceptId,
-                solrCoreId);
+            String msg = String
+                    .format("Error removing concept '%s' on Solr Core '%s'", conceptId, solrCoreId);
throw new ClassifierException(msg, e);
}
}
@@ -878,7 +914,7 @@ public class TopicClassificationEngine e
cvFoldCount = foldCount;
}
-    protected Dictionary<String,Object> getCanonicalConfiguration(EmbeddedSolrServer server) {
+    protected Dictionary<String,Object> getCanonicalConfiguration(Object server) {
         Hashtable<String,Object> config = new Hashtable<String,Object>();
         config.put(EnhancementEngine.PROPERTY_NAME, engineName + "-evaluation");
config.put(TopicClassificationEngine.ENTRY_ID_FIELD, "entry_id");
@@ -901,39 +937,50 @@ public class TopicClassificationEngine e
}
public boolean isEvaluationRunning() {
- return evaluationFolder != null;
+ return evaluationRunning;
}
-    public int updatePerformanceEstimates(boolean incremental) throws ClassifierException,
-                                                               TrainingSetException {
-        if (evaluationFolder != null) {
+    synchronized public int updatePerformanceEstimates(boolean incremental) throws ClassifierException,
+                                                                            TrainingSetException {
+        checkTrainingSet();
+        if (evaluationRunning) {
             throw new ClassifierException("Another evaluation is already running");
         }
int updatedTopics = 0;
- int cvFoldCount = 3; // 3-folds CV is hardcoded for now
- int cvIterationCount = 1; // only one 3-folds CV iteration
-
- TopicClassificationEngine classifier = new TopicClassificationEngine();
- classifier.setTrainingSet(trainingSet);
+ File tmpfolder = null;
try {
+            tmpfolder = File.createTempFile("stanbol-evaluation-folder-", ".tmp");
+            tmpfolder.delete();
+            tmpfolder.mkdir();
+            evaluationRunning = true;
+            int cvFoldCount = 3; // 3-folds CV is hardcoded for now
+            int cvIterationCount = 1; // only one 3-folds CV iteration
+
+            // We will use the training set quite intensively, ensure that the index is packed and its
+            // statistics are up to date
+            getTrainingSet().optimize();
+
             // TODO: make the temporary folder path configurable with a property
-            evaluationFolder = File.createTempFile("stanbol-classifier-evaluation-", "-solr");
             for (int cvFoldIndex = 0; cvFoldIndex < cvIterationCount; cvFoldIndex++) {
-                updatedTopics = performCVFold(classifier, cvFoldIndex, cvFoldCount, cvIterationCount,
+                updatedTopics = performCVFold(tmpfolder, cvFoldIndex, cvFoldCount, cvIterationCount,
                     incremental);
             }
}
+ SolrServer solrServer = getActiveSolrServer();
+ solrServer.optimize();
} catch (ConfigurationException e) {
throw new ClassifierException(e);
} catch (IOException e) {
throw new ClassifierException(e);
+ } catch (SolrServerException e) {
+ throw new ClassifierException(e);
} finally {
- FileUtils.deleteQuietly(evaluationFolder);
- evaluationFolder = null;
+ FileUtils.deleteQuietly(tmpfolder);
+ evaluationRunning = false;
}
return updatedTopics;
}
- protected int performCVFold(final TopicClassificationEngine classifier,
+ protected int performCVFold(File tmpfolder,
int cvFoldIndex,
int cvFoldCount,
int cvIterations,
@@ -945,16 +992,27 @@ public class TopicClassificationEngine e
         log.info(String.format("Performing evaluation %d-fold CV iteration %d/%d on classifier %s",
             cvFoldCount, cvFoldIndex + 1, cvIterations, engineName));
long start = System.currentTimeMillis();
- FileUtils.deleteQuietly(evaluationFolder);
- evaluationFolder.mkdir();
+        final TopicClassificationEngine classifier = new TopicClassificationEngine();
try {
-            EmbeddedSolrServer evaluationServer = EmbeddedSolrHelper.makeEmbeddedSolrServer(evaluationFolder,
-                "evaluationclassifierserver", "default-topic-model", "default-topic-model");
-            classifier.configure(getCanonicalConfiguration(evaluationServer));
+            if (managedSolrServer != null) {
+                // OSGi setup: the evaluation server will be generated automatically using the
+                // managedSolrServer
+                classifier.bindManagedSolrServer(managedSolrServer);
+                classifier.activate(context, getCanonicalConfiguration(engineName + "-evaluation"));
+            } else {
+                // non-OSGi runtime, need to do the setup manually
+                EmbeddedSolrServer evaluationServer = EmbeddedSolrHelper.makeEmbeddedSolrServer(tmpfolder,
+                    "evaluationclassifierserver", "default-topic-model", "default-topic-model");
+                classifier.configure(getCanonicalConfiguration(evaluationServer));
+            }
} catch (Exception e) {
throw new ClassifierException(e);
}
+        // clean all previous concepts from the evaluation classifier in case we are reusing an existing solr
+        // index from OSGi.
+ classifier.removeAllConcepts();
+
         // iterate over all the topics to register them in the evaluation classifier
batchOverTopics(new BatchProcessor<SolrDocument>() {
@Override
@@ -978,6 +1036,8 @@ public class TopicClassificationEngine e
// build the model on the for the current train CV folds
classifier.setCrossValidationInfo(cvFoldIndex, cvFoldCount);
+        // bind our new classifier to the same training set as the parent
classifier.updateModel(false);
final int foldCount = cvFoldCount;
@@ -989,6 +1049,7 @@ public class TopicClassificationEngine e
@Override
             public int process(List<SolrDocument> batch) throws TrainingSetException, ClassifierException {
int offset;
+ int updated = 0;
for (SolrDocument topicMetadata : batch) {
                     String topic = topicMetadata.getFirstValue(conceptUriField).toString();
List<String> topics = Arrays.asList(topic);
@@ -998,8 +1059,15 @@ public class TopicClassificationEngine e
int positiveSupport = 0;
offset = 0;
Batch<Example> examples = Batch.emtpyBatch(Example.class);
+ boolean skipTopic = false;
do {
                         examples = getTrainingSet().getPositiveExamples(topics, examples.nextOffset);
+                        if (offset == 0 && examples.items.size() < MIN_EVALUATION_SAMPLES) {
+                            // we need a minimum amount of examples, otherwise it's really not
+                            // worth computing statistics
+ skipTopic = true;
+ break;
+ }
for (Example example : examples.items) {
if (!(offset % foldCount == foldIndex)) {
                                 // this example is not part of the test fold, skip it
@@ -1025,7 +1093,7 @@ public class TopicClassificationEngine e
}
}
}
-                    } while (examples.hasMore && offset < MAX_EVALUATION_SAMPLES);
+                    } while (!skipTopic && examples.hasMore && offset < MAX_EVALUATION_SAMPLES);
                     List<String> falsePositiveExamples = new ArrayList<String>();
int falsePositives = 0;
@@ -1033,6 +1101,9 @@ public class TopicClassificationEngine e
offset = 0;
examples = Batch.emtpyBatch(Example.class);
do {
+ if (skipTopic) {
+ break;
+ }
                         examples = getTrainingSet().getNegativeExamples(topics, examples.nextOffset);
for (Example example : examples.items) {
if (!(offset % foldCount == foldIndex)) {
@@ -1057,31 +1128,39 @@ public class TopicClassificationEngine e
}
                     } while (examples.hasMore && offset < MAX_EVALUATION_SAMPLES);
-                    // compute precision, recall and f1 score for the current test fold and topic
-                    float precision = 0;
-                    if (truePositives != 0 || falsePositives != 0) {
-                        precision = truePositives / (float) (truePositives + falsePositives);
-                    }
-                    float recall = 0;
-                    if (truePositives != 0 || falseNegatives != 0) {
-                        recall = truePositives / (float) (truePositives + falseNegatives);
+                    if (skipTopic) {
+                        log.debug("Skipping evaluation of {} because too few positive examples.", topic);
+                    } else {
+                        // compute precision, recall and f1 score for the current test fold and topic
+                        float precision = 0;
+                        if (truePositives != 0 || falsePositives != 0) {
+                            precision = truePositives / (float) (truePositives + falsePositives);
+                        }
+                        float recall = 0;
+                        if (truePositives != 0 || falseNegatives != 0) {
+                            recall = truePositives / (float) (truePositives + falseNegatives);
+                        }
+                        updatePerformanceMetadata(topic, precision, recall, positiveSupport, negativeSupport,
+                            falsePositiveExamples, falseNegativeExamples);
+                        updated += 1;
                     }
-                    updatePerformanceMetadata(topic, precision, recall, positiveSupport, negativeSupport,
-                        falsePositiveExamples, falseNegativeExamples);
                 }
try {
getActiveSolrServer().commit();
} catch (Exception e) {
throw new ClassifierException(e);
}
- return batch.size();
+ return updated;
}
});
- float averageF1 = 0.0f;
long stop = System.currentTimeMillis();
-        log.info(String.format("Finished CV iteration %d/%d on classifier %s in %fs. F1-score = %f",
-            cvFoldIndex + 1, cvFoldCount, engineName, (stop - start) / 1000.0, averageF1));
+        log.info(String.format("Finished CV iteration %d/%d on classifier %s in %fs.", cvFoldIndex + 1,
+            cvFoldCount, engineName, (stop - start) / 1000.0));
+ if (context != null) {
+ // close open trackers
+ classifier.deactivate(context);
+ }
return updatedTopics;
}
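
To make the fold bookkeeping in performCVFold easier to follow: for each topic, the example at offset i
belongs to the test fold when i % cvFoldCount == cvFoldIndex and to the training folds otherwise, topics
with fewer than MIN_EVALUATION_SAMPLES positive examples are skipped, and precision / recall are guarded
against zero denominators. A rough standalone restatement of that accounting, in illustrative Python that
is not part of the patch:

    def split_fold(example_ids, fold_index, fold_count):
        """Examples whose position matches the fold index form the test fold."""
        test = [e for i, e in enumerate(example_ids) if i % fold_count == fold_index]
        train = [e for i, e in enumerate(example_ids) if i % fold_count != fold_index]
        return train, test

    def precision_recall(true_positives, false_positives, false_negatives):
        """Same zero-denominator guards as in performCVFold."""
        precision = recall = 0.0
        if true_positives or false_positives:
            precision = true_positives / float(true_positives + false_positives)
        if true_positives or false_negatives:
            recall = true_positives / float(true_positives + false_negatives)
        return precision, recall

    MIN_EVALUATION_SAMPLES = 10  # topics with fewer positive examples are skipped above

    train, test = split_fold(range(100), fold_index=0, fold_count=3)
    print(len(train), len(test))        # 66 34
    print(precision_recall(8, 2, 4))    # (0.8, 0.666...)
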
@@ -1120,6 +1199,9 @@ public class TopicClassificationEngine e
newEntry.setField(modelEvaluationDateField,
UTCTimeStamper.nowUtcDate());
solrServer.add(newEntry);
}
+            log.info(String.format("Performance for concept '%s': precision=%f, recall=%f,"
+                    + " positiveSupport=%d, negativeSupport=%d", conceptId, precision, recall, positiveSupport, negativeSupport));
} catch (Exception e) {
String msg = String
                     .format("Error updating performance metadata for topic '%s' on Solr Core '%s'",
Modified:
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TopicClassifier.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TopicClassifier.java?rev=1306206&r1=1306205&r2=1306206&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TopicClassifier.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TopicClassifier.java Wed Mar 28 07:52:46 2012
@@ -111,11 +111,18 @@ public interface TopicClassifier {
* drawing examples from the dataset.
*
* @param conceptUri
- * if of the topic to remove from the model
+     *            id of the topic to remove from the model, must not be null
*/
void removeConcept(String conceptUri) throws ClassifierException;
/**
+     * Remove all the concepts from the current model, leaving an empty model.
+ *
+ * @throws ClassifierException
+ */
+ void removeAllConcepts() throws ClassifierException;
+
+ /**
     * @return the training set registered for this classifier (either set explicitly using setTrainingSet or
* configured through OSGi properties).
*/
@@ -171,7 +178,7 @@ public interface TopicClassifier {
List<String> getChainNames() throws InvalidSyntaxException, ChainException;
/**
-     * Initialize the concept hierarch of the model using the provided RDF model (e.g. a SKOS taxonomy).
+     * Initialize the concept hierarchy of the model using the provided RDF model (e.g. a SKOS taxonomy).
*
* @return the number of concepts successfully imported (including roots).
*/
Modified:
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/training/SolrTrainingSet.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/training/SolrTrainingSet.java?rev=1306206&r1=1306205&r2=1306206&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/training/SolrTrainingSet.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/training/SolrTrainingSet.java Wed Mar 28 07:52:46 2012
@@ -93,14 +93,14 @@ public class SolrTrainingSet extends Con
// TODO: make me configurable using an OSGi property
protected int batchSize = 100;
-
+
     @Reference(cardinality = ReferenceCardinality.OPTIONAL_UNARY, bind = "bindManagedSolrServer", unbind = "unbindManagedSolrServer", strategy = ReferenceStrategy.EVENT, policy = ReferencePolicy.DYNAMIC)
protected ManagedSolrServer managedSolrServer;
public String getName() {
return trainingSetId;
}
-
+
@Activate
     protected void activate(ComponentContext context) throws ConfigurationException, InvalidSyntaxException {
indexArchiveName = "default-topic-trainingset";
@@ -284,4 +284,13 @@ public class SolrTrainingSet extends Con
this.batchSize = batchSize;
}
+ @Override
+ public void optimize() throws TrainingSetException {
+ try {
+ getActiveSolrServer().optimize();
+ } catch (Exception e) {
+            throw new TrainingSetException("Error optimizing training dataset " + getName(), e);
+ }
+ }
+
}
Modified:
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/training/TrainingSet.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/training/TrainingSet.java?rev=1306206&r1=1306205&r2=1306206&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/training/TrainingSet.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/training/TrainingSet.java Wed Mar 28 07:52:46 2012
@@ -103,4 +103,9 @@ public interface TrainingSet {
*/
     boolean hasChangedSince(List<String> topics, Date referenceDate) throws TrainingSetException;
+ /**
+ * Trigger optimization of the underlying index.
+ */
+ void optimize() throws TrainingSetException;
+
}
Modified:
incubator/stanbol/trunk/enhancer/topic-web/src/main/java/org/apache/stanbol/enhancer/web/topic/fragment/TopicClassifierFragment.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/topic-web/src/main/java/org/apache/stanbol/enhancer/web/topic/fragment/TopicClassifierFragment.java?rev=1306206&r1=1306205&r2=1306206&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/topic-web/src/main/java/org/apache/stanbol/enhancer/web/topic/fragment/TopicClassifierFragment.java (original)
+++ incubator/stanbol/trunk/enhancer/topic-web/src/main/java/org/apache/stanbol/enhancer/web/topic/fragment/TopicClassifierFragment.java Wed Mar 28 07:52:46 2012
@@ -29,6 +29,7 @@ import org.apache.stanbol.commons.web.ba
import org.apache.stanbol.commons.web.base.NavigationLink;
import org.apache.stanbol.commons.web.base.ScriptResource;
import org.apache.stanbol.commons.web.base.WebFragment;
+import org.apache.stanbol.commons.web.base.readers.GraphReader;
 import org.apache.stanbol.enhancer.web.topic.resource.TopicClassifierRootResource;
import org.apache.stanbol.enhancer.web.topic.resource.TopicModelResource;
import org.osgi.framework.BundleContext;
@@ -64,6 +65,7 @@ public class TopicClassifierFragment imp
Set<Class<?>> classes = new HashSet<Class<?>>();
classes.add(TopicClassifierRootResource.class);
classes.add(TopicModelResource.class);
+ classes.add(GraphReader.class);
return classes;
}
Modified:
incubator/stanbol/trunk/enhancer/topic-web/src/main/java/org/apache/stanbol/enhancer/web/topic/resource/TopicModelResource.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/topic-web/src/main/java/org/apache/stanbol/enhancer/web/topic/resource/TopicModelResource.java?rev=1306206&r1=1306205&r2=1306206&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/topic-web/src/main/java/org/apache/stanbol/enhancer/web/topic/resource/TopicModelResource.java (original)
+++ incubator/stanbol/trunk/enhancer/topic-web/src/main/java/org/apache/stanbol/enhancer/web/topic/resource/TopicModelResource.java Wed Mar 28 07:52:46 2012
@@ -18,23 +18,39 @@ package org.apache.stanbol.enhancer.web.
import static javax.ws.rs.core.MediaType.TEXT_HTML;
import static org.apache.stanbol.commons.web.base.CorsHelper.addCORSOrigin;
+import static org.apache.stanbol.commons.web.base.CorsHelper.enableCORS;
+
+import java.util.List;
import javax.servlet.ServletContext;
+import javax.ws.rs.Consumes;
+import javax.ws.rs.DELETE;
import javax.ws.rs.GET;
+import javax.ws.rs.OPTIONS;
+import javax.ws.rs.POST;
import javax.ws.rs.Path;
import javax.ws.rs.PathParam;
import javax.ws.rs.Produces;
+import javax.ws.rs.QueryParam;
import javax.ws.rs.WebApplicationException;
import javax.ws.rs.core.Context;
import javax.ws.rs.core.HttpHeaders;
+import javax.ws.rs.core.MediaType;
import javax.ws.rs.core.Response;
-import javax.ws.rs.core.UriInfo;
import javax.ws.rs.core.Response.ResponseBuilder;
+import javax.ws.rs.core.UriInfo;
+import org.apache.clerezza.rdf.core.Graph;
+import org.apache.clerezza.rdf.core.UriRef;
import org.apache.stanbol.commons.web.base.ContextHelper;
import org.apache.stanbol.commons.web.base.resource.BaseStanbolResource;
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses;
+import org.apache.stanbol.enhancer.servicesapi.rdf.Properties;
+import org.apache.stanbol.enhancer.topic.ClassifierException;
import org.apache.stanbol.enhancer.topic.TopicClassifier;
+import org.apache.stanbol.enhancer.topic.training.TrainingSet;
+import org.apache.stanbol.enhancer.topic.training.TrainingSetException;
import org.osgi.framework.BundleContext;
import org.osgi.framework.InvalidSyntaxException;
import org.osgi.framework.ServiceReference;
@@ -65,6 +81,10 @@ public final class TopicModelResource ex
classifier = (TopicClassifier) bundleContext.getService(references[0]);
}
+ public TopicClassifier getClassifier() {
+ return classifier;
+ }
+
@GET
@Produces(TEXT_HTML)
public Response get(@Context HttpHeaders headers) {
@@ -74,7 +94,197 @@ public final class TopicModelResource ex
return rb.build();
}
- public TopicClassifier getClassifier() {
- return classifier;
+    @OPTIONS
+    @Path("concept")
+    public Response handleCorsPreflightOnConcept(@Context HttpHeaders headers) {
+        ResponseBuilder res = Response.ok();
+        enableCORS(servletContext, res, headers);
+        return res.build();
+    }
+
+    // TODO: make it possible to fetch concept descriptions (with broader and narrower links) using the GET
+    // verb
+
+    @POST
+    @Path("concept")
+    @Consumes(MediaType.WILDCARD)
+    public Response addConcept(@QueryParam(value = "id") String concept,
+                               @QueryParam(value = "broader") List<String> broaderConcepts,
+                               @Context HttpHeaders headers) throws ClassifierException {
+        classifier.addConcept(concept, broaderConcepts);
+        ResponseBuilder rb = Response.ok();
+        addCORSOrigin(servletContext, rb, headers);
+        return rb.build();
+    }
+
+    @DELETE
+    @Path("concept")
+    @Consumes(MediaType.WILDCARD)
+    public Response remoteConcept(@QueryParam(value = "id") String concept, @Context HttpHeaders headers) throws ClassifierException {
+        if (concept != null && !concept.isEmpty()) {
+            classifier.removeConcept(concept);
+        } else {
+            classifier.removeAllConcepts();
+        }
+        // TODO: count the number of deleted entries and return it as a text entity
+        ResponseBuilder rb = Response.ok();
+        addCORSOrigin(servletContext, rb, headers);
+        return rb.build();
+    }
+
+    @OPTIONS
+    @Path("performance")
+    public Response handleCorsPreflightOnPerformance(@Context HttpHeaders headers) {
+        ResponseBuilder res = Response.ok();
+        enableCORS(servletContext, res, headers);
+        return res.build();
+    }
+
+    // TODO: make it possible to fetch performance reports and evaluation running state using the GET verb
+
+    @POST
+    @Path("performance")
+    @Consumes(MediaType.WILDCARD)
+    public Response updatePerformance(@QueryParam(value = "incremental") Boolean incremental,
+                                      @Context HttpHeaders headers) throws TrainingSetException,
+                                                                   ClassifierException {
+        if (incremental == null) {
+            incremental = Boolean.TRUE;
+        }
+        int updated = classifier.updatePerformanceEstimates(incremental);
+        ResponseBuilder rb = Response.ok(String.format(
+            "Successfully updated the performance estimates of %d concept(s).\n", updated));
+        addCORSOrigin(servletContext, rb, headers);
+        return rb.build();
+    }
+
+    @OPTIONS
+    @Path("trainer")
+    public Response handleCorsPreflightOnTrainer(@Context HttpHeaders headers) {
+        ResponseBuilder res = Response.ok();
+        enableCORS(servletContext, res, headers);
+        return res.build();
+    }
+
+    // TODO: make it possible to fetch training set statistics and training state using the GET verb
+
+    @POST
+    @Path("trainer")
+    @Consumes(MediaType.WILDCARD)
+    public Response updateModel(@QueryParam(value = "incremental") Boolean incremental,
+                                @Context HttpHeaders headers) throws TrainingSetException,
+                                                             ClassifierException {
+        if (incremental == null) {
+            incremental = Boolean.TRUE;
+        }
+        int updated = classifier.updateModel(incremental);
+        ResponseBuilder rb = Response.ok(String.format(
+            "Successfully updated the statistical model(s) of %d concept(s).\n", updated));
+        addCORSOrigin(servletContext, rb, headers);
+        return rb.build();
+    }
+
+    public Response handleCorsPreflightOnTrainingSet(@Context HttpHeaders headers) {
+        ResponseBuilder res = Response.ok();
+        enableCORS(servletContext, res, headers);
+        return res.build();
+    }
+
+    // TODO: make it possible to browse the training set content on the GET verb using a subresource
+
+    @POST
+    @Path("trainingset")
+    @Consumes(MediaType.TEXT_PLAIN)
+    public Response registerExample(@QueryParam(value = "example_id") String exampleId,
+                                    @QueryParam(value = "concept") List<String> concepts,
+                                    String textContent,
+                                    @Context HttpHeaders headers) throws TrainingSetException,
+                                                                 ClassifierException {
+        ResponseBuilder rb;
+        if (!classifier.isUpdatable()) {
+            rb = Response.status(Response.Status.BAD_REQUEST).entity(
+                String.format("Classifier %s is not updatable.\n", classifier.getName()));
+        } else {
+            TrainingSet trainingSet = classifier.getTrainingSet();
+            exampleId = trainingSet.registerExample(exampleId, textContent, concepts);
+            // TODO: make example GETable resources and return a 201 to it instead of a simple message.
+            rb = Response.ok(String.format(
+                "Successfully added or updated example '%s' in training set '%s'.\n", exampleId,
+                trainingSet.getName()));
+        }
+        addCORSOrigin(servletContext, rb, headers);
+        return rb.build();
+    }
+
+    // TODO: make the following a DELETE method on the example sub-resources themselves once we have a GET
+    // for them
+
+    @DELETE
+    @Path("trainingset")
+    @Consumes(MediaType.WILDCARD)
+    public Response removeExample(@QueryParam(value = "example_id") List<String> exampleIds,
+                                  @Context HttpHeaders headers) throws TrainingSetException,
+                                                               ClassifierException {
+        ResponseBuilder rb;
+        if (!classifier.isUpdatable()) {
+            rb = Response.status(Response.Status.BAD_REQUEST).entity(
+                String.format("Classifier %s is not updatable.\n", classifier.getName()));
+        } else {
+            TrainingSet trainingSet = classifier.getTrainingSet();
+            if (exampleIds != null && !exampleIds.isEmpty()) {
+                for (String exampleId : exampleIds) {
+                    trainingSet.registerExample(exampleId, null, null);
+                }
+            } else {
+                // implement a way to clean up a complete training set? or is it too dangerous and we should
+                // return an error instead?
+            }
+            rb = Response.ok(String.format("Successfully deleted examples in training set '%s'.\n",
+                trainingSet.getName()));
+        }
+        addCORSOrigin(servletContext, rb, headers);
+        return rb.build();
+    }
+
+    @OPTIONS
+    public Response handleCorsPreflight(@Context HttpHeaders headers) {
+        ResponseBuilder res = Response.ok();
+        enableCORS(servletContext, res, headers);
+        return res.build();
+    }
+
+    /**
+     * Simple RDF / SKOS importer that loads the complete model in memory for easy parsing and then does graph
+     * introspection to find the concepts to load into the model.
+     *
+     * If a scalable implementation is required, one should probably use a transient triple store and pass it
+     * the raw RDF stream instead of using the naive GraphReader JAX-RS provider.
+     */
+    @POST
+    @Consumes(MediaType.WILDCARD)
+    public Response importConceptsFromRDF(@QueryParam(value = "concept_class") String conceptClassUri,
+                                          @QueryParam(value = "broader_property") String broaderPropertyUri,
+                                          Graph graph,
+                                          @Context HttpHeaders headers) throws ClassifierException {
+        UriRef conceptClass = OntologicalClasses.SKOS_CONCEPT;
+        UriRef broaderProperty = Properties.SKOS_BROADER;
+        if (conceptClassUri != null && !conceptClassUri.isEmpty()) {
+            conceptClass = new UriRef(conceptClassUri);
+        }
+        if (broaderPropertyUri != null && !broaderPropertyUri.isEmpty()) {
+            broaderProperty = new UriRef(broaderPropertyUri);
+        }
+        int imported = classifier.importConceptsFromGraph(graph, conceptClass, broaderProperty);
+        ResponseBuilder rb;
+        if (imported == 0) {
+            rb = Response.status(Response.Status.BAD_REQUEST).entity(
+                String.format("Could not find any instances of '%s' in payload.\n",
+                    conceptClass.getUnicodeString()));
+        } else {
+            rb = Response.ok(String.format("Imported %d instance(s) of '%s'.\n", imported,
+                conceptClass.getUnicodeString()));
+        }
+        addCORSOrigin(servletContext, rb, headers);
+        return rb.build();
}
}
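
To give a feel for how the REST API added above is meant to be driven, here is a rough client-side
walkthrough in Python. The base URL and the classifier name are assumptions made for the example (the
class-level @Path of TopicModelResource is not visible in this diff); the sub-paths and query parameters
follow the resource methods above.

    # Hypothetical walkthrough of the topic model REST API, standard library only.
    # BASE is an assumption: adjust it to the actual mount point of TopicModelResource.
    from urllib.parse import urlencode
    from urllib.request import Request, urlopen

    BASE = "http://localhost:8080/topic/model/default"

    def post(path, params, body=b"", content_type="text/plain"):
        url = "%s/%s?%s" % (BASE, path, urlencode(params, doseq=True))
        req = Request(url, data=body, method="POST")
        req.add_header("Content-Type", content_type)
        return urlopen(req).read().decode("utf-8")

    # 1. register a concept, optionally linking it to a broader concept
    post("concept", {"id": "urn:topics/weather", "broader": ["urn:topics/news"]})

    # 2. add a labelled example document to the training set (plain text body)
    post("trainingset",
         {"example_id": "example-001", "concept": ["urn:topics/weather"]},
         body=u"Heavy rain is forecast over Madrid this weekend.".encode("utf-8"))

    # 3. rebuild the statistical model, then compute the cross-validated performance estimates
    print(post("trainer", {"incremental": "false"}))
    print(post("performance", {"incremental": "false"}))

The RDF / SKOS bulk import works the same way: POST the serialized taxonomy to the model resource itself
with an RDF content type that the registered GraphReader understands, optionally overriding the
concept_class and broader_property query parameters.
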
Added: incubator/stanbol/trunk/enhancer/topic-web/tools/newsmlimporter.py
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/topic-web/tools/newsmlimporter.py?rev=1306206&view=auto
==============================================================================
--- incubator/stanbol/trunk/enhancer/topic-web/tools/newsmlimporter.py (added)
+++ incubator/stanbol/trunk/enhancer/topic-web/tools/newsmlimporter.py Wed Mar 28 07:52:46 2012
@@ -0,0 +1,108 @@
+#!/usr/bin/env python
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Basic Python script to load NewsML documents as a training set.
+
+Needs Python 2.7 and lxml.
+
+TODO: also port this to Python 3 if it does not run there by default.
+"""
+from __future__ import print_function
+
+import os
+from lxml import html
+from lxml import etree
+from urllib import quote
+import urllib2
+from hashlib import sha1
+
+
+IPTC_SUBJECT_PREFIX = "http://cv.iptc.org/newscodes/subjectcode/"
+
+
+def find_text_and_subjects(newsml_content,
+ subject_tags=('SubjectMatter', 'SubjectDetail'),
+ text_tags=('HeadLine',),
+ html_tags=('body.content',)):
+ # First parse of the document as XML for the structured attributes
+ xtree = etree.ElementTree(etree.fromstring(newsml_content))
+ text_items = [e.text.strip()
+ for tag in text_tags
+ for e in xtree.findall('//' + tag)]
+ subjects = [IPTC_SUBJECT_PREFIX + e.get('FormalName')
+ for tag in subject_tags
+ for e in xtree.findall('//' + tag)]
+
+    # Then use the HTML parser to find the parts that look like HTML, so we can leverage
+    # the text_content method.
+ htree = etree.ElementTree(html.document_fromstring(newsml_content))
+
+ text_items += [e.text_content().strip()
+ for tag in html_tags
+ for e in htree.findall('//' + tag)]
+ text = "\n\n".join(text_items)
+ return text, subjects
+
+
+def register_newsml_document(text, codes, url):
+    id = sha1(text.encode('utf-8')).hexdigest()
+ url += "?example_id=%s" % id
+ for code in codes:
+ url += "&concept=%s" % quote(code)
+ print("Calling:", url)
+ request = urllib2.Request(url, data=text.encode('utf-8'))
+ request.add_header('Content-Type', 'text/plain')
+ opener = urllib2.build_opener()
+ print(opener.open(request).read())
+
+
+def print_newsml_summary(text, codes, server_url=None):
+ print(text.split('\n\n')[0])
+ for code in codes:
+ print('code: ' + code)
+ print()
+
+
+if __name__ == "__main__":
+ import sys
+
+    # TODO: use argparse and a debug switch to use print_newsml_summary
+    # instead of the default handler
+ topfolder = sys.argv[1]
+ max = int(sys.argv[2])
+ server_url = sys.argv[3]
+ handle_news = register_newsml_document
+
+ count = 0
+ for dirpath, dirnames, filenames in os.walk(topfolder):
+ if count >= max:
+ break
+
+ if '.svn' in dirnames:
+ dirnames.remove('.svn')
+
+ for filename in filenames:
+ if count >= max:
+ break
+ if not filename.endswith('.xml'):
+ continue
+            full_path = os.path.join(dirpath, filename)
+ newsml_content = open(full_path, 'rb').read()
+ text, codes = find_text_and_subjects(newsml_content)
+ if len(codes) == 0:
+ # ignore document without subject info
+ continue
+ handle_news(text, codes, server_url)
+ count += 1