Author: ogrisel
Date: Tue Jan  3 17:11:32 2012
New Revision: 1226872

URL: http://svn.apache.org/viewvc?rev=1226872&view=rev
Log:
STANBOL-197: Make it possible to programmatically define a topic classification 
model

Added:
    
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/
    
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/Batch.java
    
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/ClassifierException.java
    
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TopicClassifier.java
    
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TopicSuggestion.java
    
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TrainingSet.java
    
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TrainingSetException.java
Removed:
    
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicSuggestion.java
Modified:
    
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
    
incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/TopicEngineTest.java
    
incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/test_schema.xml
    
incubator/stanbol/trunk/enhancer/generic/servicesapi/src/main/java/org/apache/stanbol/enhancer/servicesapi/rdf/TechnicalClasses.java

Modified: 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java?rev=1226872&r1=1226871&r2=1226872&view=diff
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
 (original)
+++ 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
 Tue Jan  3 17:11:32 2012
@@ -21,14 +21,19 @@ import static org.apache.stanbol.enhance
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Collection;
 import java.util.Collections;
 import java.util.Dictionary;
 import java.util.Iterator;
+import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 
+import org.apache.clerezza.rdf.core.MGraph;
 import org.apache.clerezza.rdf.core.Triple;
 import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.clerezza.rdf.core.impl.TripleImpl;
 import org.apache.commons.io.IOUtils;
 import org.apache.felix.scr.annotations.Activate;
 import org.apache.felix.scr.annotations.Component;
@@ -43,6 +48,7 @@ import org.apache.solr.client.solrj.Solr
 import org.apache.solr.client.solrj.response.QueryResponse;
 import org.apache.solr.common.SolrDocument;
 import org.apache.solr.common.SolrDocumentList;
+import org.apache.solr.common.SolrInputDocument;
 import org.apache.solr.common.params.CommonParams;
 import org.apache.solr.common.params.MoreLikeThisParams;
 import org.apache.stanbol.commons.solr.IndexReference;
@@ -53,6 +59,13 @@ import org.apache.stanbol.enhancer.servi
 import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
 import org.apache.stanbol.enhancer.servicesapi.InvalidContentException;
 import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
+import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
+import org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses;
+import org.apache.stanbol.enhancer.topic.ClassifierException;
+import org.apache.stanbol.enhancer.topic.TopicClassifier;
+import org.apache.stanbol.enhancer.topic.TopicSuggestion;
+import org.apache.stanbol.enhancer.topic.TrainingSet;
+import org.apache.stanbol.enhancer.topic.TrainingSetException;
 import org.osgi.framework.InvalidSyntaxException;
 import org.osgi.service.cm.ConfigurationException;
 import org.osgi.service.component.ComponentContext;
@@ -72,17 +85,19 @@ import org.slf4j.LoggerFactory;
 @Properties(value = {@Property(name = TopicClassificationEngine.ENGINE_ID),
                      @Property(name = TopicClassificationEngine.ORDER, 
intValue = 100),
                      @Property(name = TopicClassificationEngine.SOLR_CORE),
-                     @Property(name = TopicClassificationEngine.LANGUAGE),
+                     @Property(name = TopicClassificationEngine.LANGUAGES),
                      @Property(name = 
TopicClassificationEngine.SIMILARTITY_FIELD),
                      @Property(name = 
TopicClassificationEngine.TOPIC_URI_FIELD),
-                     @Property(name = 
TopicClassificationEngine.MATERIALIZED_PATH_FIELD)})
-public class TopicClassificationEngine implements EnhancementEngine, 
ServiceProperties {
+                     @Property(name = TopicClassificationEngine.BROADER_FIELD),
+                     @Property(name = 
TopicClassificationEngine.MATERIALIZED_PATH_FIELD),
+                     @Property(name = 
TopicClassificationEngine.MODEL_UPDATE_DATE_FIELD)})
+public class TopicClassificationEngine implements EnhancementEngine, 
ServiceProperties, TopicClassifier {
 
     public static final String ENGINE_ID = 
"org.apache.stanbol.enhancer.engine.id";
 
     public static final String SOLR_CORE = 
"org.apache.stanbol.enhancer.engine.topic.solrCore";
 
-    public static final String LANGUAGE = 
"org.apache.stanbol.enhancer.engine.topic.language";
+    public static final String LANGUAGES = 
"org.apache.stanbol.enhancer.engine.topic.languages";
 
     public static final String ORDER = 
"org.apache.stanbol.enhancer.engine.topic.order";
 
@@ -90,8 +105,12 @@ public class TopicClassificationEngine i
 
     public static final String TOPIC_URI_FIELD = 
"org.apache.stanbol.enhancer.engine.topic.uriField";
 
+    public static final String BROADER_FIELD = 
"org.apache.stanbol.enhancer.engine.topic.broaderField";
+
     public static final String MATERIALIZED_PATH_FIELD = 
"org.apache.stanbol.enhancer.engine.topic.materializedPathField";
 
+    public static final String MODEL_UPDATE_DATE_FIELD = 
"org.apache.stanbol.enhancer.engine.topic.modelUpdateField";
+
     private static final Logger log = 
LoggerFactory.getLogger(TopicClassificationEngine.class);
 
     protected String engineId;
@@ -111,12 +130,18 @@ public class TopicClassificationEngine i
 
     protected String topicUriField;
 
+    protected String modelUpdateDateField;
+
+    protected String broaderField;
+
     protected String materializedPathField;
 
     protected ComponentContext context;
 
     protected int numTopics = 10;
 
+    protected TrainingSet trainingSet;
+
     @Activate
     protected void activate(ComponentContext context) throws 
ConfigurationException, InvalidSyntaxException {
         @SuppressWarnings("unchecked")
@@ -136,7 +161,7 @@ public class TopicClassificationEngine i
         engineId = getRequiredStringParam(config, ENGINE_ID);
         similarityField = getRequiredStringParam(config, SIMILARTITY_FIELD);
         topicUriField = getRequiredStringParam(config, TOPIC_URI_FIELD);
-        acceptedLanguages = getStringListParan(config, LANGUAGE);
+        acceptedLanguages = getStringListParan(config, LANGUAGES);
         if (config.get(SOLR_CORE) instanceof SolrServer) {
             // Bind a fixed Solr server client instead of doing dynamic OSGi 
lookup using the service tracker.
             // This can be useful both for unit-testing .
@@ -159,8 +184,10 @@ public class TopicClassificationEngine i
                 throw new ConfigurationException(SOLR_CORE, e.getMessage(), e);
             }
         }
-        // optional field, can be null
+        // optional fields, can be null
+        broaderField = (String) config.get(BROADER_FIELD);
         materializedPathField = (String) config.get(TOPIC_URI_FIELD);
+        modelUpdateDateField = (String) config.get(MODEL_UPDATE_DATE_FIELD);
         Object orderParamValue = config.get(ORDER);
         if (orderParamValue != null) {
             order = (Integer) orderParamValue;
@@ -218,12 +245,79 @@ public class TopicClassificationEngine i
     @Override
     public void computeEnhancements(ContentItem ci) throws EngineException {
         String text = getTextFromContentItem(ci);
-        suggestTopics(text);
+        MGraph metadata = ci.getMetadata();
+        List<TopicSuggestion> topics;
+        try {
+            topics = suggestTopics(text);
+        } catch (ClassifierException e) {
+            throw new EngineException(e);
+        }
+        for (TopicSuggestion topic : topics) {
+            UriRef enhancement = 
EnhancementEngineHelper.createEntityEnhancement(ci, this);
+            metadata.add(new TripleImpl(enhancement,
+                    
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE,
+                    TechnicalClasses.ENHANCER_TOPICANNOTATION));
+
+            // add link to entity
+            metadata.add(new TripleImpl(enhancement,
+                    
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_REFERENCE,
+                    new UriRef(topic.uri)));
+            // TODO: make it possible to dereference and the path to the root 
the entities according to a
+            // configuration parameter
+        }
+    }
 
-        // TODO: express the results as RDF.
+    /**
+     * Select the Solr server that requests of this engine are sent to.
+     *
+     * @return the manually bound solrServer instance when one is set, otherwise the instance currently
+     *         provided by the OSGi service tracker.
+     */
+    protected SolrServer getActiveSolrServer() {
+        if (solrServer != null) {
+            return solrServer;
+        }
+        return indexTracker.getService();
     }
 
-    public List<TopicSuggestion> suggestTopics(String text) throws 
EngineException {
+    /**
+     * Expose the execution order of this engine.
+     *
+     * @return an unmodifiable, single entry map associating ENHANCEMENT_ENGINE_ORDERING with the value of
+     *         the {@code order} field.
+     */
+    @Override
+    public Map<String,Object> getServiceProperties() {
+        Map<String,Object> ordering = Collections.singletonMap(ENHANCEMENT_ENGINE_ORDERING, (Object) order);
+        return Collections.unmodifiableMap(ordering);
+    }
+
+    /**
+     * Extract the text to classify from a content item.
+     * 
+     * When the mime type of the item starts with "text/plain" the content stream is decoded as UTF-8.
+     * Otherwise the objects of all the NIE_PLAINTEXTCONTENT triples about the item found in its metadata
+     * are concatenated (the empty string is returned when there is none).
+     * 
+     * @param ci
+     *            the content item to read
+     * @return the extracted text, never null
+     * @throws InvalidContentException
+     *             if reading the content stream fails with an IOException
+     */
+    protected String getTextFromContentItem(ContentItem ci) throws InvalidContentException {
+        // Refactor the following using an adapter.
+        if (ci.getMimeType().startsWith("text/plain")) {
+            java.io.InputStream stream = null;
+            try {
+                // TODO: handle explicit charsets if any and fallback to UTF-8 if missing
+                stream = ci.getStream();
+                return IOUtils.toString(stream, "UTF-8");
+            } catch (IOException e) {
+                throw new InvalidContentException(this, ci, e);
+            } finally {
+                // IOUtils.toString does not close the stream it reads from: release it here
+                IOUtils.closeQuietly(stream);
+            }
+        }
+        // accumulate in a builder instead of re-allocating a String for each triple
+        StringBuilder text = new StringBuilder();
+        Iterator<Triple> it = ci.getMetadata().filter(ci.getUri(), NIE_PLAINTEXTCONTENT, null);
+        while (it.hasNext()) {
+            text.append(it.next().getObject());
+        }
+        return text.toString();
+    }
+
+    /**
+     * Build an engine instance outside of an OSGi component activation.
+     * 
+     * @param config
+     *            the parameters handed to {@code configure}
+     * @return a new engine on which {@code configure(config)} has been called
+     * @throws ConfigurationException
+     *             propagated from {@code configure}
+     */
+    public static TopicClassificationEngine fromParameters(Dictionary<String,Object> config) throws ConfigurationException {
+        final TopicClassificationEngine configured = new TopicClassificationEngine();
+        configured.configure(config);
+        return configured;
+    }
+
+    // classifier API
+
+    /**
+     * @return the value of the {@code engineId} field: the engine id is reused as the scheme id of the
+     *         classifier.
+     */
+    @Override
+    public String getSchemeId() {
+        return engineId;
+    }
+
+    /**
+     * @return the {@code acceptedLanguages} list held by this engine. The list instance itself is returned,
+     *         not a copy.
+     */
+    @Override
+    public List<String> getAcceptedLanguages() {
+        return acceptedLanguages;
+    }
+
+    public List<TopicSuggestion> suggestTopics(String text) throws 
ClassifierException {
         List<TopicSuggestion> suggestedTopics = new 
ArrayList<TopicSuggestion>(numTopics);
         SolrServer solrServer = getActiveSolrServer();
         SolrQuery query = new SolrQuery();
@@ -232,7 +326,7 @@ public class TopicClassificationEngine i
         query.set(MoreLikeThisParams.MIN_DOC_FREQ, 1);
         query.set(MoreLikeThisParams.MIN_TERM_FREQ, 1);
         // TODO: find a way to parse the interesting terms and report them
-        // for debugging / explanation in dedicated RDF datastucture.
+        // for debugging / explanation in dedicated RDF data structure.
         // query.set(MoreLikeThisParams.INTERESTING_TERMS, "details");
         query.set(MoreLikeThisParams.SIMILARITY_FIELDS, similarityField);
         query.set(CommonParams.STREAM_BODY, text);
@@ -244,8 +338,8 @@ public class TopicClassificationEngine i
             for (SolrDocument result : results.toArray(new SolrDocument[0])) {
                 String uri = (String) result.getFirstValue(topicUriField);
                 if (uri == null) {
-                    throw new EngineException(String.format("Solr Core '%s' is 
missing required field '%s'.",
-                        solrCoreId, topicUriField));
+                    throw new ClassifierException(String.format(
+                        "Solr Core '%s' is missing required field '%s'.", 
solrCoreId, topicUriField));
                 }
                 suggestedTopics.add(new TopicSuggestion(uri, 0.0));
             }
@@ -254,50 +348,144 @@ public class TopicClassificationEngine i
                 String message = String.format("SolrServer with id '%s' for 
topic engine '%s' lacks"
                                                + " configuration for the 
MoreLikeThisHandler", solrCoreId,
                     engineId);
-                throw new EngineException(message, e);
+                throw new ClassifierException(message, e);
             } else {
-                throw new EngineException(e);
+                throw new ClassifierException(e);
             }
         }
         return suggestedTopics;
     }
 
-    /**
-     * @return the manually bound solrServer instance or the one tracked by 
the OSGi service tracker.
-     */
-    protected SolrServer getActiveSolrServer() {
-        return solrServer != null ? solrServer : indexTracker.getService();
+    /**
+     * Collect the ids of the topics that reference the given topic in their broader field.
+     * 
+     * The ids are read from the topic URI field of the matching documents, sorted by ascending topic URI. An
+     * empty set is returned when no broader field is configured.
+     * 
+     * NOTE(review): the query does not set a row limit, so presumably only the default page of results of
+     * the Solr server is read -- to be confirmed.
+     * 
+     * @param broadTopicId
+     *            the id of the broader topic
+     * @return the ids of the directly narrower topics, in iteration order of the Solr results
+     * @throws ClassifierException
+     *             if the Solr query fails with a SolrServerException
+     */
+    @Override
+    public Set<String> getNarrowerTopics(String broadTopicId) throws ClassifierException {
+        Set<String> narrower = new LinkedHashSet<String>();
+        if (broaderField != null) {
+            SolrServer server = getActiveSolrServer();
+            SolrQuery lookup = new SolrQuery("*:*");
+            // use a filter query to avoid string escaping issues with special solr chars
+            lookup.addFilterQuery("{!field f=" + broaderField + "}" + broadTopicId);
+            lookup.addField(topicUriField);
+            lookup.addSortField(topicUriField, SolrQuery.ORDER.asc);
+            try {
+                SolrDocumentList matches = server.query(lookup).getResults();
+                for (SolrDocument match : matches) {
+                    Object topicId = match.getFirstValue(topicUriField);
+                    narrower.add(topicId.toString());
+                }
+            } catch (SolrServerException e) {
+                throw new ClassifierException(String.format(
+                    "Error while fetching narrower topics of '%s' on Solr Core '%s'.", broadTopicId,
+                    solrCoreId), e);
+            }
+        }
+        return narrower;
     }
 
     @Override
-    public Map<String,Object> getServiceProperties() {
-        return 
Collections.unmodifiableMap(Collections.singletonMap(ENHANCEMENT_ENGINE_ORDERING,
-            (Object) order));
+    public Set<String> getBroaderTopics(String id) throws ClassifierException {
+        // Look up the documents whose topic URI field matches the given id and collect all the values of
+        // their broader field, in iteration order. Without a configured broader field the result is empty.
+        Set<String> broader = new LinkedHashSet<String>();
+        if (broaderField != null) {
+            SolrServer server = getActiveSolrServer();
+            SolrQuery lookup = new SolrQuery("*:*");
+            // use a filter query to avoid string escaping issues with special solr chars
+            lookup.addFilterQuery("{!field f=" + topicUriField + "}" + id);
+            lookup.addField(broaderField);
+            try {
+                SolrDocumentList matches = server.query(lookup).getResults();
+                // there should be only one result
+                for (SolrDocument match : matches) {
+                    Collection<Object> values = match.getFieldValues(broaderField);
+                    if (values != null) {
+                        for (Object value : values) {
+                            broader.add(value.toString());
+                        }
+                    }
+                }
+            } catch (SolrServerException e) {
+                throw new ClassifierException(String.format(
+                    "Error while fetching broader topics of '%s' on Solr Core '%s'.", id, solrCoreId), e);
+            }
+        }
+        return broader;
     }
 
-    protected String getTextFromContentItem(ContentItem ci) throws 
InvalidContentException {
-        // Refactor the following using an adapter.
-        String text = "";
-        if (ci.getMimeType().startsWith("text/plain")) {
-            try {
-                // TODO: handle explicit charsets if any and fallback to UTF-8 
if missing
-                text = IOUtils.toString(ci.getStream(), "UTF-8");
-            } catch (IOException e) {
-                throw new InvalidContentException(this, ci, e);
-            }
+    @Override
+    public Set<String> getTopicRoots() throws ClassifierException {
+        // TODO: this can be very big on flat thesauri: should we enable a 
paging API instead?
+        LinkedHashSet<String> rootTopics = new LinkedHashSet<String>();
+        SolrServer solrServer = getActiveSolrServer();
+        SolrQuery query = new SolrQuery();
+        if (broaderField != null) {
+            // find any topic with an empty broaderField
+            query.setParam("q", "-" + broaderField + ":[\"\" TO *]");
         } else {
-            Iterator<Triple> it = ci.getMetadata().filter(ci.getUri(), 
NIE_PLAINTEXTCONTENT, null);
-            while (it.hasNext()) {
-                text += it.next().getObject();
+            // find any topic
+            query.setQuery("*:*");
+        }
+        try {
+            for (SolrDocument result : solrServer.query(query).getResults()) {
+                rootTopics.add(result.getFirstValue(topicUriField).toString());
             }
+        } catch (SolrServerException e) {
+            String msg = String.format("Error while fetching root topics on 
Solr Core '%s'.", solrCoreId);
+            throw new ClassifierException(msg, e);
         }
-        return text;
+        return rootTopics;
     }
 
-    public static TopicClassificationEngine 
fromParameters(Dictionary<String,Object> config) throws ConfigurationException {
-        TopicClassificationEngine engine = new TopicClassificationEngine();
-        engine.configure(config);
-        return engine;
+    /**
+     * Index a new topic document and commit the change.
+     * 
+     * The document holds the id in the topic URI field and, when both a broader field is configured and
+     * {@code broaderTopics} is not null, the broader topics in the broader field.
+     * 
+     * @param id
+     *            the id of the topic to register
+     * @param broaderTopics
+     *            the ids of the broader topics, can be null
+     * @throws ClassifierException
+     *             wrapping any exception raised while adding the document or committing
+     */
+    @Override
+    public void addTopic(String id, Collection<String> broaderTopics) throws ClassifierException {
+        SolrInputDocument topicDocument = new SolrInputDocument();
+        topicDocument.addField(topicUriField, id);
+        boolean storeBroaderTopics = broaderField != null && broaderTopics != null;
+        if (storeBroaderTopics) {
+            topicDocument.addField(broaderField, broaderTopics);
+        }
+        SolrServer server = getActiveSolrServer();
+        try {
+            server.add(topicDocument);
+            server.commit();
+        } catch (Exception e) {
+            throw new ClassifierException(String.format(
+                "Error adding topic with id '%s' on Solr Core '%s'", id, solrCoreId), e);
+        }
+    }
+
+    /**
+     * Delete the documents whose topic URI field matches the given id and commit the change.
+     * 
+     * @param id
+     *            the id of the topic to remove
+     * @throws ClassifierException
+     *             wrapping any exception raised while deleting or committing
+     */
+    @Override
+    public void removeTopic(String id) throws ClassifierException {
+        SolrServer solrServer = getActiveSolrServer();
+        try {
+            // use the field query parser (as done for the broader / narrower lookups) to avoid string
+            // escaping issues with special solr chars: ids such as URIs contain ':' and '/' that would
+            // otherwise be interpreted as query syntax
+            solrServer.deleteByQuery("{!field f=" + topicUriField + "}" + id);
+            solrServer.commit();
+        } catch (Exception e) {
+            String msg = String.format("Error removing topic with id '%s' on Solr Core '%s'", id, solrCoreId);
+            throw new ClassifierException(msg, e);
+        }
     }
 
+    /**
+     * @return true when the {@code trainingSet} field is not null, false otherwise.
+     */
+    @Override
+    public boolean isUpdatable() {
+        return trainingSet != null;
+    }
+
+    /**
+     * Store the given training set in the {@code trainingSet} field, replacing any previous value.
+     * 
+     * @param trainingSet
+     *            the training set to register; no null check is performed here
+     */
+    @Override
+    public void setTrainingSet(TrainingSet trainingSet) {
+        this.trainingSet = trainingSet;
+    }
+
+    /**
+     * Entry point for refreshing the statistical model. For now this only calls {@code checkTrainingSet()}:
+     * the update logic itself is still to be written (see the TODO in the body).
+     * 
+     * @throws TrainingSetException
+     *             propagated from {@code checkTrainingSet()}
+     */
+    @Override
+    public void updateModel() throws TrainingSetException {
+        checkTrainingSet();
+        // TODO:
+        // perform a first query to iterate over all the registered topics sorted by id (to allow for paging)
+        // for each topic find the last update date of the union of the topic and its narrower topics
+    }
+
+    /**
+     * Guard for the operations that need a training set.
+     * 
+     * @throws TrainingSetException
+     *             if no training set is registered on this engine (the {@code trainingSet} field is null)
+     */
+    protected void checkTrainingSet() throws TrainingSetException {
+        // the original test was inverted: it rejected engines that DO have a training set and let the
+        // ones without any go through, contradicting both the error message and isUpdatable()
+        if (trainingSet == null) {
+            throw new TrainingSetException(
+                    String.format("TopicClassificationEngine %s has no registered"
+                                  + " training set hence cannot be updated.", engineId));
+        }
+    }
 }

Added: 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/Batch.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/Batch.java?rev=1226872&view=auto
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/Batch.java
 (added)
+++ 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/Batch.java
 Tue Jan  3 17:11:32 2012
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.topic;
+
+import java.io.Serializable;
+import java.util.List;
+
+/**
+ * Data transfer object holding one slice of the results of a query, so that a large result set can be
+ * fetched one batch at a time.
+ * 
+ * @param <T>
+ *            the type of the items to batch over.
+ */
+public class Batch<T> implements Serializable {
+
+    private static final long serialVersionUID = 1L;
+
+    /**
+     * Flag set by the backend to tell the caller that more items can be fetched by issuing the same query
+     * again together with the companion {@link #nextOffset} marker.
+     */
+    public final boolean hasMore;
+
+    /**
+     * Marker value that the caller can pass back to the dataset to fetch the next batch and let the server
+     * perform efficient batching.
+     * 
+     * This value should refer to an indexed field with unique values such as a primary key or a random uuid
+     * (good for shuffling the examples in arbitrary order). The samples returned in the batches should be
+     * sorted according to this field so that the server can perform efficient range queries that are
+     * guaranteed to return no duplicate results across batches.
+     */
+    public final Object nextOffset;
+
+    /**
+     * The items of this batch. The list passed to the constructor is stored as is, without copy.
+     */
+    public final List<T> items;
+
+    /**
+     * @param items
+     *            the items of this batch
+     * @param hasMore
+     *            whether further batches can be expected
+     * @param nextOffset
+     *            the marker to pass to fetch the next batch
+     */
+    public Batch(List<T> items, boolean hasMore, Object nextOffset) {
+        this.hasMore = hasMore;
+        this.nextOffset = nextOffset;
+        this.items = items;
+    }
+}

Added: 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/ClassifierException.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/ClassifierException.java?rev=1226872&view=auto
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/ClassifierException.java
 (added)
+++ 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/ClassifierException.java
 Tue Jan  3 17:11:32 2012
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.topic;
+
+
+/**
+ * Checked exception signalling a failure while classifying a document or while updating the statistical
+ * model of the classifier.
+ */
+public class ClassifierException extends Exception {
+
+    private static final long serialVersionUID = 1L;
+
+    /**
+     * @param message
+     *            description of the failure
+     */
+    public ClassifierException(String message) {
+        super(message);
+    }
+
+    /**
+     * @param cause
+     *            the underlying error
+     */
+    public ClassifierException(Throwable cause) {
+        super(cause);
+    }
+
+    /**
+     * @param message
+     *            description of the failure
+     * @param cause
+     *            the underlying error
+     */
+    public ClassifierException(String message, Throwable cause) {
+        super(message, cause);
+    }
+
+}

Added: 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TopicClassifier.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TopicClassifier.java?rev=1226872&view=auto
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TopicClassifier.java
 (added)
+++ 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TopicClassifier.java
 Tue Jan  3 17:11:32 2012
@@ -0,0 +1,109 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.topic;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.stanbol.enhancer.servicesapi.EngineException;
+
+/**
+ * Service interface for suggesting hierarchical topics from a specific scheme (a.k.a. taxonomy, thesaurus
+ * or topics hierarchy) from the text content of a document or part of a document.
+ */
+public interface TopicClassifier {
+
+    /**
+     * @return the short id identifying this classifier / scheme: can be used as URL path component to
+     *         publish the service.
+     */
+    public String getSchemeId();
+
+    /**
+     * @return list of language codes for text that can be automatically classified by the service.
+     */
+    public List<String> getAcceptedLanguages();
+
+    /**
+     * Perform automated text categorization based on statistical occurrences of words in the given text.
+     * 
+     * @param text
+     *            the text content to analyze
+     * @return the most likely topics related to the text
+     * @throws ClassifierException
+     *             if the classification fails
+     */
+    List<TopicSuggestion> suggestTopics(String text) throws 
ClassifierException;
+
+    /**
+     * @param broadTopicId
+     *            the id of the broader topic
+     * @return the set of ids of topics directly narrower than the topic identified by {@code broadTopicId}
+     * @throws ClassifierException
+     *             if the lookup fails
+     */
+    Set<String> getNarrowerTopics(String broadTopicId) throws 
ClassifierException;
+
+    /**
+     * @param id
+     *            the id of the narrower topic
+     * @return the set of ids of topics directly broader than the topic identified by {@code id}
+     * @throws ClassifierException
+     *             if the lookup fails
+     */
+    Set<String> getBroaderTopics(String id) throws ClassifierException;
+
+    /**
+     * @return the set of ids of topics without broader topics.
+     * @throws ClassifierException
+     *             if the lookup fails
+     */
+    Set<String> getTopicRoots() throws ClassifierException;
+
+    /**
+     * @return true if the classifier model can be updated with the {@code addTopic} / {@code removeTopic} /
+     *         {@code updateModel} methods.
+     */
+    boolean isUpdatable();
+
+    /**
+     * Register a topic and set its ancestors in the taxonomy. Warning: re-adding an already existing topic
+     * can delete the underlying statistical model. Calling {@code updateModel} is necessary to rebuild the
+     * statistical model based on the hierarchical structure of the topics and the registered training set.
+     * 
+     * @param id
+     *            the new topic id
+     * @param broaderTopics
+     *            list of directly broader topics in the thesaurus
+     * @throws ClassifierException
+     *             if the topic cannot be registered
+     */
+    void addTopic(String id, Collection<String> broaderTopics) throws 
ClassifierException;
+
+    /**
+     * Remove a topic from the thesaurus. WARNING: it is the caller's responsibility to recursively remove
+     * or update any narrower topic that might hold a reference on this topic. Once the tree is updated,
+     * {@code updateModel} should be called to re-align the statistical model to match the new hierarchy by
+     * drawing examples from the dataset.
+     * 
+     * @param id
+     *            id of the topic to remove from the model
+     * @throws ClassifierException
+     *             if the topic cannot be removed
+     */
+    void removeTopic(String id) throws ClassifierException;
+
+    /**
+     * Register a training set to use to build the statistical model of the classifier.
+     * 
+     * @param trainingSet
+     *            the training set to draw examples from
+     */
+    void setTrainingSet(TrainingSet trainingSet);
+
+    /**
+     * Incrementally update the statistical model of the classifier. Note: depending on the size of the
+     * dataset and the number of topics to update, this process can take a long time and should probably be
+     * wrapped in a dedicated thread if called by the user interface layer.
+     * 
+     * @throws TrainingSetException
+     *             if the registered training set cannot be used to update the model
+     */
+    void updateModel() throws TrainingSetException;
+}

Added: 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TopicSuggestion.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TopicSuggestion.java?rev=1226872&view=auto
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TopicSuggestion.java
 (added)
+++ 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TopicSuggestion.java
 Tue Jan  3 17:11:32 2012
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.topic;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Data transfer object for the individual topic classification results.
+ */
+public class TopicSuggestion {
+
+    public final String uri;
+
+    public final List<String> paths = new ArrayList<String>();
+
+    public final double score;
+
+    public TopicSuggestion(String uri, List<String> paths, double score) {
+        this.uri = uri;
+        if (paths != null) {
+            this.paths.addAll(paths);
+        }
+        this.score = score;
+    }
+
+    public TopicSuggestion(String uri, double score) {
+        this(uri, null, score);
+    }
+
+}

Added: 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TrainingSet.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TrainingSet.java?rev=1226872&view=auto
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TrainingSet.java
 (added)
+++ 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TrainingSet.java
 Tue Jan  3 17:11:32 2012
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.topic;
+
+import java.util.Calendar;
+import java.util.List;
+import java.util.Set;
+
+/**
+ * Source of categorized text documents that can be used to build the 
statistical model of a
+ * TopicClassifier.
+ */
+public interface TrainingSet {
+
+    /**
+     * @return true if the training set can be updated using the {@code 
registerExample} API. If false that
+     *         means that the component is a view on a remote datasource that 
has its own API for updates
+     *         (e.g. the document repository of a CMS).
+     */
+    boolean isUpdatable() throws TrainingSetException;
+
+    /**
+     * Register some text content to be used as an example of a document that 
should be positively classified as
+     * topics by the model.
+     * 
+     * @param exampleId
+     *            Unique identifier of the example to create or override. If 
null, a new example with a
+     *            generated id will be created.
+     * @param text
+     *            Text content of the example. If null the example with the 
matching id will be deleted.
+     * 
+     * @param topics
+     *            The list of all the topics the example should be classified 
as.
+     * @return the id of the registered example (can be automatically 
generated)
+     */
+    String registerExample(String exampleId, String text, List<String> topics) 
throws TrainingSetException;
+
+    /**
+     * @param lastModificationDate
+     *            typically the date of the last classifier model update or 
null to find the list of all
+     *            topics registered in the dataset.
+     * @return the set of topic ids that received some modifications (e.g. new 
or updated examples) since
+     *         {@code lastModificationDate}.
+     */
+    Set<String> getUpdatedTopics(Calendar lastModificationDate) throws 
TrainingSetException;
+
+    /**
+     * Fetch examples representative of the set of topics passed as argument 
so as to be able to build a
+     * statistical model.
+     * 
+     * @param topics
+     *            list of admissible topics to search examples for: each 
example in the batch will be
+     *            classified in at least one of the requested topics. This list 
would typically comprise a
+     *            topic along with its direct narrower descendants (and maybe 
level 2 descendants too).
+     * @param offset
+     *            marker value to fetch the next batch. Pass null to fetch the 
first batch.
+     * @return a batch of examples suitable for training a classifier model for 
the requested topics.
+     */
+    Batch<String> getPositiveExamples(List<String> topics, Object offset) 
throws TrainingSetException;
+
+    /**
+     * Fetch examples representative of any document not specifically 
classified in one of the passed topics.
+     * This can be useful to train a statistical model for a classifier of 
those topics to negatively weight
+     * generic features (term occurrences) and limit the number of false 
positives in the classification. It
+     * is up to the classifier model to decide to use such negative examples 
or not at training time.
+     * 
+     * @param topics
+     *            list of non-admissible topics to search examples for: each 
example in the batch must not be
+     *            classified in any of the passed topics.
+     * @param offset
+     *            marker value to fetch the next batch. Pass null to fetch the 
first batch.
+     * @return a batch of examples suitable for training (negative-refinement) 
a classifier model for the
+     *         requested topics.
+     */
+    Batch<String> getNegativeExamples(List<String> topics, Object offset) 
throws TrainingSetException;
+
+    /**
+     * Number of examples to fetch at once.
+     */
+    public void setBatchSize();
+
+}

Added: 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TrainingSetException.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TrainingSetException.java?rev=1226872&view=auto
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TrainingSetException.java
 (added)
+++ 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TrainingSetException.java
 Tue Jan  3 17:11:32 2012
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.topic;
+
+import java.io.IOException;
+
+/**
+ * Unexpected Error while performing read or write access to a topic 
classifier training set.
+ */
+public class TrainingSetException extends IOException {
+
+    private static final long serialVersionUID = 1L;
+
+    public TrainingSetException(String message) {
+        super(message);
+    }
+
+}

Modified: 
incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/TopicEngineTest.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/TopicEngineTest.java?rev=1226872&r1=1226871&r2=1226872&view=diff
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/TopicEngineTest.java
 (original)
+++ 
incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/TopicEngineTest.java
 Tue Jan  3 17:11:32 2012
@@ -42,6 +42,7 @@ import org.apache.solr.client.solrj.resp
 import org.apache.solr.common.params.CommonParams;
 import org.apache.solr.core.CoreContainer;
 import org.apache.stanbol.commons.solr.utils.StreamQueryRequest;
+import org.apache.stanbol.enhancer.topic.TopicSuggestion;
 import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;
@@ -128,6 +129,7 @@ public class TopicEngineTest {
         config.put(TopicClassificationEngine.SOLR_CORE, solrServer);
         config.put(TopicClassificationEngine.TOPIC_URI_FIELD, "topic");
         config.put(TopicClassificationEngine.SIMILARTITY_FIELD, "text");
+        config.put(TopicClassificationEngine.BROADER_FIELD, "broader");
         return config;
     }
 
@@ -162,13 +164,53 @@ public class TopicEngineTest {
         // check accept language optional param
         Hashtable<String,Object> configWithAcceptLangage = new 
Hashtable<String,Object>();
         configWithAcceptLangage.putAll(config);
-        configWithAcceptLangage.put(TopicClassificationEngine.LANGUAGE, "en, 
fr");
+        configWithAcceptLangage.put(TopicClassificationEngine.LANGUAGES, "en, 
fr");
         engine = 
TopicClassificationEngine.fromParameters(configWithAcceptLangage);
         assertNotNull(engine);
         assertEquals(engine.acceptedLanguages, Arrays.asList("en", "fr"));
     }
 
     @Test
+    public void testProgrammaticThesaurusConstruction() throws Exception {
+        TopicClassificationEngine engine = 
TopicClassificationEngine.fromParameters(getDefaultConfigParams());
+
+        // Register the roots of the taxonomy
+        engine.addTopic("http://example.com/topics/root1", null);
+        engine.addTopic("http://example.com/topics/root2", null);
+        engine.addTopic("http://example.com/topics/root3", new 
ArrayList<String>());
+        assertEquals(0, 
engine.getBroaderTopics("http://example.com/topics/root1").size());
+        assertEquals(0, 
engine.getBroaderTopics("http://example.com/topics/root2").size());
+        assertEquals(0, 
engine.getBroaderTopics("http://example.com/topics/root3").size());
+        assertEquals(3, engine.getTopicRoots().size());
+
+        // Register some non root nodes
+        engine.addTopic("http://example.com/topics/node1",
+            Arrays.asList("http://example.com/topics/root1", 
"http://example.com/topics/root2"));
+        engine.addTopic("http://example.com/topics/node2", 
Arrays.asList("http://example.com/topics/root3"));
+        engine.addTopic("http://example.com/topics/node3",
+            Arrays.asList("http://example.com/topics/node1", 
"http://example.com/topics/node2"));
+        
+        // the roots were not impacted
+        assertEquals(0, 
engine.getBroaderTopics("http://example.com/topics/root1").size());
+        assertEquals(0, 
engine.getBroaderTopics("http://example.com/topics/root2").size());
+        assertEquals(0, 
engine.getBroaderTopics("http://example.com/topics/root3").size());
+        assertEquals(3, engine.getTopicRoots().size());
+
+        // the other nodes have the same broader topics as at creation time
+        assertEquals(2, 
engine.getBroaderTopics("http://example.com/topics/node1").size());
+        assertEquals(1, 
engine.getBroaderTopics("http://example.com/topics/node2").size());
+        assertEquals(2, 
engine.getBroaderTopics("http://example.com/topics/node3").size());
+
+        // check the induced narrower relationships
+        assertEquals(1, 
engine.getNarrowerTopics("http://example.com/topics/root1").size());
+        assertEquals(1, 
engine.getNarrowerTopics("http://example.com/topics/root2").size());
+        assertEquals(1, 
engine.getNarrowerTopics("http://example.com/topics/root3").size());
+        assertEquals(1, 
engine.getNarrowerTopics("http://example.com/topics/node1").size());
+        assertEquals(1, 
engine.getNarrowerTopics("http://example.com/topics/node2").size());
+        assertEquals(0, 
engine.getNarrowerTopics("http://example.com/topics/node3").size());
+    }
+
+    @Test
     public void testEmptyIndexTopicClassification() throws Exception {
         TopicClassificationEngine engine = 
TopicClassificationEngine.fromParameters(getDefaultConfigParams());
         List<TopicSuggestion> suggestedTopics = engine.suggestTopics("This is 
a test.");

Modified: 
incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/test_schema.xml
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/test_schema.xml?rev=1226872&r1=1226871&r2=1226872&view=diff
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/test_schema.xml
 (original)
+++ 
incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/test_schema.xml
 Tue Jan  3 17:11:32 2012
@@ -59,7 +59,8 @@
  <fields>
    <field name="topic" type="string" indexed="true" stored="true" 
required="true" /> 
    <field name="type" type="string" indexed="true" stored="true" 
multiValued="true" />
-   <field name="paths" type="string" indexed="true" stored="true" 
multiValued="true" /> 
+   <field name="paths" type="string" indexed="true" stored="true" 
multiValued="true" />
+   <field name="broader" type="string" indexed="true" stored="true" 
multiValued="true" />
    <field name="text" type="text" indexed="true" stored="false"
      termVectors="true" termPositions="false" termOffsets="false" />
    <field name="popularity" type="int" indexed="true" stored="true" />

Modified: 
incubator/stanbol/trunk/enhancer/generic/servicesapi/src/main/java/org/apache/stanbol/enhancer/servicesapi/rdf/TechnicalClasses.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/generic/servicesapi/src/main/java/org/apache/stanbol/enhancer/servicesapi/rdf/TechnicalClasses.java?rev=1226872&r1=1226871&r2=1226872&view=diff
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/generic/servicesapi/src/main/java/org/apache/stanbol/enhancer/servicesapi/rdf/TechnicalClasses.java
 (original)
+++ 
incubator/stanbol/trunk/enhancer/generic/servicesapi/src/main/java/org/apache/stanbol/enhancer/servicesapi/rdf/TechnicalClasses.java
 Tue Jan  3 17:11:32 2012
@@ -45,6 +45,20 @@ public class TechnicalClasses {
      */
     public static final UriRef ENHANCER_ENTITYANNOTATION = new UriRef(
             NamespaceEnum.enhancer+"EntityAnnotation");
+    
+    /**
+     * Type used for annotations documents. This type is intended
+     * to be used in combination with ENHANCER_ENHANCEMENT and
+     * ENHANCER_ENTITYANNOTATION as a complementary marker to suggest
+     * that the referenced entity is one of the primary topics of the
+     * whole document or of a specific section specified by a linked
+     * TextAnnotation. 
+     * 
+     * The entity or concept is not necessarily explicitly mentioned
+     * in the document (like a traditional entity occurrence would).
+     */
+    public static final UriRef ENHANCER_TOPICANNOTATION = new UriRef(
+            NamespaceEnum.enhancer+"TopicAnnotation");
 
     /**
      * To be used as a type pour any semantic knowledge extraction


Reply via email to