Author: ogrisel
Date: Tue Dec 6 20:09:44 2011
New Revision: 1211079
URL: http://svn.apache.org/viewvc?rev=1211079&view=rev
Log:
STANBOL-197: work in progress on topic classifier
Added:
incubator/stanbol/trunk/enhancer/engines/topic/ (with props)
incubator/stanbol/trunk/enhancer/engines/topic/pom.xml
incubator/stanbol/trunk/enhancer/engines/topic/sling/
incubator/stanbol/trunk/enhancer/engines/topic/sling/datafiles/
incubator/stanbol/trunk/enhancer/engines/topic/src/
incubator/stanbol/trunk/enhancer/engines/topic/src/main/
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicSuggestion.java
incubator/stanbol/trunk/enhancer/engines/topic/src/main/resources/
incubator/stanbol/trunk/enhancer/engines/topic/src/test/
incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/
incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/
incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/
incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/
incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/
incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/
incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/
incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/TopicEngineTest.java
incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/integration/
incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/integration/TopicClassificationOSGiTest.java
incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/
incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/test_schema.xml
incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/test_solr.xml
incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/test_solrconfig.xml
Modified:
incubator/stanbol/trunk/enhancer/engines/pom.xml
incubator/stanbol/trunk/parent/pom.xml
incubator/stanbol/trunk/pom.xml
Modified: incubator/stanbol/trunk/enhancer/engines/pom.xml
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/pom.xml?rev=1211079&r1=1211078&r2=1211079&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/pom.xml (original)
+++ incubator/stanbol/trunk/enhancer/engines/pom.xml Tue Dec 6 20:09:44 2011
@@ -43,10 +43,11 @@
<modules>
<module>../autotagging</module>
-
+
<module>autotagging</module>
<module>opennlp-ner</module>
<module>langid</module>
+ <module>topic</module>
<module>metaxa</module>
<module>geonames</module>
<module>entitytagging</module>
@@ -55,7 +56,7 @@
<!-- RICK based enhancement engine(s) -->
<module>opencalais</module>
<module>zemanta</module>
-
+
<module>../clerezza/org.apache.stanbol.enhancer.clerezza</module>
<module>../clerezza/clerezza-sparql</module>
<module>../SemiAutomaticContentEnhancer</module>
Propchange: incubator/stanbol/trunk/enhancer/engines/topic/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Tue Dec 6 20:09:44 2011
@@ -0,0 +1,4 @@
+target
+.settings
+.classpath
+.project
Added: incubator/stanbol/trunk/enhancer/engines/topic/pom.xml
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/pom.xml?rev=1211079&view=auto
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/topic/pom.xml (added)
+++ incubator/stanbol/trunk/enhancer/engines/topic/pom.xml Tue Dec 6 20:09:44
2011
@@ -0,0 +1,237 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more
contributor
+ license agreements. See the NOTICE file distributed with this work for
additional
+ information regarding copyright ownership. The ASF licenses this file to
+ You under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
http://maven.apache.org/maven-v4_0_0.xsd">
+
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.enhancer.parent</artifactId>
+ <version>0.9.0-incubating-SNAPSHOT</version>
+ <relativePath>../../parent</relativePath>
+ </parent>
+
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.enhancer.engine.topic</artifactId>
+ <packaging>bundle</packaging>
+
+ <name>Apache Stanbol Enhancer Enhancement Engine : Topic
Classification</name>
+ <description>
+ Implementation of an annotation engine that links the content item
+ to a set of possible categories from a dedicated Solr index using
+ MoreLikeThis queries.
+
+ The classification can be either applied to a complete document
+ (text in a given language) which is the default behavior or to a
+ specific portion of the text (using a TextAnnotation).
+ </description>
+
+ <inceptionYear>2010</inceptionYear>
+
+ <scm>
+ <connection>
+
scm:svn:http://svn.apache.org/repos/asf/incubator/stanbol/trunk/enhancer/engines/topic/
+ </connection>
+ <developerConnection>
+
scm:svn:https://svn.apache.org/repos/asf/incubator/stanbol/trunk/enhancer/engines/topic/
+ </developerConnection>
+ <url>http://incubator.apache.org/stanbol/</url>
+ </scm>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.servicemix.tooling</groupId>
+ <artifactId>depends-maven-plugin</artifactId>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>maven-bundle-plugin</artifactId>
+ <extensions>true</extensions>
+ <configuration>
+ <instructions>
+ <Export-Package>
+
org.apache.stanbol.enhancer.engine.topic.*;version=${project.version}
+ </Export-Package>
+ </instructions>
+ </configuration>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>maven-scr-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
+ <dependencies>
+ <!-- Runtime dependencies for testing
+
+ WARNING: the testing dependencies should be put first because that has an
+ impact on the classpath ordering and the OSGi implementation from the test
+ framework needs to be picked up over the one that comes from other build
+ dependencies (e.g. clerezza) otherwise the test will fail to run with maven
+ with the following error message:
+
+ java.lang.NoClassDefFoundError: org.osgi.vendor.framework property not set
+ -->
+ <dependency>
+ <groupId>org.ops4j.pax.exam</groupId>
+ <artifactId>pax-exam</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.ops4j.pax.exam</groupId>
+ <artifactId>pax-exam-junit4</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <!-- provides the javax.inject.Inject annotation -->
+ <groupId>org.apache.geronimo.specs</groupId>
+ <artifactId>geronimo-atinject_1.0_spec</artifactId>
+ <version>1.0</version>
+ <scope>provided</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.ops4j.pax.exam</groupId>
+ <artifactId>pax-exam-link-assembly</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.ops4j.pax.exam</groupId>
+ <artifactId>pax-exam-testforge</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.ops4j.pax.exam</groupId>
+ <artifactId>pax-exam-spi</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.ops4j.pax.url</groupId>
+ <artifactId>pax-url-mvn</artifactId>
+ <scope>test</scope>
+ <version>1.3.5</version>
+ </dependency>
+ <!-- The following container should make it possible to run the OSGi
+ tests faster by using Felix in the host JVM rather than forking a
+ dedicated JVM. -->
+ <dependency>
+ <groupId>org.ops4j.pax.exam</groupId>
+ <artifactId>pax-exam-container-native</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>org.apache.felix.framework</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>org.apache.felix.scr</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <!-- The following container can be used instead of the native one if
+ we want to fork a new JVM and / or test against Equinox for instance -->
+ <!--
+ <dependency>
+ <groupId>org.ops4j.pax.exam</groupId>
+ <artifactId>pax-exam-container-paxrunner</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.ops4j.pax.runner</groupId>
+ <artifactId>pax-runner-no-jcl</artifactId>
+ <scope>test</scope>
+ </dependency>
+ -->
+
+ <dependency>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>org.apache.felix.http.jetty</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>org.apache.felix.configadmin</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.httpcomponents</groupId>
+ <artifactId>httpcore-osgi</artifactId>
+ <version>4.0.1</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.clerezza</groupId>
+ <artifactId>utils</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-simple</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.clerezza.ext</groupId>
+ <artifactId>com.ibm.icu</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.wymiwyg</groupId>
+ <artifactId>wymiwyg-commons-core</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>commons-codec</groupId>
+ <artifactId>commons-codec</artifactId>
+ <scope>test</scope>
+ </dependency>
+
+
+ <!-- Normal build dependencies -->
+ <dependency>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.enhancer.servicesapi</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.commons.solr.core</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.commons.solr.managed</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>commons-io</groupId>
+ <artifactId>commons-io</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>org.apache.felix.scr.annotations</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.clerezza</groupId>
+ <artifactId>rdf.core</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-api</artifactId>
+ </dependency>
+
+ </dependencies>
+
+</project>
Added:
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java?rev=1211079&view=auto
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
(added)
+++
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
Tue Dec 6 20:09:44 2011
@@ -0,0 +1,298 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.engine.topic;
+
+import static
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.NIE_PLAINTEXTCONTENT;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Dictionary;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.clerezza.rdf.core.Triple;
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.commons.io.IOUtils;
+import org.apache.felix.scr.annotations.Activate;
+import org.apache.felix.scr.annotations.Component;
+import org.apache.felix.scr.annotations.ConfigurationPolicy;
+import org.apache.felix.scr.annotations.Deactivate;
+import org.apache.felix.scr.annotations.Properties;
+import org.apache.felix.scr.annotations.Property;
+import org.apache.felix.scr.annotations.Service;
+import org.apache.solr.client.solrj.SolrQuery;
+import org.apache.solr.client.solrj.SolrServer;
+import org.apache.solr.client.solrj.SolrServerException;
+import org.apache.solr.client.solrj.response.QueryResponse;
+import org.apache.solr.common.SolrDocument;
+import org.apache.solr.common.SolrDocumentList;
+import org.apache.solr.common.params.CommonParams;
+import org.apache.solr.common.params.MoreLikeThisParams;
+import org.apache.stanbol.commons.solr.IndexReference;
+import org.apache.stanbol.commons.solr.RegisteredSolrServerTracker;
+import org.apache.stanbol.commons.solr.utils.StreamQueryRequest;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+import org.apache.stanbol.enhancer.servicesapi.EngineException;
+import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.InvalidContentException;
+import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
+import org.osgi.framework.InvalidSyntaxException;
+import org.osgi.service.cm.ConfigurationException;
+import org.osgi.service.component.ComponentContext;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Enhancement Engine that provides the ability to assign a text document to a
set of topics indexed in a
+ * dedicated Solr core. The assignment logic comes from terms frequencies
match of the text of the document to
+ * categorize with the text indexed for each topic.
+ *
+ * The solr server is expected to be configured with the MoreLikeThisHandler
and the matching fields from the
+ * engine configuration.
+ */
+@Component(metatype = true, immediate = true, configurationFactory = true,
policy = ConfigurationPolicy.REQUIRE)
+@Service
+@Properties(value = {@Property(name = TopicClassificationEngine.ENGINE_ID),
+ @Property(name = TopicClassificationEngine.ORDER,
intValue = 100),
+ @Property(name = TopicClassificationEngine.SOLR_CORE),
+ @Property(name = TopicClassificationEngine.LANGUAGE),
+ @Property(name =
TopicClassificationEngine.SIMILARTITY_FIELD),
+ @Property(name =
TopicClassificationEngine.TOPIC_URI_FIELD),
+ @Property(name =
TopicClassificationEngine.MATERIALIZED_PATH_FIELD)})
+public class TopicClassificationEngine implements EnhancementEngine,
ServiceProperties {
+
+ public static final String ENGINE_ID =
"org.apache.stanbol.enhancer.engine.id";
+
+ public static final String SOLR_CORE =
"org.apache.stanbol.enhancer.engine.topic.solrCore";
+
+ public static final String LANGUAGE =
"org.apache.stanbol.enhancer.engine.topic.language";
+
+ public static final String ORDER =
"org.apache.stanbol.enhancer.engine.topic.order";
+
+ public static final String SIMILARTITY_FIELD =
"org.apache.stanbol.enhancer.engine.topic.similarityField";
+
+ public static final String TOPIC_URI_FIELD =
"org.apache.stanbol.enhancer.engine.topic.uriField";
+
+ public static final String MATERIALIZED_PATH_FIELD =
"org.apache.stanbol.enhancer.engine.topic.materializedPathField";
+
+ private static final Logger log =
LoggerFactory.getLogger(TopicClassificationEngine.class);
+
+ protected String engineId;
+
+ protected String solrCoreId;
+
+ protected List<String> acceptedLanguages;
+
+ protected Integer order = ORDERING_EXTRACTION_ENHANCEMENT;
+
+ protected RegisteredSolrServerTracker indexTracker;
+
+ // instance of solrServer to use if not using the OSGi service tracker
(e.g. for tests)
+ protected SolrServer solrServer;
+
+ protected String similarityField;
+
+ protected String topicUriField;
+
+ protected String materializedPathField;
+
+ protected ComponentContext context;
+
+ protected int numTopics = 10;
+
+    /**
+     * OSGi activation callback: stores the component context and applies the
+     * properties it carries as the engine configuration.
+     *
+     * @param context the component context handed over on activation
+     * @throws ConfigurationException if {@link #configure(Dictionary)}
+     *             rejects the component properties
+     * @throws InvalidSyntaxException declared in the signature; this body
+     *             does not raise it directly
+     */
+    @Activate
+    protected void activate(ComponentContext context)
+            throws ConfigurationException, InvalidSyntaxException {
+        @SuppressWarnings("unchecked")
+        Dictionary<String,Object> componentProperties = context.getProperties();
+        // configure() reads this.context when it has to open a Solr service
+        // tracker, hence the context is stored before configuring.
+        this.context = context;
+        configure(componentProperties);
+    }
+
+    /**
+     * OSGi deactivation callback: closes the Solr service tracker when one
+     * has been opened by {@link #configure(Dictionary)}.
+     *
+     * @param context the component context (not used by this method)
+     */
+    @Deactivate
+    public void deactivate(ComponentContext context) {
+        if (indexTracker == null) {
+            // no tracker was ever opened, nothing to release
+            return;
+        }
+        indexTracker.close();
+    }
+
+    /**
+     * Applies the engine configuration.
+     *
+     * Required entries: {@link #ENGINE_ID}, {@link #SIMILARTITY_FIELD},
+     * {@link #TOPIC_URI_FIELD} and {@link #SOLR_CORE}. Optional entries:
+     * {@link #LANGUAGE}, {@link #MATERIALIZED_PATH_FIELD} and {@link #ORDER}.
+     *
+     * The {@link #SOLR_CORE} entry is either a {@link SolrServer} instance,
+     * which is then used directly (useful for unit tests), or the reference
+     * of a Solr core that is looked up with an OSGi service tracker. The
+     * latter requires the component context to be set beforehand.
+     *
+     * @param config the configuration entries
+     * @throws ConfigurationException if a required entry is missing, if a
+     *             core reference is given without any OSGi context, if the
+     *             core reference cannot be tracked or if the order value is
+     *             not an integer
+     */
+    public void configure(Dictionary<String,Object> config)
+            throws ConfigurationException {
+        engineId = getRequiredStringParam(config, ENGINE_ID);
+        similarityField = getRequiredStringParam(config, SIMILARTITY_FIELD);
+        topicUriField = getRequiredStringParam(config, TOPIC_URI_FIELD);
+        acceptedLanguages = getStringListParan(config, LANGUAGE);
+
+        Object solrCoreValue = config.get(SOLR_CORE);
+        if (solrCoreValue instanceof SolrServer) {
+            // Bind a fixed Solr server client instead of doing a dynamic
+            // OSGi lookup using the service tracker. The Solr server is
+            // expected to be configured with the MoreLikeThisHandler and
+            // the matching fields from the configuration.
+            solrServer = (SolrServer) solrCoreValue;
+        } else {
+            // Assign the field (a local variable of the same name used to
+            // shadow it, leaving the field null in later log messages).
+            solrCoreId = getRequiredStringParam(config, SOLR_CORE);
+            if (context == null) {
+                throw new ConfigurationException(SOLR_CORE, SOLR_CORE
+                        + " should be a SolrServer instance for using"
+                        + " the engine without any OSGi context. Got: "
+                        + solrCoreId);
+            }
+            try {
+                indexTracker = new RegisteredSolrServerTracker(
+                        context.getBundleContext(),
+                        IndexReference.parse(solrCoreId));
+                indexTracker.open();
+            } catch (InvalidSyntaxException e) {
+                throw new ConfigurationException(SOLR_CORE, e.getMessage(), e);
+            }
+        }
+
+        // optional field, can be null; read from its own configuration key
+        // (it used to be read from TOPIC_URI_FIELD by mistake)
+        Object pathFieldValue = config.get(MATERIALIZED_PATH_FIELD);
+        materializedPathField = pathFieldValue == null
+                ? null : pathFieldValue.toString();
+
+        // optional ordering: accept any number or a numeric string instead
+        // of failing with a ClassCastException on non-Integer values
+        Object orderParamValue = config.get(ORDER);
+        if (orderParamValue instanceof Number) {
+            order = ((Number) orderParamValue).intValue();
+        } else if (orderParamValue != null) {
+            try {
+                order = Integer.valueOf(orderParamValue.toString().trim());
+            } catch (NumberFormatException e) {
+                throw new ConfigurationException(ORDER, ORDER
+                        + " should be an integer. Got: " + orderParamValue, e);
+            }
+        }
+    }
+
+ protected String getRequiredStringParam(Dictionary<String,Object>
parameters, String paramName) throws ConfigurationException {
+ return getRequiredStringParam(parameters, paramName, null);
+ }
+
+    /**
+     * Reads a configuration entry as a string, falling back to a default
+     * value when the entry is missing.
+     *
+     * @param config the configuration entries
+     * @param paramName the key of the entry to read
+     * @param defaultValue the value returned when the entry is missing, or
+     *            null when the entry is mandatory
+     * @return the string form of the configured value, or the default value
+     * @throws ConfigurationException if the entry is missing and no default
+     *             value was given
+     */
+    protected String getRequiredStringParam(Dictionary<String,Object> config,
+                                            String paramName,
+                                            String defaultValue)
+            throws ConfigurationException {
+        Object configured = config.get(paramName);
+        if (configured != null) {
+            return configured.toString();
+        }
+        if (defaultValue != null) {
+            return defaultValue;
+        }
+        throw new ConfigurationException(paramName,
+                paramName + " is a required parameter.");
+    }
+
+    /**
+     * Reads an optional, multi-valued configuration entry.
+     *
+     * Supported value types are a comma separated string, a string array
+     * and a list. For the comma separated form each token is trimmed and
+     * empty tokens are dropped, so that an empty or blank string yields an
+     * empty list rather than a list holding one empty string.
+     *
+     * @param config the configuration entries
+     * @param paramName the key of the entry to read
+     * @return the configured values, an empty list if the entry is missing
+     * @throws ConfigurationException if the value has an unsupported type
+     */
+    @SuppressWarnings("unchecked")
+    protected List<String> getStringListParan(Dictionary<String,Object> config,
+                                              String paramName)
+            throws ConfigurationException {
+        Object paramValue = config.get(paramName);
+        if (paramValue == null) {
+            return new ArrayList<String>();
+        }
+        if (paramValue instanceof String) {
+            List<String> values = new ArrayList<String>();
+            for (String token : ((String) paramValue).split(",")) {
+                String trimmed = token.trim();
+                if (trimmed.length() > 0) {
+                    values.add(trimmed);
+                }
+            }
+            return values;
+        }
+        if (paramValue instanceof String[]) {
+            return Arrays.asList((String[]) paramValue);
+        }
+        if (paramValue instanceof List) {
+            return (List<String>) paramValue;
+        }
+        throw new ConfigurationException(paramName, String.format(
+                "Unexpected parameter type for '%s': %s", paramName, paramValue));
+    }
+
+    /**
+     * Tells whether this engine can process the given content item.
+     *
+     * The availability of the Solr server is checked first so that the
+     * content is not read at all when no classification can take place.
+     *
+     * @param ci the content item to check
+     * @return {@code CANNOT_ENHANCE} if no Solr server is available or if
+     *         the extracted text is blank, {@code ENHANCE_SYNCHRONOUS}
+     *         otherwise
+     * @throws EngineException if the text of the content item cannot be read
+     */
+    @Override
+    public int canEnhance(ContentItem ci) throws EngineException {
+        if (getActiveSolrServer() == null) {
+            log.warn("Solr Core '{}' is not available.", solrCoreId);
+            return CANNOT_ENHANCE;
+        }
+        String text = getTextFromContentItem(ci);
+        if (text.trim().length() == 0) {
+            return CANNOT_ENHANCE;
+        }
+        return ENHANCE_SYNCHRONOUS;
+    }
+
+    /**
+     * Extracts the text of the content item and runs the topic suggestion
+     * on it. The suggestions are currently discarded.
+     *
+     * @param ci the content item to classify
+     * @throws EngineException if the text cannot be read or if the topic
+     *             suggestion fails
+     */
+    @Override
+    public void computeEnhancements(ContentItem ci) throws EngineException {
+        // TODO: express the results as RDF.
+        suggestTopics(getTextFromContentItem(ci));
+    }
+
+    /**
+     * Suggests topics for a text by sending it as the stream body of a
+     * MoreLikeThis query to the active Solr server.
+     *
+     * At most {@code numTopics} suggestions are requested. Each suggestion
+     * carries the first value of the configured topic URI field of a
+     * matching Solr document; the score is not computed yet and is always
+     * reported as 0.0. Documents without a value for that field are skipped.
+     *
+     * @param text the text to classify
+     * @return the suggested topics, possibly empty, never null
+     * @throws EngineException if no Solr server is available or if the
+     *             query fails, for instance because the MoreLikeThisHandler
+     *             is not configured on the Solr server
+     */
+    public List<TopicSuggestion> suggestTopics(String text) throws EngineException {
+        List<TopicSuggestion> suggestedTopics =
+                new ArrayList<TopicSuggestion>(numTopics);
+        SolrServer solrServer = getActiveSolrServer();
+        if (solrServer == null) {
+            // fail with an explicit message rather than with an NPE raised
+            // while processing the request
+            throw new EngineException(new IllegalStateException(String.format(
+                    "Solr Core '%s' is not available.", solrCoreId)));
+        }
+        SolrQuery query = new SolrQuery();
+        query.setQueryType("/" + MoreLikeThisParams.MLT);
+        query.set(MoreLikeThisParams.MATCH_INCLUDE, false);
+        query.set(MoreLikeThisParams.MIN_DOC_FREQ, 1);
+        query.set(MoreLikeThisParams.MIN_TERM_FREQ, 1);
+        // TODO: find a way to parse the interesting terms and report them
+        // for debugging / explanation in dedicated RDF datastructure.
+        // query.set(MoreLikeThisParams.INTERESTING_TERMS, "details");
+        query.set(MoreLikeThisParams.SIMILARITY_FIELDS, similarityField);
+        query.set(CommonParams.STREAM_BODY, text);
+        query.setRows(numTopics);
+        try {
+            StreamQueryRequest request = new StreamQueryRequest(query);
+            QueryResponse response = request.process(solrServer);
+            SolrDocumentList results = response.getResults();
+            if (results != null) {
+                for (SolrDocument result : results) {
+                    // Use the configured name of the URI field: the
+                    // TOPIC_URI_FIELD constant is only the configuration key.
+                    Object uri = result.getFirstValue(topicUriField);
+                    if (uri != null) {
+                        suggestedTopics.add(new TopicSuggestion(uri.toString(), 0.0));
+                    }
+                }
+            }
+        } catch (SolrServerException e) {
+            // the cause is optional: guard against a NullPointerException
+            Throwable cause = e.getCause();
+            if (cause != null && "unknown handler: /mlt".equals(cause.getMessage())) {
+                String message = String.format(
+                        "SolrServer with id '%s' for topic engine '%s' lacks"
+                        + " configuration for the MoreLikeThisHandler",
+                        solrCoreId, engineId);
+                throw new EngineException(message, e);
+            } else {
+                throw new EngineException(e);
+            }
+        }
+        return suggestedTopics;
+    }
+
+    /**
+     * @return the manually bound solrServer instance or the one tracked by
+     *         the OSGi service tracker; null if neither a server nor a
+     *         tracker has been set up, or if the tracker currently has no
+     *         service to offer.
+     */
+    protected SolrServer getActiveSolrServer() {
+        if (solrServer != null) {
+            return solrServer;
+        }
+        // The tracker only exists once configure() has been given a core
+        // reference: avoid a NullPointerException on unconfigured engines.
+        return indexTracker != null ? indexTracker.getService() : null;
+    }
+
+ @Override
+ public Map<String,Object> getServiceProperties() {
+ return
Collections.unmodifiableMap(Collections.singletonMap(ENHANCEMENT_ENGINE_ORDERING,
+ (Object) order));
+ }
+
+    /**
+     * Extracts the text to classify from a content item.
+     *
+     * For content whose mime type starts with "text/plain" the content
+     * stream is decoded as UTF-8 and closed afterwards. For any other (or
+     * missing) mime type the objects of the NIE_PLAINTEXTCONTENT triples
+     * found in the metadata of the content item are concatenated.
+     *
+     * @param ci the content item to read
+     * @return the extracted text, possibly empty, never null
+     * @throws InvalidContentException if the content stream cannot be read
+     */
+    protected String getTextFromContentItem(ContentItem ci)
+            throws InvalidContentException {
+        // Refactor the following using an adapter.
+        String mimeType = ci.getMimeType();
+        if (mimeType != null && mimeType.startsWith("text/plain")) {
+            java.io.InputStream stream = ci.getStream();
+            try {
+                // TODO: handle explicit charsets if any and fallback to
+                // UTF-8 if missing
+                return IOUtils.toString(stream, "UTF-8");
+            } catch (IOException e) {
+                throw new InvalidContentException(this, ci, e);
+            } finally {
+                // IOUtils.toString does not close the stream it reads
+                IOUtils.closeQuietly(stream);
+            }
+        }
+        StringBuilder text = new StringBuilder();
+        Iterator<Triple> it = ci.getMetadata().filter(
+                new UriRef(ci.getId()), NIE_PLAINTEXTCONTENT, null);
+        while (it.hasNext()) {
+            text.append(it.next().getObject());
+        }
+        return text.toString();
+    }
+
+ public static TopicClassificationEngine
fromParameters(Dictionary<String,Object> config) throws ConfigurationException {
+ TopicClassificationEngine engine = new TopicClassificationEngine();
+ engine.configure(config);
+ return engine;
+ }
+
+}
Added:
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicSuggestion.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicSuggestion.java?rev=1211079&view=auto
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicSuggestion.java
(added)
+++
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicSuggestion.java
Tue Dec 6 20:09:44 2011
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.engine.topic;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Data transfer object for the individual topic classification results.
+ */
+public class TopicSuggestion {
+
+ public final String uri;
+
+ public final List<String> paths = new ArrayList<String>();
+
+ public final double score;
+
+ public TopicSuggestion(String uri, List<String> paths, double score) {
+ this.uri = uri;
+ if (paths != null) {
+ this.paths.addAll(paths);
+ }
+ this.score = score;
+ }
+
+ public TopicSuggestion(String uri, double score) {
+ this(uri, null, score);
+ }
+
+}
Added:
incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/TopicEngineTest.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/TopicEngineTest.java?rev=1211079&view=auto
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/TopicEngineTest.java
(added)
+++
incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/TopicEngineTest.java
Tue Dec 6 20:09:44 2011
@@ -0,0 +1,140 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.engine.topic;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.fail;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Hashtable;
+import java.util.List;
+
+import javax.xml.parsers.ParserConfigurationException;
+
+import junit.framework.TestCase;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.io.IOUtils;
+import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
+import org.apache.solr.core.CoreContainer;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+import org.osgi.service.cm.ConfigurationException;
+import org.xml.sax.SAXException;
+
+public class TopicEngineTest {
+
+ EmbeddedSolrServer solrServer;
+
+ File solrHome;
+
+ @Before
+ public void makeEmptyEmbeddedSolrServer() throws IOException,
ParserConfigurationException, SAXException {
+ solrHome = File.createTempFile("topicEngineTest_", "_solr_folder");
+ solrHome.delete();
+ solrHome.mkdir();
+
+ // solr conf file
+ File solrFile = new File(solrHome, "solr.xml");
+ InputStream is = getClass().getResourceAsStream("/test_solr.xml");
+ TestCase.assertNotNull("missing test solr.xml file", is);
+ IOUtils.copy(is, new FileOutputStream(solrFile));
+
+ // solr conf folder with schema
+ File solrConfFolder = new File(solrHome, "conf");
+ solrConfFolder.mkdir();
+ File schemaFile = new File(solrConfFolder, "schema.xml");
+ is = getClass().getResourceAsStream("/test_schema.xml");
+ TestCase.assertNotNull("missing test solr schema.xml file", is);
+ IOUtils.copy(is, new FileOutputStream(schemaFile));
+
+ File solrConfigFile = new File(solrConfFolder, "solrconfig.xml");
+ is = getClass().getResourceAsStream("/test_solrconfig.xml");
+ TestCase.assertNotNull("missing test solrconfig.xml file", is);
+ IOUtils.copy(is, new FileOutputStream(solrConfigFile));
+
+ // create the embedded server
+ CoreContainer coreContainer = new
CoreContainer(solrHome.getAbsolutePath());
+ solrServer = new EmbeddedSolrServer(coreContainer, "test");
+ }
+
+    /**
+     * Removes the temporary Solr home folder created for the test; failures
+     * to delete it are ignored.
+     */
+    @After
+    public void cleanupEmbeddedSolrServer() {
+        final File temporaryHome = solrHome;
+        FileUtils.deleteQuietly(temporaryHome);
+    }
+
+    /**
+     * @return a fresh configuration holding the engine id, the embedded
+     *         Solr server of the test, the topic URI field name and the
+     *         similarity field name.
+     */
+    protected Hashtable<String,Object> getDefaultConfigParams() {
+        Hashtable<String,Object> defaults = new Hashtable<String,Object>();
+        defaults.put(TopicClassificationEngine.SIMILARTITY_FIELD, "text");
+        defaults.put(TopicClassificationEngine.TOPIC_URI_FIELD, "topic");
+        defaults.put(TopicClassificationEngine.SOLR_CORE, solrServer);
+        defaults.put(TopicClassificationEngine.ENGINE_ID, "test-engine");
+        return defaults;
+    }
+
+    /**
+     * Checks that the configuration entries end up in the matching engine
+     * attributes, that missing required entries are rejected and that the
+     * optional accepted languages entry is parsed as a list.
+     */
+    @Test
+    public void testEngineConfiguation() throws ConfigurationException {
+        Hashtable<String,Object> config = getDefaultConfigParams();
+        TopicClassificationEngine engine =
+                TopicClassificationEngine.fromParameters(config);
+        assertNotNull(engine);
+        // JUnit expects (expected, actual): keeping that order makes the
+        // failure messages report the right values
+        assertEquals("test-engine", engine.engineId);
+        assertEquals(solrServer, engine.solrServer);
+        assertEquals("topic", engine.topicUriField);
+        assertEquals("text", engine.similarityField);
+        assertEquals(new ArrayList<String>(), engine.acceptedLanguages);
+
+        // check some required attributes
+        Hashtable<String,Object> configWithMissingTopicField =
+                new Hashtable<String,Object>();
+        configWithMissingTopicField.putAll(config);
+        configWithMissingTopicField.remove(TopicClassificationEngine.TOPIC_URI_FIELD);
+        try {
+            TopicClassificationEngine.fromParameters(configWithMissingTopicField);
+            fail("Should have raised a ConfigurationException");
+        } catch (ConfigurationException e) {
+            // expected: the topic URI field is required
+        }
+
+        Hashtable<String,Object> configWithMissingEngineId =
+                new Hashtable<String,Object>();
+        configWithMissingEngineId.putAll(config);
+        configWithMissingEngineId.remove(TopicClassificationEngine.ENGINE_ID);
+        try {
+            TopicClassificationEngine.fromParameters(configWithMissingEngineId);
+            fail("Should have raised a ConfigurationException");
+        } catch (ConfigurationException e) {
+            // expected: the engine id is required
+        }
+
+        // check accept language optional param
+        Hashtable<String,Object> configWithAcceptLangage =
+                new Hashtable<String,Object>();
+        configWithAcceptLangage.putAll(config);
+        configWithAcceptLangage.put(TopicClassificationEngine.LANGUAGE, "en, fr");
+        engine = TopicClassificationEngine.fromParameters(configWithAcceptLangage);
+        assertNotNull(engine);
+        assertEquals(Arrays.asList("en", "fr"), engine.acceptedLanguages);
+    }
+
+    // Not run by JUnit: the @Test annotation below is commented out.
+    //@Test
+    public void testClassificationTest() throws Exception {
+        Hashtable<String,Object> config = getDefaultConfigParams();
+        TopicClassificationEngine engine =
+                TopicClassificationEngine.fromParameters(config);
+        // only checks that a (possibly empty) list of suggestions is returned
+        List<TopicSuggestion> suggestedTopics = engine.suggestTopics("This is a test.");
+        assertNotNull(suggestedTopics);
+        // TODO implement me
+    }
+}
Added:
incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/integration/TopicClassificationOSGiTest.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/integration/TopicClassificationOSGiTest.java?rev=1211079&view=auto
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/integration/TopicClassificationOSGiTest.java
(added)
+++
incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/integration/TopicClassificationOSGiTest.java
Tue Dec 6 20:09:44 2011
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.engine.topic.integration;
+
+import static org.ops4j.pax.exam.CoreOptions.equinox;
+import static org.ops4j.pax.exam.CoreOptions.felix;
+import static org.ops4j.pax.exam.CoreOptions.junitBundles;
+import static org.ops4j.pax.exam.CoreOptions.mavenBundle;
+import static org.ops4j.pax.exam.CoreOptions.options;
+import static org.ops4j.pax.exam.CoreOptions.systemProperty;
+
+import java.util.Dictionary;
+import java.util.Hashtable;
+
+import javax.inject.Inject;
+
+import junit.framework.TestCase;
+
+import org.apache.stanbol.enhancer.engine.topic.TopicClassificationEngine;
+import org.junit.Before;
+import org.ops4j.pax.exam.Option;
+import org.osgi.framework.BundleContext;
+import org.osgi.framework.ServiceReference;
+import org.osgi.service.cm.Configuration;
+import org.osgi.service.cm.ConfigurationAdmin;
+import org.osgi.service.http.HttpService;
+
+// OSGi (Pax Exam) integration test for TopicClassificationEngine; disabled because SCR configuration factory init is crashing
+//@RunWith(JUnit4TestRunner.class)
+//@ExamReactorStrategy(AllConfinedStagedReactorFactory.class)
+public class TopicClassificationOSGiTest {
+
+ @Inject
+ BundleContext context;
+
+ // Inject the HTTP service to ensure that the Jetty init thread is finished before
tearing down otherwise the test
+ // harness will crash.
+ @Inject
+ HttpService httpService;
+
+ @Before
+ public void registerSolrCore() throws Exception {
+ // TODO: register the Solr core whose coreId the test method should pass to the engine configuration
+ }
+
+ @org.ops4j.pax.exam.junit.Configuration()
+ public Option[] config() {
+ return options(
+ systemProperty("org.osgi.service.http.port").value("8181"),
+
systemProperty("org.ops4j.pax.logging.DefaultServiceLog.level").value("WARN"),
+ mavenBundle("commons-codec", "commons-codec").versionAsInProject(),
+ mavenBundle("org.apache.httpcomponents",
"httpcore-osgi").versionAsInProject(),
+ mavenBundle("commons-io", "commons-io").versionAsInProject(),
+ // for some reason: versionAsInProject does not work for the
following:
+ mavenBundle("org.apache.clerezza.ext",
"com.ibm.icu").version("0.5-incubating-SNAPSHOT"),
+ mavenBundle("org.wymiwyg",
"wymiwyg-commons-core").versionAsInProject(),
+ mavenBundle("org.apache.commons",
"commons-compress").versionAsInProject(),
+ mavenBundle("org.apache.felix",
"org.apache.felix.configadmin").versionAsInProject(),
+ mavenBundle("org.apache.felix",
"org.apache.felix.http.jetty").versionAsInProject(),
+ mavenBundle("org.apache.felix",
"org.apache.felix.scr").versionAsInProject(),
+ mavenBundle("org.apache.stanbol",
"org.apache.stanbol.commons.stanboltools.datafileprovider")
+ .versionAsInProject(),
+ mavenBundle("org.apache.stanbol",
"org.apache.stanbol.commons.solr.core").versionAsInProject(),
+ mavenBundle("org.apache.stanbol",
"org.apache.stanbol.commons.solr.managed").versionAsInProject(),
+ mavenBundle("org.apache.clerezza", "utils").versionAsInProject(),
+ mavenBundle("org.apache.clerezza",
"rdf.core").versionAsInProject(),
+
+ mavenBundle("org.apache.stanbol",
"org.apache.stanbol.enhancer.servicesapi").versionAsInProject(),
+
+ // TODO: instead of deploying a previous version of the bundle
built by maven, find a way to wrap
+ // the engine class as a bundle directly in this test runtime.
+ mavenBundle("org.apache.stanbol",
"org.apache.stanbol.enhancer.engine.topic")
+ .versionAsInProject(), junitBundles(), felix(), equinox());
+ // Note: the equinox tests can only be run if the test container is
switched to the slower non-native,
+ // implementation
+ }
+
+ // Disabled integration test because SCR configuration factory init is
crashing
+ //@Test
+ public void testTopicClassification() throws Exception {
+ System.out.println("Running test on bundle: " + context.getBundle());
+ ServiceReference reference =
context.getServiceReference(ConfigurationAdmin.class.getName());
+
+ ConfigurationAdmin configAdmin = (ConfigurationAdmin)
context.getService(reference);
+ Configuration config =
configAdmin.createFactoryConfiguration(TopicClassificationEngine.class
+ .getName());
+ Dictionary<String,String> parameters = new Hashtable<String,String>();
+ parameters.put(TopicClassificationEngine.ENGINE_ID, "testclassifier");
+ // TODO: put the coreId of the Solr server registered in @Before
+ config.update(parameters);
+
+ // TODO: use a service tracker to wait for the registration of the
service
+ ServiceReference topicEngineReference =
context.getServiceReference(TopicClassificationEngine.class
+ .getName());
+ TestCase.assertNotNull(topicEngineReference);
+ TopicClassificationEngine engine = (TopicClassificationEngine) context
+ .getService(topicEngineReference);
+ TestCase.assertNotNull(engine);
+ // TODO: test classification here
+ }
+}
Added:
incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/test_schema.xml
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/test_schema.xml?rev=1211079&view=auto
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/test_schema.xml
(added)
+++
incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/test_schema.xml
Tue Dec 6 20:09:44 2011
@@ -0,0 +1,71 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<schema name="example" version="1.3">
+ <types>
+ <fieldType name="string" class="solr.StrField" sortMissingLast="true"
+ omitNorms="true"/>
+
+ <fieldType name="int" class="solr.TrieIntField" precisionStep="0"
+ omitNorms="true" positionIncrementGap="0"/>
+
+ <fieldType name="random" class="solr.RandomSortField" indexed="true" />
+
+ <fieldType name="text" class="solr.TextField">
+ <analyzer type="index">
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true"
+ words="stopwords_en.txt" enablePositionIncrements="false" />
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <!-- Shingles might help improve the quality, but they increase
+ the size of the index far too much. It would be better to use a
+ collocation bloom filter to mitigate this effect:
+ http://issues.apache.org/jira/browse/MAHOUT-415
+
+ <filter class="solr.ShingleFilterFactory" maxShingleSize="2"
+ outputUnigrams="true"/>
+ -->
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true"
+ words="stopwords_en.txt" enablePositionIncrements="false" />
+ <filter class="solr.SynonymFilterFactory"
+ synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <!-- Shingle filter disabled here too, as in the index analyzer above:
+ <filter class="solr.ShingleFilterFactory" maxShingleSize="2"
+ outputUnigrams="true"/>
+ -->
+ </analyzer>
+ </fieldType>
+
+ </types>
+
+ <fields>
+ <field name="id" type="string" indexed="true" stored="true" required="true"
/>
+ <field name="type" type="string" indexed="true" stored="true"
multiValued="true" />
+ <field name="paths" type="string" indexed="true" stored="true"
multiValued="true" />
+ <field name="text" type="text" indexed="true" stored="false"
+ termVectors="true" termPositions="false" termOffsets="false" />
+ <field name="popularity" type="int" indexed="true" stored="true" />
+ </fields>
+
+ <uniqueKey>id</uniqueKey>
+ <defaultSearchField>text</defaultSearchField>
+ <solrQueryParser defaultOperator="AND"/>
+</schema>
Added:
incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/test_solr.xml
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/test_solr.xml?rev=1211079&view=auto
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/test_solr.xml
(added)
+++
incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/test_solr.xml
Tue Dec 6 20:09:44 2011
@@ -0,0 +1,27 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<solr persistent="false">
+
+ <!--
+ adminPath: RequestHandler path used to manage cores.
+ If 'null' (or absent), cores will not be manageable via a request handler.
+ -->
+ <cores adminPath="/admin/cores" defaultCoreName="test">
+ <core name="test" instanceDir="test" />
+ </cores>
+</solr>