Author: rwesten
Date: Thu Aug 23 07:50:45 2012
New Revision: 1376396
URL: http://svn.apache.org/viewvc?rev=1376396&view=rev
Log:
Merged revisions 1376385 and 1376046 from trunk
Modified:
incubator/stanbol/branches/dbpedia-spotlight-engines/chain/allactive/src/main/java/org/apache/stanbol/enhancer/chain/allactive/impl/DefaultChain.java
incubator/stanbol/branches/dbpedia-spotlight-engines/chain/allactive/src/main/resources/OSGI-INF/metatype/metatype.properties
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/langdetect/src/main/java/org/apache/stanbol/enhancer/engines/langdetect/LanguageDetectionEnhancementEngine.java
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/langdetect/src/main/resources/OSGI-INF/metatype/metatype.properties
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/langdetect/src/test/java/org/apache/stanbol/enhancer/engines/langdetect/LanguageDetectionEngineTest.java
Modified:
incubator/stanbol/branches/dbpedia-spotlight-engines/chain/allactive/src/main/java/org/apache/stanbol/enhancer/chain/allactive/impl/DefaultChain.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/chain/allactive/src/main/java/org/apache/stanbol/enhancer/chain/allactive/impl/DefaultChain.java?rev=1376396&r1=1376395&r2=1376396&view=diff
==============================================================================
---
incubator/stanbol/branches/dbpedia-spotlight-engines/chain/allactive/src/main/java/org/apache/stanbol/enhancer/chain/allactive/impl/DefaultChain.java
(original)
+++
incubator/stanbol/branches/dbpedia-spotlight-engines/chain/allactive/src/main/java/org/apache/stanbol/enhancer/chain/allactive/impl/DefaultChain.java
Thu Aug 23 07:50:45 2012
@@ -29,6 +29,7 @@ import org.apache.stanbol.enhancer.servi
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
import org.osgi.framework.Constants;
import org.osgi.framework.ServiceRegistration;
+import org.osgi.service.cm.ConfigurationException;
import org.osgi.service.component.ComponentContext;
/**
@@ -51,24 +52,47 @@ public class DefaultChain {
@Property(boolValue=DefaultChain.DEFAULT_STATE)
public static final String PROPERTY_ENABLED =
"stanbol.enhancer.chain.default.enabled";
-
+
+ @Property(value=DefaultChain.DEFAULT_NAME)
+ public static final String PROPERTY_NAME =
"stanbol.enhancer.chain.default.name";
+
+
public static final boolean DEFAULT_STATE = true;
+ public static final String DEFAULT_NAME = "default";
private ServiceRegistration defaultChainReg;
private AllActiveEnginesChain defaultChain;
@Activate
- protected void activate(ComponentContext ctx){
+ protected void activate(ComponentContext ctx) throws
ConfigurationException {
boolean enabled = DEFAULT_STATE;
Object value = ctx.getProperties().get(PROPERTY_ENABLED);
if(value != null){
enabled = Boolean.parseBoolean(value.toString());
}
+ value = ctx.getProperties().get(PROPERTY_NAME);
+ String name = value == null ? DEFAULT_NAME : value.toString();
+ if(name.isEmpty()){
+ throw new ConfigurationException(PROPERTY_NAME, "The parsed name
for the default chain MUST NOT be empty!");
+ }
+ int ranking;
+ value = ctx.getProperties().get(Constants.SERVICE_RANKING);
+ if(value instanceof Number){
+ ranking = ((Number)value).intValue();
+ } else if(value != null){
+ try {
+ ranking = Integer.parseInt(value.toString());
+ }catch (NumberFormatException e) {
+ throw new ConfigurationException(Constants.SERVICE_RANKING,
"Unable to pase Integer service.ranking value",e);
+ }
+ } else {
+ ranking = Integer.MIN_VALUE;
+ }
if(enabled){
- defaultChain = new
AllActiveEnginesChain(ctx.getBundleContext(),"default");
+ defaultChain = new
AllActiveEnginesChain(ctx.getBundleContext(),name);
Dictionary<String,Object> properties = new
Hashtable<String,Object>();
properties.put(Chain.PROPERTY_NAME, defaultChain.getName());
- properties.put(Constants.SERVICE_RANKING, Integer.MIN_VALUE);
+ properties.put(Constants.SERVICE_RANKING, ranking);
defaultChainReg = ctx.getBundleContext().registerService(
Chain.class.getName(), defaultChain, properties);
}
Modified:
incubator/stanbol/branches/dbpedia-spotlight-engines/chain/allactive/src/main/resources/OSGI-INF/metatype/metatype.properties
URL:
http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/chain/allactive/src/main/resources/OSGI-INF/metatype/metatype.properties?rev=1376396&r1=1376395&r2=1376396&view=diff
==============================================================================
---
incubator/stanbol/branches/dbpedia-spotlight-engines/chain/allactive/src/main/resources/OSGI-INF/metatype/metatype.properties
(original)
+++
incubator/stanbol/branches/dbpedia-spotlight-engines/chain/allactive/src/main/resources/OSGI-INF/metatype/metatype.properties
Thu Aug 23 07:50:45 2012
@@ -28,7 +28,12 @@ the default Chain including all currentl
stanbol.enhancer.chain.default.enabled.name=Enabled
stanbol.enhancer.chain.default.enabled.description=Allows to enable/disable
the registration \
-the default chain. See the documentation for more information about that
feature.
+the default chain including all currently active Enhancement Engines. \
+See the documentation for more information about that feature.
+
+stanbol.enhancer.chain.default.name.name=Name
+stanbol.enhancer.chain.default.name.description=The name of the Enhancement
Chain that includes \
+all active Enhancement Engines.
#===============================================================================
# AllActiveEnginesChain
Modified:
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/langdetect/src/main/java/org/apache/stanbol/enhancer/engines/langdetect/LanguageDetectionEnhancementEngine.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/langdetect/src/main/java/org/apache/stanbol/enhancer/engines/langdetect/LanguageDetectionEnhancementEngine.java?rev=1376396&r1=1376395&r2=1376396&view=diff
==============================================================================
---
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/langdetect/src/main/java/org/apache/stanbol/enhancer/engines/langdetect/LanguageDetectionEnhancementEngine.java
(original)
+++
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/langdetect/src/main/java/org/apache/stanbol/enhancer/engines/langdetect/LanguageDetectionEnhancementEngine.java
Thu Aug 23 07:50:45 2012
@@ -77,9 +77,14 @@ public class LanguageDetectionEnhancemen
/**
* a configurable value of the text segment length to check
*/
- @Property
+ @Property(intValue=LanguageDetectionEnhancementEngine.PROBE_LENGTH_DEFAULT)
public static final String PROBE_LENGTH_PROP =
"org.apache.stanbol.enhancer.engines.langdetect.probe-length";
+ /**
+ * a configurable value of the maximum number of suggested languages
+ */
+
@Property(intValue=LanguageDetectionEnhancementEngine.DEFAULT_MAX_SUGGESTED_LANGUAGES)
+ public static final String MAX_SUGGESTED_PROP =
"org.apache.stanbol.enhancer.engines.langdetect.max-suggested";
/**
* The default value for the Execution of this Engine. Currently set to
@@ -105,7 +110,19 @@ public class LanguageDetectionEnhancemen
*/
private static final Logger log =
LoggerFactory.getLogger(LanguageDetectionEnhancementEngine.class);
- private static final int PROBE_LENGTH_DEFAULT = 1000;
+ /*
+ * NOTE: Checked the Documentation: The tool already supports the taking
+ * of several shorter samples randomly distributed over the parsed text
+ * to improve results and reduce noise. See
+ * http://code.google.com/p/language-detection/wiki/FrequentlyAskedQuestion
+ * "Each detected language differs for the same document" for a hint.
+ */
+ private static final int PROBE_LENGTH_DEFAULT = -1;
+
+ /**
+ * Default value for the maximum number of suggested Languages
+ */
+ private static final int DEFAULT_MAX_SUGGESTED_LANGUAGES = 3;
/**
* How much text should be used for testing: If the value is 0 or smaller,
@@ -114,6 +131,8 @@ public class LanguageDetectionEnhancemen
*/
private int probeLength = PROBE_LENGTH_DEFAULT;
+ private int maxSuggestedLanguages = DEFAULT_MAX_SUGGESTED_LANGUAGES;
+
/**
* The literal factory
*/
@@ -134,8 +153,34 @@ public class LanguageDetectionEnhancemen
if (ce != null) {
@SuppressWarnings("unchecked")
Dictionary<String, String> properties = ce.getProperties();
- String lengthVal = properties.get(PROBE_LENGTH_PROP);
- probeLength = lengthVal == null ? PROBE_LENGTH_DEFAULT :
Integer.parseInt(lengthVal);
+ Object value = properties.get(PROBE_LENGTH_PROP);
+ if(value instanceof Number){
+ probeLength = ((Number)value).intValue();
+ } else if(value != null){
+ try {
+ probeLength = Integer.parseInt(value.toString());
+ } catch (NumberFormatException e) {
+ throw new ConfigurationException(PROBE_LENGTH_PROP,
+ "The parsed 'proble length' MUST be a valid Integer",
e);
+ }
+ } else {
+ probeLength = PROBE_LENGTH_DEFAULT;
+ }
+ value = properties.get(MAX_SUGGESTED_PROP);
+ if(value instanceof Number){
+ maxSuggestedLanguages = ((Number)value).intValue();
+ } else if(value != null){
+ try {
+ maxSuggestedLanguages = Integer.parseInt(value.toString());
+ } catch (NumberFormatException e) {
+ throw new ConfigurationException(MAX_SUGGESTED_PROP,
+ "The parsed number of the maximum suggested lanugages "
+ + "MUST BE a valid Integer", e);
+ }
+ }
+ if(maxSuggestedLanguages < 1){
+ maxSuggestedLanguages = DEFAULT_MAX_SUGGESTED_LANGUAGES;
+ }
}
languageIdentifier = new LanguageIdentifier();
}
@@ -143,6 +188,8 @@ public class LanguageDetectionEnhancemen
protected void deactivate(ComponentContext ce) {
super.deactivate(ce);
this.languageIdentifier = null;
+ this.maxSuggestedLanguages = -1;
+ this.probeLength = -1;
}
public int canEnhance(ContentItem ci) throws EngineException {
@@ -190,16 +237,20 @@ public class LanguageDetectionEnhancemen
}
// add language to metadata
- if (languages.size() > 0) {
+ if (languages != null) {
MGraph g = ci.getMetadata();
ci.getLock().writeLock().lock();
- // add best hypothesis
- Language oneLang = languages.get(0);
try {
- UriRef textEnhancement =
EnhancementEngineHelper.createTextEnhancement(ci, this);
- g.add(new TripleImpl(textEnhancement, DC_LANGUAGE, new
PlainLiteralImpl(oneLang.lang)));
- g.add(new TripleImpl(textEnhancement, ENHANCER_CONFIDENCE,
literalFactory.createTypedLiteral(oneLang.prob)));
- g.add(new TripleImpl(textEnhancement, DC_TYPE,
DCTERMS_LINGUISTIC_SYSTEM));
+ for(int i=0;i<maxSuggestedLanguages && i<languages.size();i++){
+ // add a hypothesis
+ Language hypothesis = languages.get(i);
+ UriRef textEnhancement =
EnhancementEngineHelper.createTextEnhancement(ci, this);
+ g.add(new TripleImpl(textEnhancement, DC_LANGUAGE, new
PlainLiteralImpl(hypothesis.lang)));
+ g.add(new TripleImpl(textEnhancement, ENHANCER_CONFIDENCE,
literalFactory.createTypedLiteral(hypothesis.prob)));
+ g.add(new TripleImpl(textEnhancement, DC_TYPE,
DCTERMS_LINGUISTIC_SYSTEM));
+ g.add(new TripleImpl(textEnhancement, ENHANCER_CONFIDENCE,
+ literalFactory.createTypedLiteral(hypothesis.prob)));
+ }
} finally {
ci.getLock().writeLock().unlock();
}
Modified:
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/langdetect/src/main/resources/OSGI-INF/metatype/metatype.properties
URL:
http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/langdetect/src/main/resources/OSGI-INF/metatype/metatype.properties?rev=1376396&r1=1376395&r2=1376396&view=diff
==============================================================================
---
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/langdetect/src/main/resources/OSGI-INF/metatype/metatype.properties
(original)
+++
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/langdetect/src/main/resources/OSGI-INF/metatype/metatype.properties
Thu Aug 23 07:50:45 2012
@@ -30,3 +30,13 @@ org.apache.stanbol.enhancer.engines.lang
Enhancer Engine: Language Identification
org.apache.stanbol.enhancer.engines.langdetect.LanguageDetectionEnhancementEngine.description=Detects
\
the Language for parsed Text.
+
+org.apache.stanbol.enhancer.engines.langdetect.max-suggested.name=Max
Suggested Languages
+org.apache.stanbol.enhancer.engines.langdetect.max-suggested.description=This \
+Engine supports the suggestion of multiple languages with confidence values.
This \
+allows to configure how many languages are suggested at a maximum (default: 3).
+
+org.apache.stanbol.enhancer.engines.langdetect.probe-length.name=Probe Length
+org.apache.stanbol.enhancer.engines.langdetect.probe-length.description= The \
+maximum number of characters used for language detection. Note that the used \
+library already supports random selection of text parts (default: -1
(deactivated))
Modified:
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/langdetect/src/test/java/org/apache/stanbol/enhancer/engines/langdetect/LanguageDetectionEngineTest.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/langdetect/src/test/java/org/apache/stanbol/enhancer/engines/langdetect/LanguageDetectionEngineTest.java?rev=1376396&r1=1376395&r2=1376396&view=diff
==============================================================================
---
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/langdetect/src/test/java/org/apache/stanbol/enhancer/engines/langdetect/LanguageDetectionEngineTest.java
(original)
+++
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/langdetect/src/test/java/org/apache/stanbol/enhancer/engines/langdetect/LanguageDetectionEngineTest.java
Thu Aug 23 07:50:45 2012
@@ -21,6 +21,7 @@ import static org.apache.stanbol.enhance
import static
org.apache.stanbol.enhancer.test.helper.EnhancementStructureHelper.validateAllTextAnnotations;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
import java.io.IOException;
import java.io.InputStream;
@@ -40,6 +41,7 @@ import org.apache.stanbol.enhancer.servi
import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
import org.apache.stanbol.enhancer.servicesapi.impl.StringSource;
import org.apache.stanbol.enhancer.servicesapi.rdf.Properties;
+import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;
import org.osgi.service.cm.ConfigurationException;
@@ -122,10 +124,9 @@ public class LanguageDetectionEngineTest
expectedValues.put(Properties.DC_CREATOR,
LiteralFactory.getInstance().createTypedLiteral(
langIdEngine.getClass().getName()));
int textAnnotationCount = validateAllTextAnnotations(ci.getMetadata(),
text, expectedValues);
- assertEquals("A single TextAnnotation is expected",
1,textAnnotationCount);
- //even through this tests do not validate service quality but rather
- //the correct integration of the CELI service as EnhancementEngine
- //we expect the "en" is detected for the parsed text
+ assertTrue("A TextAnnotation is expected", textAnnotationCount > 0);
+ //even though these tests do not validate detection quality
+ //we expect the "en" is detected as best guess for the parsed text
assertEquals("The detected language for text '"+text+"' MUST BE 'en'",
"en",EnhancementEngineHelper.getLanguage(ci));