Author: rwesten
Date: Mon Sep 26 16:03:51 2011
New Revision: 1175921
URL: http://svn.apache.org/viewvc?rev=1175921&view=rev
Log:
Fixes STANBOL-330: Full Text Search tokens that do not contain at least one
alphanumeric character are now ignored
other changes:
* Optimised POS tag set for Swedish
* added "default Language" property information to the metatype.properties of
the KeywordLinkingEngine
Modified:
incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/resources/OSGI-INF/metatype/metatype.properties
incubator/stanbol/trunk/entityhub/query/clerezza/src/main/java/org/apache/stanbol/entityhub/query/clerezza/SparqlQueryUtils.java
incubator/stanbol/trunk/entityhub/query/clerezza/src/test/java/org/apache/stanbol/entityhub/query/clerezza/SparqlQueryUtilsTest.java
Modified:
incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java?rev=1175921&r1=1175920&r2=1175921&view=diff
==============================================================================
---
incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java
(original)
+++
incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java
Mon Sep 26 16:03:51 2011
@@ -141,10 +141,11 @@ public enum PosTagsCollectionEnum {
* NOTE: <ul>
* <li> This includes all typical noun categories as defined by MAMBA
* <li> Unclassifiable part-of-speech and
- * <li> Numerical ("RO" and "EN")
+ * <li> Numerical "RO"
+ * <li> EN is excluded
* </ul>
*/
-
SV_NOUN("sv",PosTypeCollectionType.NOUN,"NN","PN","AN","MN","VN","XX","EN","RO"),
+
SV_NOUN("sv",PosTypeCollectionType.NOUN,"NN","PN","AN","MN","VN","XX","RO"),
/**
* POS types for Verbs of the Swedish language based on the
* <a href="http://w3.msi.vxu.se/users/nivre/research/MAMBAlex.html">
Modified:
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/resources/OSGI-INF/metatype/metatype.properties
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/resources/OSGI-INF/metatype/metatype.properties?rev=1175921&r1=1175920&r2=1175921&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/resources/OSGI-INF/metatype/metatype.properties
(original)
+++
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/resources/OSGI-INF/metatype/metatype.properties
Mon Sep 26 16:03:51 2011
@@ -54,3 +54,6 @@ org.apache.stanbol.enhancer.engines.keyw
org.apache.stanbol.enhancer.engines.keywordextraction.processedLanguages.name=Languages
org.apache.stanbol.enhancer.engines.keywordextraction.processedLanguages.description=Languages
to process. An empty text indicates that all languages are processed. Use ','
as separator for languages (e.g. 'en,de' to enhance only English and German
texts).
+
+org.apache.stanbol.enhancer.engines.keywordextraction.defaultMatchingLanguage.name=Default
Matching Language
+org.apache.stanbol.enhancer.engines.keywordextraction.defaultMatchingLanguage.description=The
language used in addition to the language detected for the analysed text to
search for Entities. Typically this configuration is an empty string to search
for labels without any language defined, but for some data sets (such as
DBpedia.org) that add languages to any labels it might improve results to change
this configuration (e.g. to 'en' in the case of DBpedia.org).
Modified:
incubator/stanbol/trunk/entityhub/query/clerezza/src/main/java/org/apache/stanbol/entityhub/query/clerezza/SparqlQueryUtils.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/entityhub/query/clerezza/src/main/java/org/apache/stanbol/entityhub/query/clerezza/SparqlQueryUtils.java?rev=1175921&r1=1175920&r2=1175921&view=diff
==============================================================================
---
incubator/stanbol/trunk/entityhub/query/clerezza/src/main/java/org/apache/stanbol/entityhub/query/clerezza/SparqlQueryUtils.java
(original)
+++
incubator/stanbol/trunk/entityhub/query/clerezza/src/main/java/org/apache/stanbol/entityhub/query/clerezza/SparqlQueryUtils.java
Mon Sep 26 16:03:51 2011
@@ -32,6 +32,7 @@ import org.apache.clerezza.rdf.core.Trip
import org.apache.clerezza.rdf.core.TripleCollection;
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.clerezza.rdf.core.impl.util.W3CDateFormat;
+import org.apache.commons.lang.StringUtils;
import org.apache.stanbol.entityhub.core.utils.AdaptingIterator;
import org.apache.stanbol.entityhub.model.clerezza.RdfRepresentation;
import org.apache.stanbol.entityhub.model.clerezza.RdfValueFactory;
@@ -687,7 +688,14 @@ public final class SparqlQueryUtils {
boolean firstAndWord = true;
for(String word : words){
word = word.trim();
- if(!word.isEmpty()){
+ boolean hasAlphaNumeric = false;
+ for(int i = 0; i < word.length() && !hasAlphaNumeric;i++){
+ char ch = word.charAt(i);
+ if(Character.isLetter(ch) || Character.isDigit(ch)){
+ hasAlphaNumeric = true;
+ }
+ }
+ if(hasAlphaNumeric){
if(firstAndWord){
firstAndWord = false;
} else {
Modified:
incubator/stanbol/trunk/entityhub/query/clerezza/src/test/java/org/apache/stanbol/entityhub/query/clerezza/SparqlQueryUtilsTest.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/entityhub/query/clerezza/src/test/java/org/apache/stanbol/entityhub/query/clerezza/SparqlQueryUtilsTest.java?rev=1175921&r1=1175920&r2=1175921&view=diff
==============================================================================
---
incubator/stanbol/trunk/entityhub/query/clerezza/src/test/java/org/apache/stanbol/entityhub/query/clerezza/SparqlQueryUtilsTest.java
(original)
+++
incubator/stanbol/trunk/entityhub/query/clerezza/src/test/java/org/apache/stanbol/entityhub/query/clerezza/SparqlQueryUtilsTest.java
Mon Sep 26 16:03:51 2011
@@ -12,14 +12,21 @@ public class SparqlQueryUtilsTest {
@Test
public void testCreateFullTextQueryString() {
List<String> keywords = Arrays.asList("test", "keyword");
- assertEquals("\"test\" OR \"keyword\"",
SparqlQueryUtils.createFullTextQueryString(keywords));
+ assertEquals("\"test\" OR \"keyword\"",
+ SparqlQueryUtils.createFullTextQueryString(keywords));
keywords = Arrays.asList("test keyword");
- assertEquals("(\"test\" AND \"keyword\")",
SparqlQueryUtils.createFullTextQueryString(keywords));
+ assertEquals("(\"test\" AND \"keyword\")",
+ SparqlQueryUtils.createFullTextQueryString(keywords));
keywords = Arrays.asList("'test' \"keyword\"");
assertEquals("(\"'test'\" AND \"\\\"keyword\\\"\")",
SparqlQueryUtils.createFullTextQueryString(keywords));
+
+ keywords = Arrays.asList("1 Alpha ? Numeric Test .");
+ assertEquals("(\"1\" AND \"Alpha\" AND \"Numeric\" AND \"Test\")",
+ SparqlQueryUtils.createFullTextQueryString(keywords));
+
}
}