Author: rwesten
Date: Mon Sep 26 16:03:51 2011
New Revision: 1175921

URL: http://svn.apache.org/viewvc?rev=1175921&view=rev
Log:
Fixes STANBOL-330: Full Text Search tokens that do not contain at least one 
alphanumeric character are now ignored

other changes:

* Optimised POS tag set for Swedish
* added "default Language" property information to the metatype.properties of 
the KeywordLinkingEngine

Modified:
    
incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java
    
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/resources/OSGI-INF/metatype/metatype.properties
    
incubator/stanbol/trunk/entityhub/query/clerezza/src/main/java/org/apache/stanbol/entityhub/query/clerezza/SparqlQueryUtils.java
    
incubator/stanbol/trunk/entityhub/query/clerezza/src/test/java/org/apache/stanbol/entityhub/query/clerezza/SparqlQueryUtilsTest.java

Modified: 
incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java?rev=1175921&r1=1175920&r2=1175921&view=diff
==============================================================================
--- 
incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java
 (original)
+++ 
incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java
 Mon Sep 26 16:03:51 2011
@@ -141,10 +141,11 @@ public enum PosTagsCollectionEnum {
      * NOTE: <ul>
      * <li> This includes all typical noun categories as defined by MAMBA
      * <li> Unclassifiable part-of-speech and
-     * <li> Numerical ("RO" and "EN") 
+     * <li> Numerical "RO"
+     * <li> EN is excluded 
      * </ul>
      */
-    
SV_NOUN("sv",PosTypeCollectionType.NOUN,"NN","PN","AN","MN","VN","XX","EN","RO"),
+    
SV_NOUN("sv",PosTypeCollectionType.NOUN,"NN","PN","AN","MN","VN","XX","RO"),
     /**
      * POS types for Verbs of the Swedish language based on the
      * <a href="http://w3.msi.vxu.se/users/nivre/research/MAMBAlex.html">

Modified: 
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/resources/OSGI-INF/metatype/metatype.properties
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/resources/OSGI-INF/metatype/metatype.properties?rev=1175921&r1=1175920&r2=1175921&view=diff
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/resources/OSGI-INF/metatype/metatype.properties
 (original)
+++ 
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/resources/OSGI-INF/metatype/metatype.properties
 Mon Sep 26 16:03:51 2011
@@ -54,3 +54,6 @@ org.apache.stanbol.enhancer.engines.keyw
 
 
org.apache.stanbol.enhancer.engines.keywordextraction.processedLanguages.name=Languages
 
org.apache.stanbol.enhancer.engines.keywordextraction.processedLanguages.description=Languages
 to process. An empty text indicates that all languages are processed. Use ',' 
as separator for languages (e.g. 'en,de' to enhance only English and German 
texts).
+
+org.apache.stanbol.enhancer.engines.keywordextraction.defaultMatchingLanguage.name=Default
 Matching Language
+org.apache.stanbol.enhancer.engines.keywordextraction.defaultMatchingLanguage.description=The
 language used in addition to the language detected for the analysed text to 
search for Entities. Typically this configuration is an empty string to search 
for labels without any language defined, but for some data sets (such as 
DBpedia.org) that add languages to any labels it might improve results to change 
this configuration (e.g. to 'en' in the case of DBpedia.org).

Modified: 
incubator/stanbol/trunk/entityhub/query/clerezza/src/main/java/org/apache/stanbol/entityhub/query/clerezza/SparqlQueryUtils.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/entityhub/query/clerezza/src/main/java/org/apache/stanbol/entityhub/query/clerezza/SparqlQueryUtils.java?rev=1175921&r1=1175920&r2=1175921&view=diff
==============================================================================
--- 
incubator/stanbol/trunk/entityhub/query/clerezza/src/main/java/org/apache/stanbol/entityhub/query/clerezza/SparqlQueryUtils.java
 (original)
+++ 
incubator/stanbol/trunk/entityhub/query/clerezza/src/main/java/org/apache/stanbol/entityhub/query/clerezza/SparqlQueryUtils.java
 Mon Sep 26 16:03:51 2011
@@ -32,6 +32,7 @@ import org.apache.clerezza.rdf.core.Trip
 import org.apache.clerezza.rdf.core.TripleCollection;
 import org.apache.clerezza.rdf.core.UriRef;
 import org.apache.clerezza.rdf.core.impl.util.W3CDateFormat;
+import org.apache.commons.lang.StringUtils;
 import org.apache.stanbol.entityhub.core.utils.AdaptingIterator;
 import org.apache.stanbol.entityhub.model.clerezza.RdfRepresentation;
 import org.apache.stanbol.entityhub.model.clerezza.RdfValueFactory;
@@ -687,7 +688,14 @@ public final class SparqlQueryUtils {
                 boolean firstAndWord = true;
                 for(String word : words){
                     word = word.trim();
-                    if(!word.isEmpty()){
+                    boolean hasAlphaNumeric = false;
+                    for(int i = 0; i < word.length() && !hasAlphaNumeric;i++){
+                        char ch = word.charAt(i);
+                        if(Character.isLetter(ch) || Character.isDigit(ch)){
+                            hasAlphaNumeric = true;
+                        }
+                    }
+                    if(hasAlphaNumeric){
                         if(firstAndWord){
                             firstAndWord = false;
                         } else {

Modified: 
incubator/stanbol/trunk/entityhub/query/clerezza/src/test/java/org/apache/stanbol/entityhub/query/clerezza/SparqlQueryUtilsTest.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/entityhub/query/clerezza/src/test/java/org/apache/stanbol/entityhub/query/clerezza/SparqlQueryUtilsTest.java?rev=1175921&r1=1175920&r2=1175921&view=diff
==============================================================================
--- 
incubator/stanbol/trunk/entityhub/query/clerezza/src/test/java/org/apache/stanbol/entityhub/query/clerezza/SparqlQueryUtilsTest.java
 (original)
+++ 
incubator/stanbol/trunk/entityhub/query/clerezza/src/test/java/org/apache/stanbol/entityhub/query/clerezza/SparqlQueryUtilsTest.java
 Mon Sep 26 16:03:51 2011
@@ -12,14 +12,21 @@ public class SparqlQueryUtilsTest {
     @Test
     public void testCreateFullTextQueryString() {
         List<String> keywords = Arrays.asList("test", "keyword");
-        assertEquals("\"test\" OR \"keyword\"", 
SparqlQueryUtils.createFullTextQueryString(keywords));
+        assertEquals("\"test\" OR \"keyword\"", 
+            SparqlQueryUtils.createFullTextQueryString(keywords));
 
         keywords = Arrays.asList("test keyword");
-        assertEquals("(\"test\" AND \"keyword\")", 
SparqlQueryUtils.createFullTextQueryString(keywords));
+        assertEquals("(\"test\" AND \"keyword\")", 
+            SparqlQueryUtils.createFullTextQueryString(keywords));
 
         keywords = Arrays.asList("'test' \"keyword\"");
         assertEquals("(\"'test'\" AND \"\\\"keyword\\\"\")",
             SparqlQueryUtils.createFullTextQueryString(keywords));
+        
+        keywords = Arrays.asList("1 Alpha ? Numeric Test .");
+        assertEquals("(\"1\" AND \"Alpha\" AND \"Numeric\" AND \"Test\")",
+            SparqlQueryUtils.createFullTextQueryString(keywords));
+        
     }
 
 }


Reply via email to