enha...

rwesten Mon, 26 Sep 2011 06:06:58 -0700

Author: rwesten
Date: Mon Sep 26 13:06:26 2011
New Revision: 1175851

URL: http://svn.apache.org/viewvc?rev=1175851&view=rev
Log:
Some Improvements for the KeywordLinkingEngine:


* Added Support for POS (part of speech) based keyword extraction for three 
additional languages
    * Portuguese (pt)
    * Dutch (nl)
    * Swedish (sv)
* Added "default language" config param: see description of this param for 
details
* The OpenNLP based AnalysedContent implementation now caches the Tokenizer to 
avoid multiple calls to OpenNLP
* Corrected a bug in the matching of languages within the EntityLinker

Modified:
    
incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java
    incubator/stanbol/trunk/data/opennlp/lang/da/   (props changed)
    incubator/stanbol/trunk/data/opennlp/lang/de/   (props changed)
    incubator/stanbol/trunk/data/opennlp/lang/nl/   (props changed)
    incubator/stanbol/trunk/data/opennlp/lang/pt/   (props changed)
    incubator/stanbol/trunk/data/opennlp/ner/es/   (props changed)
    incubator/stanbol/trunk/data/opennlp/ner/nl/   (props changed)
    
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java
    
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/ProcessingState.java
    
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java
    
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinkerConfig.java
    
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/OpenNlpAnalysedContentFactory.java

Modified: 
incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java?rev=1175851&r1=1175850&r2=1175851&view=diff
==============================================================================
--- 
incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java
 (original)
+++ 
incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java
 Mon Sep 26 13:06:26 2011
@@ -90,8 +90,74 @@ public enum PosTagsCollectionEnum {
      * </ul>
      */
     DA_FOLLOW("da",PosTypeCollectionType.FOLLOW,"XP","XA","SP","CS","CC","U"),
-    
-    ;
+    /**
+     * POS types for Nouns based on the
+     * <a 
href="http://beta.visl.sdu.dk/visl/pt/symbolset-floresta.html">PALAVRAS tag 
set</a>
+     * for Portuguese.<p>
+     * TODO: Someone who speaks this language should check this List<p>
+     * NOTES: Currently this includes nouns, proper nouns and numbers.
+     */
+    PT_NOUN("pt",PosTypeCollectionType.NOUN,"n","num","prop"),
+    /**
+     * POS types for Verbs based on the
+     * <a 
href="http://beta.visl.sdu.dk/visl/pt/symbolset-floresta.html">PALAVRAS tag 
set</a>
+     * for Portuguese.<p>
+     * TODO: Someone who speaks this language should check this List<p>
+     */
+    PT_VERB("pt",PosTypeCollectionType.VERB,"v-pcp","v-fin","v-inf","v-ger"),
+    /**
+     * POS types followed to build Chunks based on the
+     * <a 
href="http://beta.visl.sdu.dk/visl/pt/symbolset-floresta.html">PALAVRAS tag 
set</a>
+     * for Portuguese.<p>
+     * TODO: Someone who speaks this language should check this List<p>
+     * NOTES: Currently this includes punctuation and prepositions.
+     */
+    PT_FOLLOW("pt",PosTypeCollectionType.FOLLOW,"punc", "prp"),
+    /**
+     * POS types for Nouns based on the WOTAN tagset for Dutch (as used with 
+     * Mbt).<p>
+     * TODO: Someone who speaks this language should check this List<p>
+     * NOTES: This includes now Nouns, Numbers and "others".
+     */
+    NL_NOUN("nl",PosTypeCollectionType.NOUN,"N","Num","Misc"),
+    /**
+     * POS types for Verbs based on the WOTAN tagset for Dutch (as used with 
+     * Mbt).<p>
+     * The tagger does not distinguish the different forms of verbs. Therefore
+     * it is enough to include "V"
+     */
+    NL_VERB("nl",PosTypeCollectionType.VERB,"V"),
+    /**
+     * POS types followed to build Chunks based on the WOTAN tagset for Dutch 
+     * (as used with Mbt).<p>
+     * NOTES: This includes only prepositions and punctuation
+     * 
+     */
+    NL_FOLLOW("nl",PosTypeCollectionType.FOLLOW,"Punc","Prep"),
+    /**
+     * POS types for Nouns for Swedish language based on 
+     * <a href="http://w3.msi.vxu.se/users/nivre/research/MAMBAlex.html">
+     * Lexical categories in MAMBA</a>
+     * NOTE: <ul>
+     * <li> This includes all typical noun categories as defined by MAMBA
+     * <li> Unclassifiable part-of-speech and
+     * <li> Numerical ("RO" and "EN") 
+     * </ul>
+     */
+    
SV_NOUN("sv",PosTypeCollectionType.NOUN,"NN","PN","AN","MN","VN","XX","EN","RO"),
+    /**
+     * POS types for Verbs of the Swedish language based on the
+     * <a href="http://w3.msi.vxu.se/users/nivre/research/MAMBAlex.html">
+     * Lexical categories in MAMBA</a>
+     */
+    
SV_VERB("sv",PosTypeCollectionType.VERB,"MV","AV","BV","FV","GV","HV","KV","QV","SV","VV","WV"),
+    /**
+     * POS types followed to build Chunks based on the TODO
+     * <p>
+     * NOTES: this includes  prepositions, Part of idiom, Infinitive marker
+     *  as well as all kinds of punctuations
+     */
+    
SV_FOLLOW("sv",PosTypeCollectionType.FOLLOW,"PR","ID","IM","I?","IC","IG","IK","IP","IQ","IR","IS","IT","IU");
     Set<String> tags;
     private String language;
     private PosTypeCollectionType type;

Propchange: incubator/stanbol/trunk/data/opennlp/lang/da/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Mon Sep 26 13:06:26 2011
@@ -0,0 +1 @@
+target

Propchange: incubator/stanbol/trunk/data/opennlp/lang/de/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Mon Sep 26 13:06:26 2011
@@ -0,0 +1 @@
+target

Propchange: incubator/stanbol/trunk/data/opennlp/lang/nl/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Mon Sep 26 13:06:26 2011
@@ -0,0 +1 @@
+target

Propchange: incubator/stanbol/trunk/data/opennlp/lang/pt/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Mon Sep 26 13:06:26 2011
@@ -0,0 +1 @@
+target

Propchange: incubator/stanbol/trunk/data/opennlp/ner/es/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Mon Sep 26 13:06:26 2011
@@ -0,0 +1 @@
+target

Propchange: incubator/stanbol/trunk/data/opennlp/ner/nl/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Mon Sep 26 13:06:26 2011
@@ -0,0 +1 @@
+target

Modified: 
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java?rev=1175851&r1=1175850&r2=1175851&view=diff
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java
 (original)
+++ 
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java
 Mon Sep 26 13:06:26 2011
@@ -89,7 +89,8 @@ import org.slf4j.LoggerFactory;
         intValue=EntityLinkerConfig.DEFAULT_MIN_SEARCH_TOKEN_LENGTH),
     @Property(name=KeywordLinkingEngine.MAX_SUGGESTIONS,
         intValue=EntityLinkerConfig.DEFAULT_SUGGESTIONS),
-    @Property(name=KeywordLinkingEngine.PROCESSED_LANGUAGES,value="")
+    @Property(name=KeywordLinkingEngine.PROCESSED_LANGUAGES,value=""),
+    @Property(name=KeywordLinkingEngine.DEFAULT_MATCHING_LANGUAGE,value="")
 })
 public class KeywordLinkingEngine implements EnhancementEngine, 
ServiceProperties{
 
@@ -117,6 +118,7 @@ public class KeywordLinkingEngine implem
     public static final String MAX_SUGGESTIONS = 
"org.apache.stanbol.enhancer.engines.keywordextraction.maxSuggestions";
     public static final String PROCESSED_LANGUAGES = 
"org.apache.stanbol.enhancer.engines.keywordextraction.processedLanguages";
     public static final String MIN_FOUND_TOKENS= 
"org.apache.stanbol.enhancer.engines.keywordextraction.minFoundTokens";
+    public static final String DEFAULT_MATCHING_LANGUAGE = 
"org.apache.stanbol.enhancer.engines.keywordextraction.defaultMatchingLanguage";
 //  public static final String SIMPLE_TOKENIZER = 
"org.apache.stanbol.enhancer.engines.keywordextraction.simpleTokenizer";
 //  public static final String ENABLE_CHUNKER = 
"org.apache.stanbol.enhancer.engines.keywordextraction.enableChunker";
     /**
@@ -605,6 +607,19 @@ public class KeywordLinkingEngine implem
                     Arrays.toString(RedirectProcessingMode.values()));
             }
         }
+        //init the default matching language (DEFAULT_MATCHING_LANGUAGE)
+        value = configuration.get(DEFAULT_MATCHING_LANGUAGE);
+        if(value != null){
+            String defaultLang = value.toString().trim();
+            if(defaultLang.isEmpty()){
+                config.setDefaultLanguage(null);
+            } else if(defaultLang.length() == 1){
+                throw new ConfigurationException(DEFAULT_MATCHING_LANGUAGE, 
"Illegal language code '"+
+                    defaultLang+"'! Language Codes MUST BE at least 2 chars 
long.");
+            } else {
+                config.setDefaultLanguage(defaultLang);
+            }
+        }
     }
 
     /**

Modified: 
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/ProcessingState.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/ProcessingState.java?rev=1175851&r1=1175850&r2=1175851&view=diff
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/ProcessingState.java
 (original)
+++ 
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/ProcessingState.java
 Mon Sep 26 13:06:26 2011
@@ -75,6 +75,14 @@ public class ProcessingState {
         return token;
     }
     /**
+     * Getter for the language of the current Token (based on the current
+     * sentence)
+     * @return the language
+     */
+    public final String getLanguage() {
+        return sentence.getLanguage();
+    }
+    /**
      * The currently active chunk or <code>null</code> if no chunks are
      * available. If chunks are present this can not be <code>null</code>
      * because {@link Token}s outside of chunks are skiped.

Modified: 
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java?rev=1175851&r1=1175850&r2=1175851&view=diff
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java
 (original)
+++ 
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java
 Mon Sep 26 13:06:26 2011
@@ -250,7 +250,7 @@ public class EntityLinker {
     private List<Suggestion> lookupEntities(List<String> searchStrings) {
         Collection<? extends Representation> results = entitySearcher.lookup(
             config.getNameField(),config.getSelectedFields(),
-            searchStrings, state.getSentence().getLanguage());
+            searchStrings, 
state.getSentence().getLanguage(),config.getDefaultLanguage());
         List<Suggestion> suggestions = new ArrayList<Suggestion>();
         for(Representation result : results){
             Suggestion match = matchLabels(result);
@@ -282,13 +282,27 @@ public class EntityLinker {
      * @return The result of the matching.
      */
     private Suggestion matchLabels(Representation rep) {
+        String curLang = state.getLanguage(); //language of the current 
sentence
+        String defLang = config.getDefaultLanguage(); //configured default 
language 
+//        Iterator<Text> labels = rep.get(config.getNameField(), //get all 
labels
+//            state.getLanguage(), //in the current language
+//            config.getDefaultLanguage()); //and the default language
         Iterator<Text> labels = rep.getText(config.getNameField());
         Suggestion match = new Suggestion(rep);
         while(labels.hasNext()){
             Text label = labels.next();
-            //NOTE: I use here startWith language because I want 'en-GB' 
labels accepted for 'en'
-            if(label.getLanguage() == null || label.getLanguage().startsWith(
-                    state.getSentence().getLanguage())){
+            String lang = label.getLanguage();
+            //check the language of the current label
+            //NOTE: String.startsWith is used to match 'en-GB' with 'en'
+            if((lang == null && ( //if lang is null
+                            defLang == null || //default lang is null
+                            curLang == null)) //or current lang is null
+                    || (lang != null && ( //if lang is not null
+                            //NOTE: startsWith does not accept null arguments
+                            curLang != null && lang.startsWith(curLang) || 
//match with current
+                            defLang != null && lang.startsWith(defLang)) //or 
match with default
+                        ) //end or
+                    ){ //end if
                 String text = label.getText().toLowerCase();
                 List<String> labelTokens = 
Arrays.asList(content.tokenize(text));
                 int foundTokens = 0;
@@ -307,6 +321,7 @@ public class EntityLinker {
                         if(isProcessable){
                             foundTokens++; //only count processable Tokens
                         }
+                        //TODO: maybe move this also in the "isProcessable" ...
                         foundInLabelIndex = 
found+currentToken.getText().length();
                         lastFoundIndex = currentIndex;
                     } else { //not found

Modified: 
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinkerConfig.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinkerConfig.java?rev=1175851&r1=1175850&r2=1175851&view=diff
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinkerConfig.java
 (original)
+++ 
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinkerConfig.java
 Mon Sep 26 13:06:26 2011
@@ -66,6 +66,12 @@ public class EntityLinkerConfig {
      */
     public static final String DEFAULT_REDIRECT_FIELD = "rdfs:seeAlso";
     /**
+     * The default language used to search for labels regardless of the 
language
+     * of the text. The default value is <code>null</code>, which causes
+     * labels that do not have a language assigned to be included.
+     */
+    public static final String DEFAULT_LANGUAGE = null;
+    /**
      * Default mapping for Concept types to dc:type values added for
      * TextAnnotations.
      */
@@ -158,6 +164,11 @@ public class EntityLinkerConfig {
     private String typeField;
     private Set<String> selectedFields = new HashSet<String>();
     /**
+     * The language always included in searches (regardless of the language
+     * detected for the text).
+     */
+    private String defaultLanguage = DEFAULT_LANGUAGE;
+    /**
      * Default constructor the initialises the configuration with the 
      * default values
      */
@@ -396,4 +407,23 @@ public class EntityLinkerConfig {
     public Map<String,UriRef> getTypeMappings() {
         return unmodTypeMappings;
     }
+    /**
+     * Setter for the language of labels searched in addition to the current
+     * language of the text. Setting this to <code>null</code> (also the 
default)
+     * causes labels without any defined language to be searched.<p>
+     * Changing this makes only sense if a dataset (such as dbpedia.org) adds
+     * language tags to labels even if they are typically used in any language.
+     * @param defaultLanguage the default language
+     */
+    public void setDefaultLanguage(String defaultLanguage) {
+        this.defaultLanguage = defaultLanguage;
+    }
+    /**
+     * Getter for the language of labels searched in addition to the current
+     * language of the text.
+     * @return the default language 
+     */
+    public String getDefaultLanguage() {
+        return defaultLanguage;
+    }
 }
\ No newline at end of file

Modified: 
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/OpenNlpAnalysedContentFactory.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/OpenNlpAnalysedContentFactory.java?rev=1175851&r1=1175850&r2=1175851&view=diff
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/OpenNlpAnalysedContentFactory.java
 (original)
+++ 
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/OpenNlpAnalysedContentFactory.java
 Mon Sep 26 13:06:26 2011
@@ -6,6 +6,7 @@ import java.util.Iterator;
 import java.util.Map;
 import java.util.Set;
 
+import opennlp.tools.tokenize.Tokenizer;
 import opennlp.tools.util.Span;
 
 import org.apache.stanbol.commons.opennlp.PosTagsCollectionEnum;
@@ -91,11 +92,13 @@ public class OpenNlpAnalysedContentFacto
         private final String language;
         private final Iterator<AnalysedText> sentences;
         private final Set<String> posTags;
+        private final Tokenizer tokenizer;
 
         private OpenNlpAnalysedContent(String text, String lang){
             this.language = lang;
             this.sentences = textAnalyzer.analyse(text, lang);
             this.posTags = PosTagsCollectionEnum.getPosTagCollection(lang, 
PosTypeCollectionType.NOUN);
+            this.tokenizer = textAnalyzer.getTokenizer(lang);
         }
         
         /**
@@ -131,7 +134,7 @@ public class OpenNlpAnalysedContentFacto
         }
         @Override
         public String[] tokenize(String label) {
-            return textAnalyzer.getTokenizer(language).tokenize(label);
+            return tokenizer.tokenize(label);
         }
     }
 }

svn commit: r1175851 - in /incubator/stanbol/trunk: commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/ data/opennlp/lang/da/ data/opennlp/lang/de/ data/opennlp/lang/nl/ data/opennlp/lang/pt/ data/opennlp/ner/es/ data/opennlp/ner/nl/ enha...

Reply via email to