Author: rwesten
Date: Mon Sep 26 13:06:26 2011
New Revision: 1175851
URL: http://svn.apache.org/viewvc?rev=1175851&view=rev
Log:
Some Improvements for the KeywordLinkingEngine:
* Added Support for POS (part of speech) based keyword extraction for three
additional languages
* Portuguese (pt)
* Dutch (nl)
* Swedish (sv)
* Added "default language" config param: see description of this param for
details
* The OpenNLP based AnalysedContent implementation now caches the Tokenizer to
avoid multiple calls to getTokenizer(..)
* Corrected a bug in the matching of languages within the EntityLinker
Modified:
incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java
incubator/stanbol/trunk/data/opennlp/lang/da/ (props changed)
incubator/stanbol/trunk/data/opennlp/lang/de/ (props changed)
incubator/stanbol/trunk/data/opennlp/lang/nl/ (props changed)
incubator/stanbol/trunk/data/opennlp/lang/pt/ (props changed)
incubator/stanbol/trunk/data/opennlp/ner/es/ (props changed)
incubator/stanbol/trunk/data/opennlp/ner/nl/ (props changed)
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/ProcessingState.java
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinkerConfig.java
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/OpenNlpAnalysedContentFactory.java
Modified:
incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java?rev=1175851&r1=1175850&r2=1175851&view=diff
==============================================================================
---
incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java
(original)
+++
incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java
Mon Sep 26 13:06:26 2011
@@ -90,8 +90,74 @@ public enum PosTagsCollectionEnum {
* </ul>
*/
DA_FOLLOW("da",PosTypeCollectionType.FOLLOW,"XP","XA","SP","CS","CC","U"),
-
- ;
+ /**
+ * POS types for Nouns based on the
+ * <a
href="http://beta.visl.sdu.dk/visl/pt/symbolset-floresta.html">PALAVRAS tag
set</a>
+ * for Portuguese.<p>
+ * TODO: Someone who speaks this language should check this List<p>
+ * NOTES: Currently this includes nouns, proper nouns and numbers.
+ */
+ PT_NOUN("pt",PosTypeCollectionType.NOUN,"n","num","prop"),
+ /**
+ * POS types for Verbs based on the
+ * <a
href="http://beta.visl.sdu.dk/visl/pt/symbolset-floresta.html">PALAVRAS tag
set</a>
+ * for Portuguese.<p>
+ * TODO: Someone who speaks this language should check this List<p>
+ */
+ PT_VERB("pt",PosTypeCollectionType.VERB,"v-pcp","v-fin","v-inf","v-ger"),
+ /**
+ * POS types followed to build Chunks based on the
+ * <a
href="http://beta.visl.sdu.dk/visl/pt/symbolset-floresta.html">PALAVRAS tag
set</a>
+ * for Portuguese.<p>
+ * TODO: Someone who speaks this language should check this List<p>
+ * NOTES: Currently this includes punctuations and prepositions.
+ */
+ PT_FOLLOW("pt",PosTypeCollectionType.FOLLOW,"punc", "prp"),
+ /**
+ * POS types for Nouns based on the WOTAN tagset for Dutch (as used with
+ * Mbt).<p>
+ * TODO: Someone who speaks this language should check this List<p>
+ * NOTES: This includes now Nouns, Numbers and "others".
+ */
+ NL_NOUN("nl",PosTypeCollectionType.NOUN,"N","Num","Misc"),
+ /**
+ * POS types for Verbs based on the WOTAN tagset for Dutch (as used with
+ * Mbt).<p>
+ * The tagger does not distinguish the different forms of verbs. Therefore
+ * it is enough to include "V"
+ */
+ NL_VERB("nl",PosTypeCollectionType.VERB,"V"),
+ /**
+ * POS types followed to build Chunks based on the WOTAN tagset for Dutch
+ * (as used with Mbt).<p>
+ * NOTES: This includes only prepositions and punctuations
+ *
+ */
+ NL_FOLLOW("nl",PosTypeCollectionType.FOLLOW,"Punc","Prep"),
+ /**
+ * POS types for Nouns for Swedish language based on
+ * <a href="http://w3.msi.vxu.se/users/nivre/research/MAMBAlex.html">
+ * Lexical categories in MAMBA</a>
+ * NOTE: <ul>
+ * <li> This includes all typical noun categories as defined by MAMBA
+ * <li> Unclassifiable part-of-speech and
+ * <li> Numerical ("RO" and "EN")
+ * </ul>
+ */
+
SV_NOUN("sv",PosTypeCollectionType.NOUN,"NN","PN","AN","MN","VN","XX","EN","RO"),
+ /**
+ * POS types for Verbs of the Swedish language based on the
+ * <a href="http://w3.msi.vxu.se/users/nivre/research/MAMBAlex.html">
+ * Lexical categories in MAMBA</a>
+ */
+
SV_VERB("sv",PosTypeCollectionType.VERB,"MV","AV","BV","FV","GV","HV","KV","QV","SV","VV","WV"),
+ /**
+ * POS types followed to build Chunks based on the TODO
+ * <p>
+ * NOTES: this includes prepositions, Part of idiom, Infinitive marker
+ * as well as all kinds of punctuations
+ */
+
SV_FOLLOW("sv",PosTypeCollectionType.FOLLOW,"PR","ID","IM","I?","IC","IG","IK","IP","IQ","IR","IS","IT","IU");
Set<String> tags;
private String language;
private PosTypeCollectionType type;
Propchange: incubator/stanbol/trunk/data/opennlp/lang/da/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Mon Sep 26 13:06:26 2011
@@ -0,0 +1 @@
+target
Propchange: incubator/stanbol/trunk/data/opennlp/lang/de/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Mon Sep 26 13:06:26 2011
@@ -0,0 +1 @@
+target
Propchange: incubator/stanbol/trunk/data/opennlp/lang/nl/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Mon Sep 26 13:06:26 2011
@@ -0,0 +1 @@
+target
Propchange: incubator/stanbol/trunk/data/opennlp/lang/pt/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Mon Sep 26 13:06:26 2011
@@ -0,0 +1 @@
+target
Propchange: incubator/stanbol/trunk/data/opennlp/ner/es/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Mon Sep 26 13:06:26 2011
@@ -0,0 +1 @@
+target
Propchange: incubator/stanbol/trunk/data/opennlp/ner/nl/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Mon Sep 26 13:06:26 2011
@@ -0,0 +1 @@
+target
Modified:
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java?rev=1175851&r1=1175850&r2=1175851&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java
(original)
+++
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java
Mon Sep 26 13:06:26 2011
@@ -89,7 +89,8 @@ import org.slf4j.LoggerFactory;
intValue=EntityLinkerConfig.DEFAULT_MIN_SEARCH_TOKEN_LENGTH),
@Property(name=KeywordLinkingEngine.MAX_SUGGESTIONS,
intValue=EntityLinkerConfig.DEFAULT_SUGGESTIONS),
- @Property(name=KeywordLinkingEngine.PROCESSED_LANGUAGES,value="")
+ @Property(name=KeywordLinkingEngine.PROCESSED_LANGUAGES,value=""),
+ @Property(name=KeywordLinkingEngine.DEFAULT_MATCHING_LANGUAGE,value="")
})
public class KeywordLinkingEngine implements EnhancementEngine,
ServiceProperties{
@@ -117,6 +118,7 @@ public class KeywordLinkingEngine implem
public static final String MAX_SUGGESTIONS =
"org.apache.stanbol.enhancer.engines.keywordextraction.maxSuggestions";
public static final String PROCESSED_LANGUAGES =
"org.apache.stanbol.enhancer.engines.keywordextraction.processedLanguages";
public static final String MIN_FOUND_TOKENS=
"org.apache.stanbol.enhancer.engines.keywordextraction.minFoundTokens";
+ public static final String DEFAULT_MATCHING_LANGUAGE =
"org.apache.stanbol.enhancer.engines.keywordextraction.defaultMatchingLanguage";
// public static final String SIMPLE_TOKENIZER =
"org.apache.stanbol.enhancer.engines.keywordextraction.simpleTokenizer";
// public static final String ENABLE_CHUNKER =
"org.apache.stanbol.enhancer.engines.keywordextraction.enableChunker";
/**
@@ -605,6 +607,19 @@ public class KeywordLinkingEngine implem
Arrays.toString(RedirectProcessingMode.values()));
}
}
+ //init the default language (DEFAULT_MATCHING_LANGUAGE)
+ value = configuration.get(DEFAULT_MATCHING_LANGUAGE);
+ if(value != null){
+ String defaultLang = value.toString().trim();
+ if(defaultLang.isEmpty()){
+ config.setDefaultLanguage(null);
+ } else if(defaultLang.length() == 1){
+ throw new ConfigurationException(DEFAULT_MATCHING_LANGUAGE,
"Illegal language code '"+
+ defaultLang+"'! Language Codes MUST BE at least 2 chars
long.");
+ } else {
+ config.setDefaultLanguage(defaultLang);
+ }
+ }
}
/**
Modified:
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/ProcessingState.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/ProcessingState.java?rev=1175851&r1=1175850&r2=1175851&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/ProcessingState.java
(original)
+++
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/ProcessingState.java
Mon Sep 26 13:06:26 2011
@@ -75,6 +75,14 @@ public class ProcessingState {
return token;
}
/**
+ * Getter for the language of the current Token (based on the current
+ * sentence)
+ * @return the language
+ */
+ public final String getLanguage() {
+ return sentence.getLanguage();
+ }
+ /**
* The currently active chunk or <code>null</code> if no chunks are
* available. If chunks are present this can not be <code>null</code>
* because {@link Token}s outside of chunks are skipped.
Modified:
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java?rev=1175851&r1=1175850&r2=1175851&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java
(original)
+++
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java
Mon Sep 26 13:06:26 2011
@@ -250,7 +250,7 @@ public class EntityLinker {
private List<Suggestion> lookupEntities(List<String> searchStrings) {
Collection<? extends Representation> results = entitySearcher.lookup(
config.getNameField(),config.getSelectedFields(),
- searchStrings, state.getSentence().getLanguage());
+ searchStrings,
state.getSentence().getLanguage(),config.getDefaultLanguage());
List<Suggestion> suggestions = new ArrayList<Suggestion>();
for(Representation result : results){
Suggestion match = matchLabels(result);
@@ -282,13 +282,27 @@ public class EntityLinker {
* @return The result of the matching.
*/
private Suggestion matchLabels(Representation rep) {
+ String curLang = state.getLanguage(); //language of the current
sentence
+ String defLang = config.getDefaultLanguage(); //configured default
language
+// Iterator<Text> labels = rep.get(config.getNameField(), //get all
labels
+// state.getLanguage(), //in the current language
+// config.getDefaultLanguage()); //and the default language
Iterator<Text> labels = rep.getText(config.getNameField());
Suggestion match = new Suggestion(rep);
while(labels.hasNext()){
Text label = labels.next();
- //NOTE: I use here startWith language because I want 'en-GB'
labels accepted for 'en'
- if(label.getLanguage() == null || label.getLanguage().startsWith(
- state.getSentence().getLanguage())){
+ String lang = label.getLanguage();
+ //check the language of the current label
+ //NOTE: String.startsWith is used to match 'en-GB' with 'en'
+ if((lang == null && ( //if lang is null
+ defLang == null || //default lang is null
+ curLang == null)) //or current lang is null
+ || (lang != null && ( //if lang is not null
+ //NOTE: startsWith does not like being passed NULL
+ curLang != null && lang.startsWith(curLang) ||
//match with current
+ defLang != null && lang.startsWith(defLang)) //or
match with default
+ ) //end or
+ ){ //end if
String text = label.getText().toLowerCase();
List<String> labelTokens =
Arrays.asList(content.tokenize(text));
int foundTokens = 0;
@@ -307,6 +321,7 @@ public class EntityLinker {
if(isProcessable){
foundTokens++; //only count processable Tokens
}
+ //TODO: maybe move this also in the "isProcessable" ...
foundInLabelIndex =
found+currentToken.getText().length();
lastFoundIndex = currentIndex;
} else { //not found
Modified:
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinkerConfig.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinkerConfig.java?rev=1175851&r1=1175850&r2=1175851&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinkerConfig.java
(original)
+++
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinkerConfig.java
Mon Sep 26 13:06:26 2011
@@ -66,6 +66,12 @@ public class EntityLinkerConfig {
*/
public static final String DEFAULT_REDIRECT_FIELD = "rdfs:seeAlso";
/**
+ * The default language used to search for labels regardless of the
language
+ * of the text. The default value is <code>null</code>, which causes
+ * labels that do not have a language assigned to be included.
+ */
+ public static final String DEFAULT_LANGUAGE = null;
+ /**
* Default mapping for Concept types to dc:type values added for
* TextAnnotations.
*/
@@ -158,6 +164,11 @@ public class EntityLinkerConfig {
private String typeField;
private Set<String> selectedFields = new HashSet<String>();
/**
+ * The language always included in searches (regardless of the language
+ * detected for the text).
+ */
+ private String defaultLanguage = DEFAULT_LANGUAGE;
+ /**
* Default constructor that initialises the configuration with the
* default values
*/
@@ -396,4 +407,23 @@ public class EntityLinkerConfig {
public Map<String,UriRef> getTypeMappings() {
return unmodTypeMappings;
}
+ /**
+ * Setter for the language of labels searched in addition to the current
+ * language of the text. Setting this to <code>null</code> (also the
default)
+ * will cause labels without any defined language to be searched.<p>
+ * Changing this makes only sense if a dataset (such as dbpedia.org) adds
+ * language tags to labels even if they are typically used in any language.
+ * @param defaultLanguage the default language
+ */
+ public void setDefaultLanguage(String defaultLanguage) {
+ this.defaultLanguage = defaultLanguage;
+ }
+ /**
+ * Getter for the language of labels searched in addition to the current
+ * language of the text.
+ * @return the default language
+ */
+ public String getDefaultLanguage() {
+ return defaultLanguage;
+ }
}
\ No newline at end of file
Modified:
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/OpenNlpAnalysedContentFactory.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/OpenNlpAnalysedContentFactory.java?rev=1175851&r1=1175850&r2=1175851&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/OpenNlpAnalysedContentFactory.java
(original)
+++
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/OpenNlpAnalysedContentFactory.java
Mon Sep 26 13:06:26 2011
@@ -6,6 +6,7 @@ import java.util.Iterator;
import java.util.Map;
import java.util.Set;
+import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.util.Span;
import org.apache.stanbol.commons.opennlp.PosTagsCollectionEnum;
@@ -91,11 +92,13 @@ public class OpenNlpAnalysedContentFacto
private final String language;
private final Iterator<AnalysedText> sentences;
private final Set<String> posTags;
+ private final Tokenizer tokenizer;
private OpenNlpAnalysedContent(String text, String lang){
this.language = lang;
this.sentences = textAnalyzer.analyse(text, lang);
this.posTags = PosTagsCollectionEnum.getPosTagCollection(lang,
PosTypeCollectionType.NOUN);
+ this.tokenizer = textAnalyzer.getTokenizer(lang);
}
/**
@@ -131,7 +134,7 @@ public class OpenNlpAnalysedContentFacto
}
@Override
public String[] tokenize(String label) {
- return textAnalyzer.getTokenizer(language).tokenize(label);
+ return tokenizer.tokenize(label);
}
}
}