Author: rwesten
Date: Fri Sep 30 08:21:35 2011
New Revision: 1177554
URL: http://svn.apache.org/viewvc?rev=1177554&view=rev
Log:
Implemented a new Label Matching algorithm for Keyword Extraction:
Now the matching of Labels with the Text is fully based on Tokens. The Tokens
following the current position in the Text are matched with the Tokens of the
Label.
* Matching of Tokens do no longer use equals, but checks the common
prefix/suffix. If the common part is > 0.75 than the Token is marked as matched.
* Matching does no longer enforce the correct order. However matches in the
wrong oder still use exact matches and get a core lowerd to 0.75. This ensures
that a text mentioning "Berlusconi, Silvio" is linked with the entity "Silvio
Berlusconi". However it also means that a text mentioning the person "Jack
Black" also matches "Black Jack". However Black Jack will only get a confidence
of about 0.5.
* The score algorithm does no longer use the number of matched tokens. Now it
uses the sum of the Token matches. So if a label matches 3 Tokens in the text
however the 2nd tokens matches only 9 out of 10 chars and the 3rd token is in
the wrong oder the sum of the token matches would be 1+0.9+0.75= 2,65 instead
of 3.
Modified:
incubator/stanbol/trunk/data/opennlp/lang/sv/ (props changed)
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/Suggestion.java
Propchange: incubator/stanbol/trunk/data/opennlp/lang/sv/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Fri Sep 30 08:21:35 2011
@@ -0,0 +1 @@
+target
Modified:
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java?rev=1177554&r1=1177553&r2=1177554&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java
(original)
+++
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java
Fri Sep 30 08:21:35 2011
@@ -7,6 +7,7 @@ import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
+import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
@@ -14,6 +15,7 @@ import java.util.Set;
import opennlp.tools.util.Span;
import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.commons.lang.StringUtils;
import org.apache.stanbol.commons.opennlp.TextAnalyzer.AnalysedText.Token;
import
org.apache.stanbol.enhancer.engines.keywordextraction.impl.ProcessingState;
import
org.apache.stanbol.enhancer.engines.keywordextraction.linking.EntityLinkerConfig.RedirectProcessingMode;
@@ -101,12 +103,13 @@ public class EntityLinker {
suggestion.getMatchCount() <
config.getMinFoundTokens()){
it.remove();
} else { //calculate the score
+ double suggestionMatchScore =
suggestion.getMatchCount()*suggestion.getMatchScore();
//how good is the current match in relation to the
best one
- double spanScore =
((double)suggestion.getMatchCount())/bestMatchCount;
+ double spanScore =
suggestionMatchScore/bestMatchCount;
//how good is the match to the span selected by
this suggestion
- double textScore =
((double)suggestion.getMatchCount())/suggestion.getSpan();
+ double textScore =
suggestionMatchScore/suggestion.getSpan();
//how good is the match in relation to the tokens
of the suggested label
- double labelScore =
((double)suggestion.getMatchCount()/suggestion.getLabelTokenCount());
+ double labelScore =
suggestionMatchScore/suggestion.getLabelTokenCount();
suggestion.setScore(spanScore*spanScore*textScore*labelScore);
}
}
@@ -289,89 +292,193 @@ public class EntityLinker {
// config.getDefaultLanguage()); //and the default language
Iterator<Text> labels = rep.getText(config.getNameField());
Suggestion match = new Suggestion(rep);
+ Collection<Text> defaultLabels = new ArrayList<Text>();
+ boolean matchedCurLangLabel = false;
while(labels.hasNext()){
Text label = labels.next();
String lang = label.getLanguage();
- //check the language of the current label
- //NOTE: Stirng.startWith is used to match'en-GB' with 'en'
- if((lang == null && ( //if lang is null
- defLang == null || //default lang is null
- curLang == null)) //or current lang is null
- || (lang != null && ( //if lang is not null
- //NOTE: starsWith does not like parsing NULL
- curLang != null && lang.startsWith(curLang) ||
//match with default
- defLang != null && lang.startsWith(defLang)) //or
match with current
- ) //end or
- ){ //end if
- String text = label.getText().toLowerCase();
- List<String> labelTokens =
Arrays.asList(content.tokenize(text));
- int foundTokens = 0;
- //ensure the correct order of the tokens in the suggested
entity
- int foundInLabelIndex = 0;
- boolean search = true;
- int lastFoundIndex = -1;
- Token currentToken;
- int maxNotFound = 1; //TODO make configureable
- int notFound = 0;
- for(int currentIndex = state.getTokenIndex();currentIndex <
state.getSentence().getTokens().size() && search;currentIndex++){
- currentToken =
state.getSentence().getTokens().get(currentIndex);
- boolean isProcessable = isProcessableToken(currentToken);
- int found =
text.indexOf(currentToken.getText().toLowerCase());
- if(found>=foundInLabelIndex){ //found
- if(isProcessable){
- foundTokens++; //only count processable Tokens
- }
- //TODO: maybe move this also in the "isProcessable" ...
- foundInLabelIndex =
found+currentToken.getText().length();
- lastFoundIndex = currentIndex;
- } else { //not found
- notFound++;
- if(isProcessable || notFound > maxNotFound){
- //stop as soon as a token that needs to be
processed is
- //not found in the label or the maximum number of
tokens
- //that are not processable are not found
- search = false;
- }
- } //else it is OK if non processable tokens are not found
- }
- MATCH labelMatch;
- int coveredTokens = lastFoundIndex-state.getTokenIndex()+1;
- //Matching rules
- // - if less than config#minTokenFound() than accept only EXACT
- // - override PARTIAL matches with FULL/EXACT matches only if
- // foundTokens of the PARTIAL match is > than of the
FULL/EXACT
- // match (this will be very rare
- if(foundTokens > 0 && match.getMatchCount() <= foundTokens) {
- String currentText = state.getTokenText(coveredTokens);
- if(currentText.equalsIgnoreCase(label.getText())){
- labelMatch = MATCH.EXACT;
- //set found to covered: May be lower because only
- //processable tokens are counted, but Exact also checks
- //of non-processable!
- foundTokens = coveredTokens;
- } else if(foundTokens >= config.getMinFoundTokens()){
- if(foundTokens == coveredTokens){
- labelMatch = MATCH.FULL;
- } else {
- labelMatch = MATCH.PARTIAL;
- }
- } else {
- labelMatch = MATCH.NONE;
- }
- if(labelMatch != MATCH.NONE){
- if(match.getMatchCount() < foundTokens ||
- match.getMatchCount() < foundTokens &&
- labelMatch.ordinal() >
match.getMatch().ordinal()){
- match.updateMatch(labelMatch, coveredTokens,
foundTokens,label,labelTokens.size());
- } //else this match is not better as the existing one
- } //else ignore labels with MATCH.NONE
- } //else NO tokens found -> nothing to do
- } // else worng language
+ if((lang == null && curLang == null) ||
+ (lang != null && curLang != null &&
lang.startsWith(curLang))){
+ matchLabel(match, label);
+ matchedCurLangLabel = true;
+ } else if((lang ==null && defLang == null) ||
+ (lang != null && defLang != null &&
lang.startsWith(defLang))){
+ defaultLabels.add(label);
+ }
+ }
+ //use only labels in the default language if there is
+ // * no label in the current language or
+ if(!matchedCurLangLabel){// || match.getMatch() == MATCH.NONE){
+ for(Text defaultLangLabel : defaultLabels){
+ matchLabel(match, defaultLangLabel);
+ }
}
return match;
}
/**
+ * The default value for the maximum number or non-processable tokens
+ * allowed to be not matching with a label of an entity before the matching
+ * is stopped.
+ */
+ private static int DEFAULT_MAX_NOT_FOUND = 1;
+ /**
+ * The value for the maximum number or non-processable tokens
+ * allowed to be not matching with a label of an entity before the matching
+ * is stopped.
+ * TODO: make configurable!
+ */
+ private int maxNotFound = DEFAULT_MAX_NOT_FOUND;
+ /**
+ * @param match
+ * @param label
+ */
+ private void matchLabel(Suggestion match, Text label) {
+ String text = label.getText().toLowerCase();
+ String[] labelTokens = content.tokenize(text);
+ Set<String> labelTokenSet = new HashSet<String>(
+ Arrays.asList(labelTokens));
+ int foundTokens = 0;
+ float foundTokenMatch = 0;
+ //ensure the correct order of the tokens in the suggested entity
+ boolean search = true;
+ int lastFoundIndex = -1;
+ int lastfoundLabelIndex = -1;
+ Token currentToken;
+ String currentTokenText;
+ int currentTokenLength;
+ int notFound = 0;
+ //search for matches within the correct order
+ for(int currentIndex = state.getTokenIndex();currentIndex <
state.getSentence().getTokens().size() && search;currentIndex++){
+ currentToken = state.getSentence().getTokens().get(currentIndex);
+ currentTokenText = currentToken.getText().toLowerCase();
+ currentTokenLength = currentTokenText.length();
+ boolean isProcessable = isProcessableToken(currentToken);
+ boolean found = false;
+ float matchFactor = 0f;
+ //iteration starts at the next token after the last matched one
+ //so it is OK to skip tokens in the label, but not within the text
+ for(int i = lastfoundLabelIndex+1;!found && i <
labelTokens.length;i ++){
+ String labelTokenText = labelTokens[i];
+ int labelTokenLength = labelTokenText.length();
+ float maxLength = currentTokenLength > labelTokenLength ?
currentTokenLength : labelTokenLength;
+ float lengthDif = Math.abs(currentTokenLength -
labelTokenLength);
+ if((lengthDif/maxLength)<=0.3f){ //this prevents unnecessary
string comparison
+ int matchCount = compairTokens(currentTokenText,
labelTokens[i]);
+ if(matchCount/maxLength >= 0.7f){
+ lastfoundLabelIndex = i; //set the last found index to
the current position
+ found = true; //set found to true -> stops iteration
+ matchFactor = matchCount/maxLength; //how good is the
match
+ //remove matched labels from the set to disable them
for
+ //a later random oder search
+ labelTokenSet.remove(labelTokenText);
+ }
+ }
+ }
+ if(!found){
+ //search for a match in the wrong order
+ //currently only exact matches (for testing)
+ if(found = labelTokenSet.remove(currentTokenText)){
+ matchFactor = 0.7f;
+ }
+ }
+ //int found = text.indexOf(currentToken.getText().toLowerCase());
+ if(found){ //found
+ if(isProcessable){
+ foundTokens++; //only count processable Tokens
+ foundTokenMatch = foundTokenMatch + matchFactor; //sum up
the matches
+ }
+ lastFoundIndex = currentIndex;
+ } else { //not found
+ notFound++;
+ if(isProcessable || notFound > maxNotFound){
+ //stop as soon as a token that needs to be processed is
+ //not found in the label or the maximum number of tokens
+ //that are not processable are not found
+ search = false;
+ }
+ }
+ }
+ //Now we make a second round to search tokens that match in the wrong
order
+ //e.g. if given and family name of persons are switched
+ MATCH labelMatch;
+ int coveredTokens = lastFoundIndex-state.getTokenIndex()+1;
+ float labelMatchScore = (foundTokenMatch/(float)labelTokens.length);
+ //Matching rules
+ // - if less than config#minTokenFound() than accept only EXACT
+ // - override PARTIAL matches with FULL/EXACT matches only if
+ // foundTokens of the PARTIAL match is > than of the FULL/EXACT
+ // match (this will be very rare
+ if(foundTokens > 0 && match.getMatchCount() <= foundTokens) {
+ String currentText = state.getTokenText(coveredTokens);
+ if(currentText.equalsIgnoreCase(label.getText())){
+ labelMatch = MATCH.EXACT;
+ //set found to covered: May be lower because only
+ //processable tokens are counted, but Exact also checks
+ //of non-processable!
+ foundTokens = coveredTokens;
+ } else if(foundTokens >= config.getMinFoundTokens() &&
+ labelMatchScore >= 0.6f){
+ if(foundTokens == coveredTokens){
+ labelMatch = MATCH.FULL;
+ } else {
+ labelMatch = MATCH.PARTIAL;
+ }
+ } else {
+ labelMatch = MATCH.NONE;
+ }
+ if(labelMatch != MATCH.NONE){
+ if(match.getMatchCount() < foundTokens ||
+ match.getMatchCount() < foundTokens &&
+ labelMatch.ordinal() > match.getMatch().ordinal()){
+ match.updateMatch(labelMatch, coveredTokens, foundTokens,
+ foundTokenMatch/foundTokens,label,labelTokens.length);
+ } //else this match is not better as the existing one
+ } //else ignore labels with MATCH.NONE
+ } //else NO tokens found -> nothing to do
+ }
+ /**
+ * Compares to token with each other and returns the longest match. The
+ * tokens are compared from the beginning and from the end.
+ * @param token1 the first token
+ * @param token2 the second token
+ * @return the number of matching chars
+ */
+ private int compairTokens(String token1,String token2){
+ int l1 = token1.length(); //length of the first token
+ int l2 = token2.length(); //length of the second token
+ //in case of same length check for equals first
+ if(l1 == l2 && token1.equals(token2)){
+ return l1;
+ }
+ int ml = l1>l2?l2:l1; //minimum length of a token
+ if(ml == 0){
+ return ml;
+ }
+ int f = 0; //forward match count + 1
+ int b = 0; //backward match count + 1
+ boolean match = true; //still matches
+ while(match && f < ml){
+ match = token1.charAt(f) == token2.charAt(f);
+ f++;
+ }
+ if(!match){
+ f--;
+ }
+ if(f < ml){
+ match = true;
+ while(match && b < ml){
+ b++;
+ match = token1.charAt(l1-b) == token2.charAt(l2-b);
+ }
+ if(!match){
+ b--;
+ }
+ }
+ return f > b ? f : b;
+ }
+
+ /**
* Checks if the current token of {@link #state} is processable.
* @param token the {@link Token} to check.
* @return <code>true</code> if the parsed token needs to be processed.
Modified:
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/Suggestion.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/Suggestion.java?rev=1177554&r1=1177553&r2=1177554&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/Suggestion.java
(original)
+++
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/Suggestion.java
Fri Sep 30 08:21:35 2011
@@ -32,6 +32,12 @@ public class Suggestion implements Compa
private boolean redirectProcessed;
private double score;
+ /**
+ * The score of the matches (e.g. when a match is based on stemming or some
+ * oder kind of fuzziness, than matchers might assign a match score than
+ * 1.0.
+ */
+ private float matchScore;
public static enum MATCH {
/**
* No match (to less tokens, wrong oder ...)
@@ -65,16 +71,21 @@ public class Suggestion implements Compa
* @param match the math type
* @param span the number of token this suggestion spans
* @param count the number of token that match with the suggestion within
the span
+ * @param matchScore the score of the match. MUST BE in the range between
+ * <code>[0..1]</code>. For {@link MATCH#EXACT} and {@link MATCH#NONE} this
+ * parameter is ignored and the value is set to <code>1</code>,
<code>0</code>
+ * respectively.
* @param label the label that matches the tokens
* @param labelTokenCount the number of tokens of the label
*/
- protected void updateMatch(MATCH match,int span,int count,Text label,int
labelTokenCount){
+ protected void updateMatch(MATCH match,int span,int count,float
matchScore,Text label,int labelTokenCount){
this.match = match;
//check the validity of the parameters to avoid later errors that are
//than hard to debug
if(match == MATCH.NONE){
this.span = 0;
this.matchCount = 0;
+ this.matchScore = 0f;
this.label = null;
} else {
if(span < 1 || count < 1){
@@ -91,9 +102,19 @@ public class Suggestion implements Compa
}
}
this.span = span;
- this.matchCount = count;
this.label = label;
- this.labelTokenCount = labelTokenCount;
+ if(match == MATCH.EXACT){ //for exact matches the matchScore needs to
be
+ this.matchScore = 1f; // ignored and set to 1.0f
+ this.matchCount = span; //and the match count needs to be equals
to the span
+ this.labelTokenCount = span;
+ } else {
+ if(matchScore > 1f){
+ throw new IllegalArgumentException("The matchScore MUST NOT be
greater than one (parsed value = "+matchScore+")");
+ }
+ this.matchScore = matchScore;
+ this.matchCount = count;
+ this.labelTokenCount = labelTokenCount;
+ }
}
/**
* Getter for the number of Tokens of the label. Usually needed to
calculate
@@ -119,6 +140,20 @@ public class Suggestion implements Compa
return match;
}
/**
+ * Getter for the matching score. This is a modifier in the range
+ * between [0..1] that tells about the quality of the matches for the
+ * {@link #getMatchCount() matched} tokens. <p>
+ * As an example if a match is based on stemming a word a label matcher
+ * implementation might want to assign a matching score below
<code>1</code>.
+ * Score calculations that use the {@link #getMatchCount()} should use
+ * <code>{@link #getMatchCount()} * {@link #getMatchScore()}</code> as a
+ * bases.
+ * @return the matchScore
+ */
+ public final float getMatchScore() {
+ return matchScore;
+ }
+ /**
* Getter for the number of the token matched by this suggestion
* @return The number of the token matched by this suggestion
*/