Author: rwesten
Date: Thu May 9 15:08:33 2013
New Revision: 1480676
URL: http://svn.apache.org/r1480676
Log:
Fixes for two issues related to STANBOL-1049; fix for STANBOL-1063 and
implementation of STANBOL-1064. In addition, this changes the
IllegalArgumentException (thrown by LabelMatch when the span is smaller
than the number of matched tokens) mentioned in
http://markmail.org/message/acv7xkg2festbpjk to WARN-level logging.
Modified:
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/LabelMatch.java
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java
Modified:
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java?rev=1480676&r1=1480675&r2=1480676&view=diff
==============================================================================
---
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java
(original)
+++
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java
Thu May 9 15:08:33 2013
@@ -135,7 +135,7 @@ public class LanguageProcessingConfig im
/**
* The minimum confidence that a POS annotation
*/
- private double minExcludePosAnnotationProbability =
DEFAULT_MIN_EXCLUDE_POS_ANNOTATION_PROBABILITY/2;
+ private double minExcludePosAnnotationProbability =
DEFAULT_MIN_EXCLUDE_POS_ANNOTATION_PROBABILITY;
private boolean ignoreChunksState = DEFAULT_IGNORE_CHUNK_STATE;
Modified:
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java?rev=1480676&r1=1480675&r2=1480676&view=diff
==============================================================================
---
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java
(original)
+++
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java
Thu May 9 15:08:33 2013
@@ -93,13 +93,10 @@ public class EntityLinker {
while(state.next()) {
TokenData token = state.getToken();
if(log.isDebugEnabled()){
- log.debug("--- preocess Token {}: {} (lemma: {} | pos:{})
chunk: {}",
- new Object[]{token.index,token.token.getSpan(),
- token.morpho != null ?
token.morpho.getLemma() : "none",
- token.token.getAnnotations(POS_ANNOTATION),
- token.inChunk != null ?
- (token.inChunk.chunk + " "+
token.inChunk.chunk.getSpan()) :
- "none"});
+ log.debug("--- preocess Token {}: {} (lemma: {}) linkable={},
matchable={} | chunk: {}",
+ new
Object[]{token.index,token.getTokenText(),token.getTokenLemma(),
+ token.isLinkable, token.isMatchable, token.inChunk !=
null ?
+ (token.inChunk.chunk + " "+
token.inChunk.chunk.getSpan()) : "none"});
}
List<String> searchStrings = new
ArrayList<String>(linkerConfig.getMaxSearchTokens());
String searchString = linkerConfig.isLemmaMatching() ?
token.getTokenLemma() :
@@ -134,11 +131,10 @@ public class EntityLinker {
if(minIncludeIndex <= prevIndex){
TokenData prevToken = state.getTokens().get(prevIndex);
if(log.isDebugEnabled()){
- log.debug(" {} {}:'{}' (lemma: {} | pos:{})",new
Object[]{
+ log.debug(" {} {}:'{}' (lemma: {}) linkable={},
matchable={}",new Object[]{
prevToken.isMatchable? '+':'-',prevToken.index,
- prevToken.token.getSpan(),
- prevToken.morpho != null ?
prevToken.morpho.getLemma() : "none",
- prevToken.token.getAnnotations(POS_ANNOTATION)
+ prevToken.getTokenText(),
prevToken.getTokenLemma(),
+ prevToken.isLinkable, prevToken.isMatchable
});
}
if(prevToken.isMatchable){
@@ -153,11 +149,10 @@ public class EntityLinker {
if(maxIndcludeIndex >= pastIndex){
TokenData pastToken = state.getTokens().get(pastIndex);
if(log.isDebugEnabled()){
- log.debug(" {} {}:'{}' (lemma: {} | pos:{})",new
Object[]{
- pastToken.isMatchable? '+':'-',pastToken.index,
- pastToken.token.getSpan(),
- pastToken.morpho != null ?
pastToken.morpho.getLemma() : "none",
- pastToken.token.getAnnotations(POS_ANNOTATION)
+ log.debug(" {} {}:'{}' (lemma: {}) linkable={},
matchable={}",new Object[]{
+ pastToken.isMatchable? '+':'-',pastToken.index,
+ pastToken.getTokenText(),
pastToken.getTokenLemma(),
+ pastToken.isLinkable, pastToken.isMatchable
});
}
if(pastToken.isMatchable){
Modified:
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/LabelMatch.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/LabelMatch.java?rev=1480676&r1=1480675&r2=1480676&view=diff
==============================================================================
---
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/LabelMatch.java
(original)
+++
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/LabelMatch.java
Thu May 9 15:08:33 2013
@@ -20,8 +20,12 @@ import java.util.Comparator;
import org.apache.clerezza.rdf.core.PlainLiteral;
import org.apache.stanbol.enhancer.engines.entitylinking.impl.Suggestion.MATCH;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
public class LabelMatch {
+
+ private final Logger log = LoggerFactory.getLogger(LabelMatch.class);
/**
* To be used in case no match is present
*/
@@ -89,12 +93,14 @@ public class LabelMatch {
score = textScore*labelScore;
if(span < processableMatchCount){
throw new IllegalArgumentException("The span '" + span
- + "' MUST BE >= the number of matched processable tokens'"
+ + "' MUST BE >= then number of matched processable tokens'"
+ processableMatchCount+"': "+toString()+"!");
}
if(span < matchCount){
- throw new IllegalArgumentException("The span '" + span
- + "' MUST BE >= the number of matched tokens '"+matchCount+"':
"+toString()+"!");
+ log.warn("The span '{}' MUST BE >= then number of matched tokens
'{}"
+ + "': {}! Set span to {}.", new Object[]{
+ span, matchCount, toString(), matchCount});
+ span = matchCount;
}
if(processableMatchCount > matchCount){
throw new IllegalArgumentException("The number of matched
processable tokens '"
Modified:
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java?rev=1480676&r1=1480675&r2=1480676&view=diff
==============================================================================
---
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java
(original)
+++
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java
Thu May 9 15:08:33 2013
@@ -271,11 +271,11 @@ public class ProcessingState {
} else if(span.getType() == SpanTypeEnum.Token){
TokenData tokenData = new
TokenData(tokens.size(),(Token)span,activeChunk);
if(log.isDebugEnabled()){
- log.debug(" > Token {}: {} (pos:{}) chunk: '{}' |Â
morpho: {}",
- new Object[]{tokenData.index,tokenData.token,
-
tokenData.token.getAnnotations(POS_ANNOTATION),
- tokenData.inChunk != null ?
tokenData.inChunk.chunk.getSpan() : "none",
- tokenData.morpho != null ?
tokenData.morpho : "none"});
+ log.debug(" > {}: {} {}(pos:{}) chunk: '{}'",
+ new Object[]{tokenData.index,tokenData.token,
+ tokenData.morpho != null ? ("(lemma:
"+tokenData.morpho.getLemma()+") ") : "",
+ tokenData.token.getAnnotations(POS_ANNOTATION),
+ tokenData.inChunk != null ?
tokenData.inChunk.chunk.getSpan() : "none"});
}
if(!tokenData.hasAlphaNumeric){
tokenData.isLinkable = false;
@@ -296,6 +296,7 @@ public class ProcessingState {
if(tpc.isLinkUpperCaseTokens()){
if(tokenData.isMatchable) { //convert
matchable to
tokenData.isLinkable = true; //linkable
+ tokenData.isMatchable = true;
} else { // and other tokens to
tokenData.isMatchable = true; //matchable
}
@@ -309,36 +310,43 @@ public class ProcessingState {
} //else not an upper case token
//(3) Unknown POS tag Rules (see STANBOL-1049)
- if(!tokenData.isLinkable && tokenData.isLinkablePos ==
null &&
- tokenData.isLinkablePos == null){
+ if(!tokenData.isLinkable && (tokenData.isLinkablePos
== null ||
+ tokenData.isMatchablePos == null)){
if(isUnicaseLanguage ||
!tpc.isLinkOnlyUpperCaseTokensWithUnknownPos()){
- if(tokenData.hasSearchableLength){
+ if(tokenData.isLinkablePos == null &&
tokenData.hasSearchableLength){
tokenData.isLinkable = true;
+ tokenData.isMatchable = true;
} //else no need to change the state
} else { //non unicase language and link only
upper case tokens enabled
if(tokenData.upperCase && // upper case token
tokenData.index > 0 && //not a
sentence or sub-sentence start
!tokens.get(tokenData.index-1).isSubSentenceStart){
- if(tokenData.hasSearchableLength){
+ if(tokenData.hasSearchableLength &&
tokenData.isLinkablePos == null){
tokenData.isLinkable = true;
- } else {
+ tokenData.isMatchable = true;
+ } else if(tokenData.isMatchablePos ==
null){
tokenData.isMatchable = true;
}
- } else if(tokenData.hasSearchableLength){
//lower case and long token
+ } else if(tokenData.hasSearchableLength &&
//lower case and long token
+ tokenData.isMatchablePos == null){
tokenData.isMatchable = true;
} //else lower case and short word
}
} //else already linkable or POS tag present
}
+ log.debug(" - {}",tokenData);
//add the token to the list
tokens.add(tokenData);
if(!foundLinkableToken){
foundLinkableToken = tokenData.isLinkable;
}
if(activeChunk != null){
- if(tokenData.isMatchable){
+ if (tokenData.isLinkable){
+ //ignore matchableCount in Chunks with linkable
Tokens
+ activeChunk.matchableCount = -10; //by setting the
count to -10
+ } else if(tokenData.isMatchable){
activeChunk.matchableCount++;
- }
+ }
if (span.getEnd() >= activeChunk.getEndChar()){
//this is the last token in the current chunk
activeChunk.endToken = tokens.size()-1;
@@ -705,7 +713,15 @@ public class ProcessingState {
public String getTokenLemma(){
return morpho != null ? morpho.getLemma() : null;
}
-
+ @Override
+ public String toString() {
+ return new StringBuilder("TokenData: '").append(getTokenText())
+
.append("'[linkable=").append(isLinkable).append("(linkabkePos=").append(isLinkablePos)
+ .append(")|
matchable=").append(isMatchable).append("(matchablePos=").append(isMatchablePos)
+ .append(")| alpha=").append(hasAlphaNumeric).append("|
seachLength=")
+ .append(hasSearchableLength).append("|
upperCase=").append(upperCase)
+ .append("]").toString();
+ }
}
/**
* Represents a Chunk (group of tokens) used as context for EntityLinking.