Author: rwesten
Date: Tue Nov 19 10:55:24 2013
New Revision: 1543373
URL: http://svn.apache.org/r1543373
Log:
implementation of STANBOL-1211 for the lucene FST linking engine
Modified:
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java
Modified:
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java?rev=1543373&r1=1543372&r2=1543373&view=diff
==============================================================================
---
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java
(original)
+++
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java
Tue Nov 19 10:55:24 2013
@@ -269,9 +269,14 @@ public class FstLinkingEngine implements
double length = Math.max(alength,
matchLabel.getLexicalForm().length());
match.setMatch(1d -
((double)distance/length),matchLabel);
}
- log.trace(" ... add suggestion: label: '{}'; conf: {}",
+ if(match.getScore() >= elConfig.getMinMatchScore()){
+ log.trace(" ... add suggestion: label: '{}'; conf:
{}",
matchLabel, match.getScore());
- suggestions.add(match);
+ suggestions.add(match);
+ } else {
+ log.trace(" ... filtered because match score < {}",
+ elConfig.getMinMatchScore());
+ }
} else { //the type of the current Entity is blacklisted
log.trace(" ... filtered because of entity types");
}
@@ -356,7 +361,8 @@ public class FstLinkingEngine implements
TokenStream baseTokenStream =
corpus.getTaggingAnalyzer().tokenStream("",
new CharSequenceReader(at.getText()));
LinkableTokenFilter linkableTokenFilter = new
LinkableTokenFilter(baseTokenStream,
- at, session.getLanguage(),
tpConfig.getConfiguration(session.getLanguage()));
+ at, session.getLanguage(),
tpConfig.getConfiguration(session.getLanguage()),
+ elConfig.getMinChunkMatchScore());
//we use two TagClusterReducer implementations.
// (1) the linkableTokenFilter filters all tags that do not overlap any
// linkable Token
Modified:
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java?rev=1543373&r1=1543372&r2=1543373&view=diff
==============================================================================
---
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java
(original)
+++
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java
Tue Nov 19 10:55:24 2013
@@ -55,6 +55,7 @@ import org.apache.felix.scr.annotations.
import org.apache.felix.scr.annotations.Property;
import org.apache.felix.scr.annotations.PropertyOption;
import org.apache.felix.scr.annotations.Reference;
+import org.apache.felix.scr.annotations.ReferenceCardinality;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.FieldInfo;
@@ -222,7 +223,7 @@ public class FstLinkingEngineComponent {
/**
* used to resolve '{prefix}:{local-name}' used within the engines
configuration
*/
- @Reference
+ @Reference(cardinality=ReferenceCardinality.OPTIONAL_UNARY)
protected NamespacePrefixService prefixService;
/**
Modified:
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java?rev=1543373&r1=1543372&r2=1543373&view=diff
==============================================================================
---
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java
(original)
+++
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java
Tue Nov 19 10:55:24 2013
@@ -36,10 +36,12 @@ import org.apache.lucene.analysis.tokena
import
org.apache.stanbol.enhancer.engines.entitylinking.config.LanguageProcessingConfig;
import
org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig;
import
org.apache.stanbol.enhancer.engines.entitylinking.engine.EntityLinkingEngine;
+import org.apache.stanbol.enhancer.engines.entitylinking.impl.ChunkData;
import org.apache.stanbol.enhancer.engines.entitylinking.impl.ProcessingState;
import org.apache.stanbol.enhancer.engines.entitylinking.impl.SectionData;
import org.apache.stanbol.enhancer.engines.entitylinking.impl.TokenData;
import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
+import org.apache.stanbol.enhancer.nlp.model.Chunk;
import org.apache.stanbol.enhancer.nlp.model.Section;
import org.apache.stanbol.enhancer.nlp.model.Sentence;
import org.apache.stanbol.enhancer.nlp.model.SpanTypeEnum;
@@ -153,10 +155,15 @@ public final class LinkableTokenFilter e
* the {@link #reduce(TagLL[])} method to check if {@link TagLL tags}
* do overlap with any linkable token.
*/
- private final List<Token> linkableTokens = new LinkedList<Token>();
+ private final List<TokenData> linkableTokens = new LinkedList<TokenData>();
+ /**
+ * The minimum score a tag needs to match processable tokens within a
+ * {@link Chunk} so that it is not omitted.
+ */
+ private double minChunkMatchScore;
protected LinkableTokenFilter(TokenStream input, AnalysedText at,
- String lang, LanguageProcessingConfig lpc) {
+ String lang, LanguageProcessingConfig lpc, double
minChunkMatchScore) {
super(input);
//STANBOL-1177: add attributes in doPrivileged to avoid
//AccessControlException: access denied ("java.lang.RuntimePermission"
"getClassLoader")
@@ -177,6 +184,7 @@ public final class LinkableTokenFilter e
this.lpc = lpc;
this.isUnicaseLanguage = lang != null && !lang.isEmpty() &&
UNICASE_SCRIPT_LANUAGES.contains(lang);
+ this.minChunkMatchScore = minChunkMatchScore;
}
@Override
@@ -312,11 +320,19 @@ public final class LinkableTokenFilter e
return true;
}
}
+ /**
+ * Adds a token. Also takes care of adding tokens to {@link #linkableTokens}
+ * @param token the token - MUST NOT be NULL.
+ */
private void addToken(TokenData token){
tokens.add(token);
if(token.isLinkable){
//add to the list of linkable for #reduce(TagLL[])
- linkableTokens.add(token.token);
+ linkableTokens.add(token);
+ } else if(token.isMatchable && !lpc.isIgnoreChunks() //matchable token
+ && token.inChunk != null && //in chunks with two or more
+ token.inChunk.getMatchableCount() > 1){ //matchable tokens
+ linkableTokens.add(token);
}
}
/**
@@ -329,26 +345,71 @@ public final class LinkableTokenFilter e
@Override
public void reduce(TagLL[] head) {
- Token linkableToken;
+ TokenData linkableToken;
for(TagLL tag = head[0]; tag != null; tag = tag.getNextTag()) {
int start = tag.getStartOffset();
int end = tag.getEndOffset();
linkableToken = linkableTokens.isEmpty() ? null :
linkableTokens.get(0);
- while(linkableToken != null && linkableToken.getEnd() <= start){
+ while(linkableToken != null && linkableToken.token.getEnd() <=
start){
linkableTokens.remove(0);
linkableToken = linkableTokens.isEmpty() ? null :
linkableTokens.get(0);
}
- if(linkableToken == null || linkableToken.getStart() >= end){
+ if(linkableToken == null || linkableToken.token.getStart() >= end){
//does not overlap any linkable token
tag.removeLL(); //remove the tag from the cluster
if(log.isTraceEnabled()){
CharSequence tagSequence = at.getText().subSequence(start,
end);
log.trace(" > reduce tag {}", tagSequence);
}
- } else {
- if(log.isTraceEnabled()){
+ } else { //if the tag overlaps a linkable token
+ ChunkData cd = linkableToken.inChunk; //check if it matches >
50% of the chunk
+ if(!lpc.isIgnoreChunks() && cd != null &&
+ cd.isProcessable){
+ int cstart = cd.getMatchableStartChar() >= 0 ?
cd.getMatchableStartChar() :
+ start;
+ int cend = cd.getMatchableEndChar();
+ if(cstart < start || cend > end){ //if the tag does not
cover the whole chunk
+ int num = 0;
+ int match = 0;
+ List<TokenData> tokens = sectionData.getTokens();
+ for(int i = cd.getMatchableStart(); i <=
cd.getMatchableEnd(); i++){
+ TokenData td = tokens.get(i);
+ if(td.isMatchable){
+ num++;
+ if(match < 1 && td.token.getStart() >= start ||
+ match > 0 && td.token.getEnd() <= end){
+ match++;
+ }
+ }
+ }
+ //only accept tags where more than half of the matchable
+ //tokens in the Chunk are matched!
+ if(((float)match/(float)num) < minChunkMatchScore){
+ tag.removeLL(); //ignore
+ if(log.isDebugEnabled()){
+ CharSequence text = at.getText();
+ log.debug(" - reduce tag {}[{},{}] because it
does only match "
+ + "{} of {} of matchable Chunk {}[{},{}]",
+ new Object[]{text.subSequence(start, end),
start, end, match,
+ num, text.subSequence(cstart,
cend), cstart, cend});
+ }
+ } else if(log.isDebugEnabled()){
+ CharSequence text = at.getText();
+ log.debug(" + keep tag {}[{},{}] matching {} of {}
"
+ + "matchable Tokens for matchable Chunk
{}[{},{}]",
+ new Object[]{text.subSequence(start, end),
start, end, match,
+ num, text.subSequence(cstart, cend),
cstart, cend});
+ }
+ } else if(log.isDebugEnabled()){
+ CharSequence text = at.getText();
+ log.debug(" + keep tag {}[{},{}] for matchable Chunk
{}[{},{}]",
+ new Object[]{text.subSequence(start, end), start,
end,
+ text.subSequence(cstart, cend), cstart,
cend});
+ }
+ }
+ if(log.isDebugEnabled()){
CharSequence tagSequence = at.getText().subSequence(start,
end);
- log.trace(" > keep tag {}", tagSequence);
+ log.debug(" + keep tag {}", tagSequence);
}
}
}