Author: rwesten
Date: Thu Nov 21 07:44:36 2013
New Revision: 1544052
URL: http://svn.apache.org/r1544052
Log:
STANBOL-1211: merged changes in LinkableTokenFilter
Modified:
stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java
Modified:
stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java
URL:
http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java?rev=1544052&r1=1544051&r2=1544052&view=diff
==============================================================================
---
stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java
(original)
+++
stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java
Thu Nov 21 07:44:36 2013
@@ -36,10 +36,12 @@ import org.apache.lucene.analysis.tokena
import
org.apache.stanbol.enhancer.engines.entitylinking.config.LanguageProcessingConfig;
import
org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig;
import
org.apache.stanbol.enhancer.engines.entitylinking.engine.EntityLinkingEngine;
+import org.apache.stanbol.enhancer.engines.entitylinking.impl.ChunkData;
import org.apache.stanbol.enhancer.engines.entitylinking.impl.ProcessingState;
import org.apache.stanbol.enhancer.engines.entitylinking.impl.SectionData;
import org.apache.stanbol.enhancer.engines.entitylinking.impl.TokenData;
import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
+import org.apache.stanbol.enhancer.nlp.model.Chunk;
import org.apache.stanbol.enhancer.nlp.model.Section;
import org.apache.stanbol.enhancer.nlp.model.Sentence;
import org.apache.stanbol.enhancer.nlp.model.Span.SpanTypeEnum;
@@ -151,12 +153,20 @@ public final class LinkableTokenFilter e
/**
* List with {@link TokenData#isLinkable linkable} {@link Token}s used by
* the {@link #reduce(TagLL[])} method to check if {@link TagLL tags}
- * do overlap with any linkable token.
+ * do overlap with any linkable token. With STANBOL-1211 this was changed to
+ * use {@link LinkableTokenContext} as {@link #reduce(TagLL[])} requires
+ * to access surrounding {@link Token}s to check for {@link Chunk}s
*/
- private final List<Token> linkableTokens = new LinkedList<Token>();
+ private final List<LinkableTokenContext> linkableTokens = new
LinkedList<LinkableTokenContext>();
+
+ /**
+ * The minimum score a tag needs to match processable tokens within a
+ * {@link Chunk} so that it is not omitted.
+ */
+ private double minChunkMatchScore;
protected LinkableTokenFilter(TokenStream input, AnalysedText at,
- String lang, LanguageProcessingConfig lpc) {
+ String lang, LanguageProcessingConfig lpc, double
minChunkMatchScore) {
super(input);
//STANBOL-1177: add attributes in doPrivileged to avoid
//AccessControlException: access denied ("java.lang.RuntimePermission"
"getClassLoader")
@@ -177,6 +187,7 @@ public final class LinkableTokenFilter e
this.lpc = lpc;
this.isUnicaseLanguage = lang != null && !lang.isEmpty() &&
UNICASE_SCRIPT_LANUAGES.contains(lang);
+ this.minChunkMatchScore = minChunkMatchScore;
}
@Override
@@ -312,11 +323,20 @@ public final class LinkableTokenFilter e
return true;
}
}
+ /**
+ * Adds a token. Also cares about adding tokens to {@link #linkableTokens}
+ * @param token the tokens - MUST NOT be NULL.
+ */
private void addToken(TokenData token){
tokens.add(token);
if(token.isLinkable){
//add to the list of linkable for #reduce(TagLL[])
- linkableTokens.add(token.token);
+ linkableTokens.add(new
LinkableTokenContext(token,sectionData.getTokens()));
+ } else if(token.isMatchable && !lpc.isIgnoreChunks() &&//matchable
token
+ token.inChunk != null && //in processable chunks with more
+ token.inChunk.isProcessable && //as two matchable tokens
+ token.inChunk.getMatchableCount() > 1){ //matchable tokens
+ linkableTokens.add(new
LinkableTokenContext(token,sectionData.getTokens()));
}
}
/**
@@ -326,33 +346,99 @@ public final class LinkableTokenFilter e
private TokenData getToken(){
return tokens.isEmpty() ? null : tokens.get(tokensCursor);
}
-
@Override
public void reduce(TagLL[] head) {
- Token linkableToken;
+ LinkableTokenContext linkableTokenContext;
for(TagLL tag = head[0]; tag != null; tag = tag.getNextTag()) {
int start = tag.getStartOffset();
int end = tag.getEndOffset();
- linkableToken = linkableTokens.isEmpty() ? null :
linkableTokens.get(0);
- while(linkableToken != null && linkableToken.getEnd() <= start){
+ linkableTokenContext = linkableTokens.isEmpty() ? null :
linkableTokens.get(0);
+ while(linkableTokenContext != null &&
linkableTokenContext.linkableToken.token.getEnd() <= start){
linkableTokens.remove(0);
- linkableToken = linkableTokens.isEmpty() ? null :
linkableTokens.get(0);
+ linkableTokenContext = linkableTokens.isEmpty() ? null :
linkableTokens.get(0);
}
- if(linkableToken == null || linkableToken.getStart() >= end){
+ if(linkableTokenContext == null ||
linkableTokenContext.linkableToken.token.getStart() >= end){
//does not overlap any linkable token
tag.removeLL(); //remove the tag from the cluster
if(log.isTraceEnabled()){
CharSequence tagSequence = at.getText().subSequence(start,
end);
log.trace(" > reduce tag {}", tagSequence);
}
- } else {
+ } else { //if the tag overlaps a linkable token
+ TokenData linkableToken = linkableTokenContext.linkableToken;
+ List<TokenData> tokens = linkableTokenContext.context;
+ ChunkData cd = linkableToken.inChunk; //check if it matches > 50% of the chunk
+ if(!lpc.isIgnoreChunks() && cd != null &&
+ cd.isProcessable){
+ int cstart = cd.getMatchableStartChar() >= 0 ?
cd.getMatchableStartChar() :
+ start;
+ int cend = cd.getMatchableEndChar();
+ if(cstart < start || cend > end){ //if the tag does not
cover the whole chunk
+ int num = 0;
+ int match = 0;
+ for(int i = cd.getMatchableStart(); i <=
cd.getMatchableEnd(); i++){
+ TokenData td = tokens.get(i);
+ if(td.isMatchable){
+ num++;
+ if(match < 1 && td.token.getStart() >= start ||
+ match > 0 && td.token.getEnd() <= end){
+ match++;
+ }
+ }
+ }
+ //only accept tags with more than half of the matchable
+ //tokens in the Chunk are matched!
+ if(((float)match/(float)num) < minChunkMatchScore){
+ tag.removeLL(); //ignore
+ if(log.isTraceEnabled()){
+ CharSequence text = at.getText();
+ log.trace(" - reduce tag {}[{},{}] because it
does only match "
+ + "{} of {} of matchable Chunk {}[{},{}]",
+ new Object[]{text.subSequence(start, end),
start, end, match,
+ num, text.subSequence(cstart,
cend), cstart, cend});
+ }
+ } else if(log.isTraceEnabled()){
+ CharSequence text = at.getText();
+ log.trace(" + keep tag {}[{},{}] matching {} of {}
"
+ + "matchable Tokens for matchable Chunk
{}[{},{}]",
+ new Object[]{text.subSequence(start, end),
start, end, match,
+ num, text.subSequence(cstart, cend),
cstart, cend});
+ }
+ } else if(log.isTraceEnabled()){
+ CharSequence text = at.getText();
+ log.trace(" + keep tag {}[{},{}] for matchable Chunk
{}[{},{}]",
+ new Object[]{text.subSequence(start, end), start,
end,
+ text.subSequence(cstart, cend), cstart,
cend});
+ }
+ }
if(log.isTraceEnabled()){
CharSequence tagSequence = at.getText().subSequence(start,
end);
- log.trace(" > keep tag {}", tagSequence);
+ log.trace(" + keep tag {}", tagSequence);
}
}
}
}
+
+ /**
+ * Holds the context for a linkable {@link Token}. This ensures that the
+ * list of Tokens of the current {@link Section} (typically a {@link Sentence})
+ * is still available even if the {@link LinkableTokenFilter#sectionData}
+ * already holds tokens for the next section.<p>
+ * This is necessary as {@link LinkableTokenFilter#reduce(TagLL[])} can
+ * be called for the previous sentence in cases where a Tag cluster includes
+ * the last {@link Token} of a {@link Section}.
+ * @author Rupert Westenthaler
+ *
+ */
+ private static class LinkableTokenContext {
+ final TokenData linkableToken;
+ final List<TokenData> context;
+
+ LinkableTokenContext(TokenData linkableToken, List<TokenData> context){
+ this.linkableToken = linkableToken;
+ this.context = context;
+ }
+ }
}