Author: rwesten
Date: Thu Nov 21 07:44:36 2013
New Revision: 1544052
URL: http://svn.apache.org/r1544052
Log:
STANBOL-1211: merged changes in LinkableTokenFilter
Modified:
stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java
Modified:
stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java
URL:
http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java?rev=1544052&r1=1544051&r2=1544052&view=diff
==============================================================================
---
stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java
(original)
+++
stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java
Thu Nov 21 07:44:36 2013
@@ -36,10 +36,12 @@ import org.apache.lucene.analysis.tokena
import
org.apache.stanbol.enhancer.engines.entitylinking.config.LanguageProcessingConfig;
import
org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig;
import
org.apache.stanbol.enhancer.engines.entitylinking.engine.EntityLinkingEngine;
+import org.apache.stanbol.enhancer.engines.entitylinking.impl.ChunkData;
import org.apache.stanbol.enhancer.engines.entitylinking.impl.ProcessingState;
import org.apache.stanbol.enhancer.engines.entitylinking.impl.SectionData;
import org.apache.stanbol.enhancer.engines.entitylinking.impl.TokenData;
import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
+import org.apache.stanbol.enhancer.nlp.model.Chunk;
import org.apache.stanbol.enhancer.nlp.model.Section;
import org.apache.stanbol.enhancer.nlp.model.Sentence;
import org.apache.stanbol.enhancer.nlp.model.Span.SpanTypeEnum;
@@ -151,12 +153,20 @@ public final class LinkableTokenFilter e
/**
* List with {@link TokenData#isLinkable linkable} {@link Token}s used by
* the {@link #reduce(TagLL[])} method to check if {@link TagLL tags}
- * do overlap with any linkable token.
+ * do overlap with any linkable token. With STANBOL-1211 this was changed to
+ * use {@link LinkableTokenContext} as {@link #reduce(TagLL[])} requires
+ * to access surrounding {@link Token}s to check for {@link Chunk}s
*/
- private final List<Token> linkableTokens = new LinkedList<Token>();
+ private final List<LinkableTokenContext> linkableTokens = new
LinkedList<LinkableTokenContext>();
+
+ /**
+ * The minimum score a tag needs to match processable tokens within a
+ * {@link Chunk} so that it is not omitted.
+ */
+ private double minChunkMatchScore;
protected LinkableTokenFilter(TokenStream input, AnalysedText at,
- String lang, LanguageProcessingConfig lpc) {
+ String lang, LanguageProcessingConfig lpc, double
minChunkMatchScore) {
super(input);
//STANBOL-1177: add attributes in doPrivileged to avoid
//AccessControlException: access denied ("java.lang.RuntimePermission"
"getClassLoader")
@@ -177,6 +187,7 @@ public final class LinkableTokenFilter e
this.lpc = lpc;
this.isUnicaseLanguage = lang != null && !lang.isEmpty() &&
UNICASE_SCRIPT_LANUAGES.contains(lang);
+ this.minChunkMatchScore = minChunkMatchScore;
}
@Override
@@ -312,11 +323,20 @@ public final class LinkableTokenFilter e
return true;
}
}
+ /**
+ * Adds a token. Also cares about adding tokens to {@link #linkableTokens}
+ * @param token the tokens - MUST NOT be NULL.
+ */
private void addToken(TokenData token){
tokens.add(token);
if(token.isLinkable){
//add to the list of linkable for #reduce(TagLL[])
- linkableTokens.add(token.token);
+ linkableTokens.add(new
LinkableTokenContext(token,sectionData.getTokens()));
+ } else if(token.isMatchable && !lpc.isIgnoreChunks() &&//matchable
token
+ token.inChunk != null && //in processable chunks with more
+ token.inChunk.isProcessable && //as two matchable tokens
+ token.inChunk.getMatchableCount() > 1){ //matchable tokens
+ linkableTokens.add(new
LinkableTokenContext(token,sectionData.getTokens()));
}
}
/**
@@ -326,33 +346,99 @@ public final class LinkableTokenFilter e
private TokenData getToken(){
return tokens.isEmpty() ? null : tokens.get(tokensCursor);
}
-
@Override
public void reduce(TagLL[] head) {
- Token linkableToken;
+ LinkableTokenContext linkableTokenContext;
for(TagLL tag = head[0]; tag != null; tag = tag.getNextTag()) {
int start = tag.getStartOffset();
int end = tag.getEndOffset();
- linkableToken = linkableTokens.isEmpty() ? null :
linkableTokens.get(0);
- while(linkableToken != null && linkableToken.getEnd() <= start){
+ linkableTokenContext = linkableTokens.isEmpty() ? null :
linkableTokens.get(0);
+ while(linkableTokenContext != null &&
linkableTokenContext.linkableToken.token.getEnd() <= start){
linkableTokens.remove(0);
- linkableToken = linkableTokens.isEmpty() ? null :
linkableTokens.get(0);
+ linkableTokenContext = linkableTokens.isEmpty() ? null :
linkableTokens.get(0);
}
- if(linkableToken == null || linkableToken.getStart() >= end){
+ if(linkableTokenContext == null ||
linkableTokenContext.linkableToken.token.getStart() >= end){
//does not overlap any linkable token
tag.removeLL(); //remove the tag from the cluster
if(log.isTraceEnabled()){
CharSequence tagSequence = at.getText().subSequence(start,
end);
log.trace(" > reduce tag {}", tagSequence);
}
- } else {
+ } else { //if the tag overlaps a linkable token
+ TokenData linkableToken = linkableTokenContext.linkableToken;
+ List<TokenData> tokens = linkableTokenContext.context;
+ ChunkData cd = linkableToken.inChunk; //check if it matches > 50% of the chunk
+ if(!lpc.isIgnoreChunks() && cd != null &&
+ cd.isProcessable){
+ int cstart = cd.getMatchableStartChar() >= 0 ?
cd.getMatchableStartChar() :
+ start;
+ int cend = cd.getMatchableEndChar();
+ if(cstart < start || cend > end){ //if the tag does not
cover the whole chunk
+ int num = 0;
+ int match = 0;
+ for(int i = cd.getMatchableStart(); i <=
cd.getMatchableEnd(); i++){
+ TokenData td = tokens.get(i);
+ if(td.isMatchable){
+ num++;
+ if(match < 1 && td.token.getStart() >= start ||
+ match > 0 && td.token.getEnd() <= end){
+ match++;
+ }
+ }
+ }
+ //only accept tags with more than half of the matchable
+ //tokens in the Chunk are matched!
+ if(((float)match/(float)num) < minChunkMatchScore){
+ tag.removeLL(); //ignore
+ if(log.isTraceEnabled()){
+ CharSequence text = at.getText();
+ log.trace(" - reduce tag {}[{},{}] because it
does only match "
+ + "{} of {} of matchable Chunk {}[{},{}]",
+ new Object[]{text.subSequence(start, end),
start, end, match,
+ num, text.subSequence(cstart,
cend), cstart, cend});
+ }
+ } else if(log.isTraceEnabled()){
+ CharSequence text = at.getText();
+ log.trace(" + keep tag {}[{},{}] matching {} of {}
"
+ + "matchable Tokens for matchable Chunk
{}[{},{}]",
+ new Object[]{text.subSequence(start, end),
start, end, match,
+ num, text.subSequence(cstart, cend),
cstart, cend});
+ }
+ } else if(log.isTraceEnabled()){
+ CharSequence text = at.getText();
+ log.trace(" + keep tag {}[{},{}] for matchable Chunk
{}[{},{}]",
+ new Object[]{text.subSequence(start, end), start,
end,
+ text.subSequence(cstart, cend), cstart,
cend});
+ }
+ }
if(log.isTraceEnabled()){
CharSequence tagSequence = at.getText().subSequence(start,
end);
- log.trace(" > keep tag {}", tagSequence);
+ log.trace(" + keep tag {}", tagSequence);
}
}
}
}
+
+ /**
+ * Holds the context for a linkable {@link Token}. This ensures that the
+ * list of Tokens of the current {@link Section} (typically a {@link Sentence})
+ * is still available even if the {@link LinkableTokenFilter#sectionData}
+ * already holds tokens for the next section.<p>
+ * This is necessary as {@link LinkableTokenFilter#reduce(TagLL[])} can
+ * be called for the previous sentence in cases where a Tag cluster includes
+ * the last {@link Token} of a {@link Section}.
+ * @author Rupert Westenthaler
+ *
+ */
+ private static class LinkableTokenContext {
+ final TokenData linkableToken;
+ final List<TokenData> context;
+
+ LinkableTokenContext(TokenData linkableToken, List<TokenData> context){
+ this.linkableToken = linkableToken;
+ this.context = context;
+ }
+ }
}