Author: rwesten
Date: Tue Nov 19 10:55:24 2013
New Revision: 1543373

URL: http://svn.apache.org/r1543373
Log:
implementation of STANBOL-1211 for the lucene FST linking engine

Modified:
    
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java
    
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java
    
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java

Modified: 
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java
URL: 
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java?rev=1543373&r1=1543372&r2=1543373&view=diff
==============================================================================
--- 
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java
 (original)
+++ 
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java
 Tue Nov 19 10:55:24 2013
@@ -269,9 +269,14 @@ public class FstLinkingEngine implements
                         double length = Math.max(alength, 
matchLabel.getLexicalForm().length());
                         match.setMatch(1d - 
((double)distance/length),matchLabel);
                     }
-                    log.trace(" ... add suggestion: label: '{}'; conf: {}", 
+                    if(match.getScore() >= elConfig.getMinMatchScore()){
+                        log.trace(" ... add suggestion: label: '{}'; conf: 
{}", 
                             matchLabel, match.getScore());
-                    suggestions.add(match);
+                        suggestions.add(match);
+                    } else {
+                        log.trace(" ... filtered because match score < {}", 
+                            elConfig.getMinMatchScore());
+                    }
                 } else { //the type of the current Entity is blacklisted
                     log.trace("  ... filtered because of entity types");
                 }
@@ -356,7 +361,8 @@ public class FstLinkingEngine implements
         TokenStream baseTokenStream = 
corpus.getTaggingAnalyzer().tokenStream("", 
             new CharSequenceReader(at.getText()));
         LinkableTokenFilter linkableTokenFilter = new 
LinkableTokenFilter(baseTokenStream, 
-            at, session.getLanguage(), 
tpConfig.getConfiguration(session.getLanguage()));
+            at, session.getLanguage(), 
tpConfig.getConfiguration(session.getLanguage()),
+            elConfig.getMinChunkMatchScore());
         //we use two TagClusterReducer implementations.
         // (1) the linkableTokenFilter filters all tags that do not overlap any
         //     linkable Token

Modified: 
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java
URL: 
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java?rev=1543373&r1=1543372&r2=1543373&view=diff
==============================================================================
--- 
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java
 (original)
+++ 
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java
 Tue Nov 19 10:55:24 2013
@@ -55,6 +55,7 @@ import org.apache.felix.scr.annotations.
 import org.apache.felix.scr.annotations.Property;
 import org.apache.felix.scr.annotations.PropertyOption;
 import org.apache.felix.scr.annotations.Reference;
+import org.apache.felix.scr.annotations.ReferenceCardinality;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.index.AtomicReader;
 import org.apache.lucene.index.FieldInfo;
@@ -222,7 +223,7 @@ public class FstLinkingEngineComponent {
     /**
      * used to resolve '{prefix}:{local-name}' used within the engines 
configuration
      */
-    @Reference
+    @Reference(cardinality=ReferenceCardinality.OPTIONAL_UNARY)
     protected NamespacePrefixService prefixService;    
 
     /**

Modified: 
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java
URL: 
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java?rev=1543373&r1=1543372&r2=1543373&view=diff
==============================================================================
--- 
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java
 (original)
+++ 
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java
 Tue Nov 19 10:55:24 2013
@@ -36,10 +36,12 @@ import org.apache.lucene.analysis.tokena
 import 
org.apache.stanbol.enhancer.engines.entitylinking.config.LanguageProcessingConfig;
 import 
org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig;
 import 
org.apache.stanbol.enhancer.engines.entitylinking.engine.EntityLinkingEngine;
+import org.apache.stanbol.enhancer.engines.entitylinking.impl.ChunkData;
 import org.apache.stanbol.enhancer.engines.entitylinking.impl.ProcessingState;
 import org.apache.stanbol.enhancer.engines.entitylinking.impl.SectionData;
 import org.apache.stanbol.enhancer.engines.entitylinking.impl.TokenData;
 import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
+import org.apache.stanbol.enhancer.nlp.model.Chunk;
 import org.apache.stanbol.enhancer.nlp.model.Section;
 import org.apache.stanbol.enhancer.nlp.model.Sentence;
 import org.apache.stanbol.enhancer.nlp.model.SpanTypeEnum;
@@ -153,10 +155,15 @@ public final class LinkableTokenFilter e
      * the {@link #reduce(TagLL[])} method to check if {@link TagLL tags} 
      * do overlap with any linkable token.
      */
-    private final List<Token> linkableTokens = new LinkedList<Token>();
+    private final List<TokenData> linkableTokens = new LinkedList<TokenData>();
+    /**
+     * The minimum score a tag needs to match processable tokens within a
+     * {@link Chunk} so that it is not omitted. 
+     */
+    private double minChunkMatchScore;
     
     protected LinkableTokenFilter(TokenStream input, AnalysedText at, 
-            String lang, LanguageProcessingConfig lpc) {
+            String lang, LanguageProcessingConfig lpc, double 
minChunkMatchScore) {
         super(input);
         //STANBOL-1177: add attributes in doPrivileged to avoid 
         //AccessControlException: access denied ("java.lang.RuntimePermission" 
"getClassLoader")
@@ -177,6 +184,7 @@ public final class LinkableTokenFilter e
         this.lpc = lpc;
         this.isUnicaseLanguage = lang != null && !lang.isEmpty() &&
                 UNICASE_SCRIPT_LANUAGES.contains(lang);
+        this.minChunkMatchScore = minChunkMatchScore;
     }
 
     @Override
@@ -312,11 +320,19 @@ public final class LinkableTokenFilter e
             return true;
         }
     }
+    /**
+     * Adds a token. Also cares about adding tokens to {@link #linkableTokens}
+     * @param token the token - MUST NOT be NULL.
+     */
     private void addToken(TokenData token){
         tokens.add(token);
         if(token.isLinkable){
             //add to the list of linkable for #reduce(TagLL[])
-            linkableTokens.add(token.token);
+            linkableTokens.add(token);
+        } else if(token.isMatchable && !lpc.isIgnoreChunks() //matchable token
+                && token.inChunk != null && //in chunks with two or more
+                token.inChunk.getMatchableCount() > 1){ //matchable tokens
+            linkableTokens.add(token);
         }
     }
     /**
@@ -329,26 +345,71 @@ public final class LinkableTokenFilter e
 
     @Override
     public void reduce(TagLL[] head) {
-        Token linkableToken;
+        TokenData linkableToken;
         for(TagLL tag = head[0]; tag != null; tag = tag.getNextTag()) {
             int start = tag.getStartOffset();
             int end = tag.getEndOffset();
             linkableToken = linkableTokens.isEmpty() ? null : 
linkableTokens.get(0);
-            while(linkableToken != null && linkableToken.getEnd() <= start){
+            while(linkableToken != null && linkableToken.token.getEnd() <= 
start){
                 linkableTokens.remove(0);
                 linkableToken = linkableTokens.isEmpty() ? null : 
linkableTokens.get(0);
             }
-            if(linkableToken == null || linkableToken.getStart() >= end){
+            if(linkableToken == null || linkableToken.token.getStart() >= end){
                 //does not overlap any linkable token
                 tag.removeLL(); //remove the tag from the cluster
                 if(log.isTraceEnabled()){
                     CharSequence tagSequence = at.getText().subSequence(start, 
end);
                     log.trace(" > reduce tag {}", tagSequence);
                 }
-            } else {
-                if(log.isTraceEnabled()){
+            } else { //if the tag overlaps a linkable token 
+                ChunkData cd = linkableToken.inChunk; //check if it matches > 
50% of the chunk
+                if(!lpc.isIgnoreChunks() && cd != null &&
+                        cd.isProcessable){
+                    int cstart = cd.getMatchableStartChar() >= 0 ? 
cd.getMatchableStartChar() :
+                        start;
+                    int cend = cd.getMatchableEndChar();
+                    if(cstart < start || cend > end){ //if the tag does not 
cover the whole chunk
+                        int num = 0;
+                        int match = 0;
+                        List<TokenData> tokens = sectionData.getTokens();
+                        for(int i = cd.getMatchableStart(); i <= 
cd.getMatchableEnd(); i++){
+                            TokenData td = tokens.get(i);
+                            if(td.isMatchable){
+                                num++;
+                                if(match < 1 && td.token.getStart() >= start ||
+                                        match > 0 && td.token.getEnd() <= end){
+                                    match++;
+                                }
+                            }
+                        }
+                        //only accept tags where more than half of the matchable
+                        //tokens in the Chunk are matched!
+                        if(((float)match/(float)num) < minChunkMatchScore){
+                            tag.removeLL(); //ignore
+                            if(log.isDebugEnabled()){
+                                CharSequence text = at.getText();
+                                log.debug(" - reduce tag {}[{},{}] because it 
does only match "
+                                    + "{} of {} of matchable Chunk {}[{},{}]", 
+                                    new Object[]{text.subSequence(start, end), 
start, end, match,  
+                                            num, text.subSequence(cstart, 
cend), cstart, cend});
+                            }
+                        } else if(log.isDebugEnabled()){
+                            CharSequence text = at.getText();
+                            log.debug(" + keep tag {}[{},{}] matching {} of {} 
"
+                                + "matchable Tokens for matchable Chunk 
{}[{},{}]", 
+                                new Object[]{text.subSequence(start, end), 
start, end, match,
+                                        num, text.subSequence(cstart, cend), 
cstart, cend});
+                        }
+                    } else if(log.isDebugEnabled()){
+                        CharSequence text = at.getText();
+                        log.debug(" + keep tag {}[{},{}] for matchable Chunk 
{}[{},{}]", 
+                            new Object[]{text.subSequence(start, end), start, 
end, 
+                                 text.subSequence(cstart, cend), cstart, 
cend});
+                    }
+                }
+                if(log.isDebugEnabled()){
                     CharSequence tagSequence = at.getText().subSequence(start, 
end);
-                    log.trace(" > keep tag {}", tagSequence);
+                    log.debug(" + keep tag {}", tagSequence);
                 }
             }
         }


Reply via email to