Author: rwesten
Date: Fri Jan 10 07:29:22 2014
New Revision: 1557044

URL: http://svn.apache.org/r1557044
Log:
STANBOL-1252: merged NUM_TOKEN_FOUND feature of Lucene FST Linking Engine to 
0.12 branch

Modified:
    
stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java
    
stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java
    
stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java

Modified: 
stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java
URL: 
http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java?rev=1557044&r1=1557043&r2=1557044&view=diff
==============================================================================
--- 
stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java
 (original)
+++ 
stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java
 Fri Jan 10 07:29:22 2014
@@ -370,13 +370,13 @@ public class FstLinkingEngine implements
             new CharSequenceReader(at.getText()));
         LinkableTokenFilter linkableTokenFilter = new 
LinkableTokenFilter(baseTokenStream, 
             at, session.getLanguage(), 
tpConfig.getConfiguration(session.getLanguage()),
-            elConfig.getMinChunkMatchScore());
+            elConfig.getMinChunkMatchScore(), elConfig.getMinFoundTokens());
         //we use two TagClusterReducer implementations.
         // (1) the linkableTokenFilter filters all tags that do not overlap any
         //     linkable Token
         // (2) the LONGEST_DOMINANT_RIGHT reducer (TODO: make configurable)
         TagClusterReducer reducer = new ChainedTagClusterReducer(
-            linkableTokenFilter,TagClusterReducer.LONGEST_DOMINANT_RIGHT);
+            TagClusterReducer.LONGEST_DOMINANT_RIGHT, linkableTokenFilter);
         final long[] time = new long[]{0};
         new Tagger(corpus.getFst(), linkableTokenFilter, 
reducer,session.isSkipAltTokens()) {
             

Modified: 
stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java
URL: 
http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java?rev=1557044&r1=1557043&r2=1557044&view=diff
==============================================================================
--- 
stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java
 (original)
+++ 
stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java
 Fri Jan 10 07:29:22 2014
@@ -213,6 +213,13 @@ public class FstLinkingEngineComponent {
      * The default size of the Entity Cache is set to 65k entities.
      */
     public static final int DEFAULT_ENTITY_CACHE_SIZE = 65536;
+
+    /**
+     * Changed default for the {@link EntityLinkerConfig#MIN_FOUND_TOKENS} 
property.
+     * This Engine uses <code>2</code> as default. While the {@link 
EntityLinkerConfig}
+     * currently sets the default to <code>1</code>
+     */
+    private static final Integer FST_DEFAULT_MIN_FOUND_TOKENS = 2;
     
     private final Logger log = 
LoggerFactory.getLogger(FstLinkingEngineComponent.class);
     /**
@@ -352,7 +359,13 @@ public class FstLinkingEngineComponent {
         //(1) parse the TextProcessing configuration
         //TODO: decide if we should use the TextProcessingConfig for this 
engine
         textProcessingConfig = TextProcessingConfig.createInstance(properties);
+        //change default for EntityLinkerConfig.MIN_FOUND_TOKENS
+        value = properties.get(EntityLinkerConfig.MIN_FOUND_TOKENS);
         entityLinkerConfig = EntityLinkerConfig.createInstance(properties, 
prefixService);
+        if(value == null){ //no MIN_FOUND_TOKENS config present
+            //manually set the default to the value used by this engine
+            entityLinkerConfig.setMinFoundTokens(FST_DEFAULT_MIN_FOUND_TOKENS);
+        }
         
         //(2) parse the configured IndexReference
         value = properties.get(SOLR_CORE);

Modified: 
stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java
URL: 
http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java?rev=1557044&r1=1557043&r2=1557044&view=diff
==============================================================================
--- 
stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java
 (original)
+++ 
stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java
 Fri Jan 10 07:29:22 2014
@@ -164,9 +164,14 @@ public final class LinkableTokenFilter e
      * {@link Chunk} so that is is not omitted. 
      */
     private double minChunkMatchScore;
+    /**
+     * The minimum amount of matched (matchable) Tokens so that an Entity is
+     * considered. Only used within processable chunks
+     */
+    private int minFoundTokens;
     
     protected LinkableTokenFilter(TokenStream input, AnalysedText at, 
-            String lang, LanguageProcessingConfig lpc, double 
minChunkMatchScore) {
+            String lang, LanguageProcessingConfig lpc, double 
minChunkMatchScore, int minFoundTokens) {
         super(input);
         //STANBOL-1177: add attributes in doPrivileged to avoid 
         //AccessControlException: access denied ("java.lang.RuntimePermission" 
"getClassLoader")
@@ -188,6 +193,7 @@ public final class LinkableTokenFilter e
         this.isUnicaseLanguage = lang != null && !lang.isEmpty() &&
                 UNICASE_SCRIPT_LANUAGES.contains(lang);
         this.minChunkMatchScore = minChunkMatchScore;
+        this.minFoundTokens = minFoundTokens;
     }
 
     @Override
@@ -362,13 +368,13 @@ public final class LinkableTokenFilter e
                 tag.removeLL(); //remove the tag from the cluster
                 if(log.isTraceEnabled()){
                     CharSequence tagSequence = at.getText().subSequence(start, 
end);
-                    log.trace(" > reduce tag {}", tagSequence);
+                    log.trace(" > reduce tag {} - no overlapp with linkable 
token", tagSequence);
                 }
             } else { //if the tag overlaps a linkable token 
                 TokenData linkableToken = linkableTokenContext.linkableToken;
                 List<TokenData> tokens = linkableTokenContext.context;
                 ChunkData cd = linkableToken.inChunk; //check if it maches > 
50% of the chunk
-                if(!lpc.isIgnoreChunks() && cd != null &&
+                 if(!lpc.isIgnoreChunks() && cd != null &&
                         cd.isProcessable){
                     int cstart = cd.getMatchableStartChar() >= 0 ? 
cd.getMatchableStartChar() :
                         start;
@@ -388,32 +394,32 @@ public final class LinkableTokenFilter e
                         }
                         //only accept tags with more as half of the matchable
                         //tokens in the Chunk are matched!
-                        if(((float)match/(float)num) < minChunkMatchScore){
+                        if(((float)match/(float)num) < minChunkMatchScore &&
+                                match < minFoundTokens){
                             tag.removeLL(); //ignore
                             if(log.isTraceEnabled()){
                                 CharSequence text = at.getText();
-                                log.trace(" - reduce tag {}[{},{}] because it 
does only match "
+                                log.trace(" - reduce tag {}[{},{}] - does only 
match "
                                     + "{} of {} of matchable Chunk {}[{},{}]", 
                                     new Object[]{text.subSequence(start, end), 
start, end, match,  
                                             num, text.subSequence(cstart, 
cend), cstart, cend});
                             }
                         } else if(log.isTraceEnabled()){
                             CharSequence text = at.getText();
-                            log.trace(" + keep tag {}[{},{}] matching {} of {} 
"
+                            log.trace(" + keep tag {}[{},{}] - matches {} of 
{} "
                                 + "matchable Tokens for matchable Chunk 
{}[{},{}]", 
                                 new Object[]{text.subSequence(start, end), 
start, end, match,
                                         num, text.subSequence(cstart, cend), 
cstart, cend});
                         }
                     } else if(log.isTraceEnabled()){
                         CharSequence text = at.getText();
-                        log.trace(" + keep tag {}[{},{}] for matchable Chunk 
{}[{},{}]", 
+                        log.trace(" + keep tag {}[{},{}] - matches whole Chunk 
{}[{},{}]", 
                             new Object[]{text.subSequence(start, end), start, 
end, 
                                  text.subSequence(cstart, cend), cstart, 
cend});
                     }
-                }
-                if(log.isTraceEnabled()){
+                } else if(log.isTraceEnabled()){
                     CharSequence tagSequence = at.getText().subSequence(start, 
end);
-                    log.trace(" + keep tag {}", tagSequence);
+                    log.trace(" + keep tag {} - not in processable chunk", 
tagSequence);
                 }
             }
         }


Reply via email to