Author: rwesten
Date: Tue Nov 19 13:51:46 2013
New Revision: 1543431

URL: http://svn.apache.org/r1543431
Log:
merged STANBOL-1211 to the 0.12 releasing branch

Modified:
    stanbol/branches/release-0.12/   (props changed)
    
stanbol/branches/release-0.12/data/defaultconfig/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-dbpedia_fst.config
    stanbol/branches/release-0.12/enhancement-engines/   (props changed)
    
stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/EntityLinkerConfig.java
    
stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ChunkData.java
    
stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java
    
stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/SectionData.java
    
stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/test/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngineTest.java
    
stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java
    
stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java
    
stanbol/branches/release-0.12/integration-tests/src/test/java/org/apache/stanbol/enhancer/it/FstLinkingTest.java

Propchange: stanbol/branches/release-0.12/
------------------------------------------------------------------------------
  Merged /stanbol/trunk:r1543372-1543373,1543405

Modified: 
stanbol/branches/release-0.12/data/defaultconfig/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-dbpedia_fst.config
URL: 
http://svn.apache.org/viewvc/stanbol/branches/release-0.12/data/defaultconfig/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-dbpedia_fst.config?rev=1543431&r1=1543430&r2=1543431&view=diff
==============================================================================
--- 
stanbol/branches/release-0.12/data/defaultconfig/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-dbpedia_fst.config
 (original)
+++ 
stanbol/branches/release-0.12/data/defaultconfig/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-dbpedia_fst.config
 Tue Nov 19 13:51:46 2013
@@ -1,3 +1,3 @@
 stanbol.enhancer.chain.name="dbpedia-fst-linking"
-stanbol.enhancer.chain.weighted.chain=["tika;optional","langdetect","opennlp-sentence","opennlp-token","opennlp-pos","dbpedia-fst"]
+stanbol.enhancer.chain.weighted.chain=["tika;optional","langdetect","opennlp-sentence","opennlp-token","opennlp-pos","opennlp-chunker","dbpedia-fst"]
 service.ranking=I"0"
\ No newline at end of file

Propchange: stanbol/branches/release-0.12/enhancement-engines/
------------------------------------------------------------------------------
  Merged /stanbol/trunk/enhancement-engines:r1543372-1543373

Modified: 
stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/EntityLinkerConfig.java
URL: 
http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/EntityLinkerConfig.java?rev=1543431&r1=1543430&r2=1543431&view=diff
==============================================================================
--- 
stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/EntityLinkerConfig.java
 (original)
+++ 
stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/EntityLinkerConfig.java
 Tue Nov 19 13:51:46 2013
@@ -37,6 +37,7 @@ import org.apache.stanbol.enhancer.engin
 import org.apache.stanbol.enhancer.engines.entitylinking.impl.Suggestion;
 import org.apache.stanbol.enhancer.engines.entitylinking.impl.Suggestion.MATCH;
 import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
+import org.apache.stanbol.enhancer.nlp.model.Chunk;
 import org.apache.stanbol.enhancer.nlp.model.Token;
 import org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures;
 import org.apache.stanbol.enhancer.nlp.pos.Pos;
@@ -168,6 +169,14 @@ public class EntityLinkerConfig {
      */
     public static final String MIN_MATCH_FACTOR = 
"enhancer.engines.linking.minMatchScore";
     /**
+     * The minimum score an Entity must match matchable {@link Token}s within 
a processable
+     * {@link Chunk}. By {@link #DEFAULT_MIN_CHUNK_MATCH_SCORE default} this is
+     * set to <code>51%</code> to filter Entities that do only match a single 
token
+     * within a NounPhrase of two words. This feature was introduced with
+     * <a 
href="https://issues.apache.org/jira/browse/STANBOL-1211">STANBOL-1211</a>
+     */
+    public static final String MIN_CHUNK_MATCH_SCORE = 
"enhancer.engines.linking.minChunkMatchScore";
+    /**
      * The maximum number of {@link Token} used as search terms with the 
      * {@link EntitySearcher#lookup(String, Set, java.util.List, String[], 
Integer)}
      * method
@@ -263,6 +272,13 @@ public class EntityLinkerConfig {
     public static final double DEFAULT_MIN_TEXT_SCORE = 0.4;
     public static final double DEFAULT_MIN_MATCH_SCORE = 0.3;
     /**
+     * By default more as 50% of the matchable tokens of a processable chunk
+     * need to match so that a Entity is considered to be mentioned in the text
+     * (STANBOL-1211)
+     */
+    public static final double DEFAULT_MIN_CHUNK_MATCH_SCORE = 0.51;
+    
+    /**
      * Default mapping for Concept types to dc:type values added for
      * TextAnnotations.
      */
@@ -449,6 +465,11 @@ public class EntityLinkerConfig {
     private double minLabelScore = DEFAULT_MIN_LABEL_SCORE;
     private double minTextScore = DEFAULT_MIN_TEXT_SCORE;
     private double minMatchScore = DEFAULT_MIN_MATCH_SCORE;
+    /**
+     * The minimum score an entity needs to match matchable tokens within a
+     * chunk so that is is considered as a mentions (STANBOL-1211)
+     */
+    private double minChunkMatchScore = DEFAULT_MIN_CHUNK_MATCH_SCORE;
 
     private boolean rankEqualScoresBasedOnEntityRankings = 
DEFAULT_RANK_EQUAL_SCORES_BASED_ON_ENTITY_RANKINGS;
 
@@ -632,7 +653,25 @@ public class EntityLinkerConfig {
         } catch (IllegalArgumentException e){
             throw new ConfigurationException(MIN_MATCH_FACTOR, e.getMessage());
         }
-                
+        
+        value = configuration.get(MIN_CHUNK_MATCH_SCORE);
+        Double minChunkMatchScore = null;
+        if(value instanceof Number){
+            minChunkMatchScore = Double.valueOf(((Number)value).doubleValue());
+        } else if(value != null){
+            try {
+                minChunkMatchScore = Double.valueOf(value.toString());
+            } catch (NumberFormatException e) {
+                throw new ConfigurationException(MIN_CHUNK_MATCH_SCORE, 
"Parsed value '"
+                        +value+"' is not an valid double!");
+            }
+        }
+        try {
+            linkerConfig.setMinChunkMatchScore(minChunkMatchScore);
+        } catch (IllegalArgumentException e){
+            throw new ConfigurationException(MIN_CHUNK_MATCH_SCORE, 
e.getMessage());
+        }
+        
         //init LEMMA_MATCHING_STATE
         value = configuration.get(LEMMA_MATCHING_STATE);
         if(value instanceof Boolean){
@@ -1085,14 +1124,15 @@ public class EntityLinkerConfig {
      */
     public UriRef setTypeMapping(String conceptType, UriRef dcType){
         if(dcType == null) {
-            throw new IllegalArgumentException("The parsed dc:type URI MUST 
NOT be NULL!");
-        }
-        if(conceptType == null){ //handle setting of the default dc:type value
-            UriRef oldDefault = getDefaultDcType();
-            setDefaultDcType(dcType);
-            return oldDefault;
+            return typeMappings.remove(conceptType == null ? null : new 
UriRef(conceptType));
+        } else {
+            if(conceptType == null){ //handle setting of the default dc:type 
value
+                UriRef oldDefault = getDefaultDcType();
+                setDefaultDcType(dcType);
+                return oldDefault;
+            }
+            return typeMappings.put(new UriRef(conceptType), dcType);
         }
-        return typeMappings.put(new UriRef(conceptType), dcType);
     }
     
     /**
@@ -1306,7 +1346,35 @@ public class EntityLinkerConfig {
         } else {
             minTextScore = score;
         }
-    }    
+    }
+    /**
+     * Getter for the minimum amount of matchable {@link Token}s an Entity 
must match
+     * within an {@link Chunk} to be considered (see STANBOL-1211).<p>
+     * The default is <code>&gt;0.5</code> to omit matches for a single token
+     * in a chunk - typically a noun phrase - including two words.
+     * @return the minimum chunk match score.
+     */
+    public double getMinChunkMatchScore() {
+        return minChunkMatchScore;
+    }
+    /**
+     * Setter for the minimum amount of matchable {@link Token}s an Entity 
must match
+     * within an {@link Chunk} to be considered (see STANBOL-1211).<p>
+     * The default is <code>&gt;0.5</code> to omit matches for a single token
+     * in a chunk - typically a noun phrase - including two words.
+     * @param minChunkMatchScore the minimum chunk match score or 
<code>null</code>
+     * to reset to the default value
+     */
+    public void setMinChunkMatchScore(Double minChunkMatchScore) {
+        if(minChunkMatchScore == null){
+            this.minChunkMatchScore = DEFAULT_MIN_CHUNK_MATCH_SCORE;
+        } else if(minChunkMatchScore < 0.0 || minChunkMatchScore > 1.0){
+            throw new IllegalArgumentException("The minChunkMatchScore MUST BE 
"
+                + "in the range [0..1] (parsed: "+minChunkMatchScore+")!");
+        } else {
+            this.minChunkMatchScore = minChunkMatchScore;
+        }
+    }
     /**
      * Getter for the minimum match Score of Entity labels against the
      * Text.<p>

Modified: 
stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ChunkData.java
URL: 
http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ChunkData.java?rev=1543431&r1=1543430&r2=1543431&view=diff
==============================================================================
--- 
stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ChunkData.java
 (original)
+++ 
stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ChunkData.java
 Tue Nov 19 13:51:46 2013
@@ -63,6 +63,23 @@ public class ChunkData {
      */
     int matchableCount;
     /**
+     * The start position of the first matchable {@link Token} within this
+     * chunk
+     */
+    int matchableStart = -1;
+    /**
+     * The start char offset of the first matchable {@link Token} within this 
chunk
+     */
+    int matchableStartCharIndex = -1;
+    /**
+     * The end position of the last matchable {@link Token} within this chunk
+     */
+    int matchableEnd = -1;
+    /**
+     * The end char offset of the last matchable {@link Token} within this 
chunk
+     */
+    int matchableEndCharIndex = -1;
+    /**
      * constructs and initializes the meta data for the parsed {@link Chunk}
      * @param chunk
      */
@@ -121,4 +138,37 @@ public class ChunkData {
     public int getEndTokenIndex() {
         return endToken;
     }
+    /**
+     * The index of the first matchable Token within the {@link Chunk} or
+     * <code>-1</code> if none
+     * @return
+     */
+    public int getMatchableStart() {
+        return matchableStart;
+    }
+    /**
+     * The index of the last matchable Token within the {@link Chunk} or
+     * <code>-1</code> if none
+     * @return
+     */
+    public int getMatchableEnd() {
+        return matchableEnd;
+    }
+    /**
+     * The char index of the start character of the first matchable {@link 
Token}
+     * within the {@link Chunk} or <code>-1</code> if none.
+     * @return
+     */
+    public int getMatchableStartChar() {
+        return matchableStartCharIndex;
+    }
+    /**
+     * the char indes of the end character of the last matchable {@link Token}
+     * within the {@link Chunk} or <code>-1</code> if none
+     * @return
+     */
+    public int getMatchableEndChar() {
+        return matchableEndCharIndex;
+    }
+    
 }
\ No newline at end of file

Modified: 
stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java
URL: 
http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java?rev=1543431&r1=1543430&r2=1543431&view=diff
==============================================================================
--- 
stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java
 (original)
+++ 
stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java
 Tue Nov 19 13:51:46 2013
@@ -25,6 +25,7 @@ import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
+import java.util.Locale;
 import java.util.Map;
 import java.util.NavigableMap;
 import java.util.Set;
@@ -35,6 +36,7 @@ import org.apache.clerezza.rdf.core.Trip
 import org.apache.clerezza.rdf.core.TripleCollection;
 import org.apache.clerezza.rdf.core.UriRef;
 import org.apache.clerezza.rdf.core.impl.TripleImpl;
+import org.apache.commons.lang.LocaleUtils;
 import org.apache.commons.lang.StringUtils;
 import org.apache.stanbol.enhancer.engines.entitylinking.Entity;
 import org.apache.stanbol.enhancer.engines.entitylinking.EntitySearcher;
@@ -162,24 +164,19 @@ public class EntityLinker {
             //Determine the range we are allowed to search for tokens
             final int minIncludeIndex;
             final int maxIndcludeIndex;
+            int consumedIndex = state.getConsumedIndex();
             //NOTE: testing has shown that using Chunks to restrict search for
             //      additional matchable tokens does have an negative impact on
             //      recall. Because of that this restriction is for now 
deactivated
-           //TODO: maybe make configurable via an own property
-            boolean restrirctContextByChunks = 
textProcessingConfig.isIgnoreChunks();
-            int consumedIndex = state.getConsumedIndex();
-            if(token.inChunk != null && !textProcessingConfig.isIgnoreChunks() 
&&
-                    restrirctContextByChunks){
-                minIncludeIndex = token.inChunk.getStartTokenIndex();
-//                minIncludeIndex = Math.max(
-//                    state.getConsumedIndex()+1, 
-//                    token.inChunk.getStartTokenIndex());
-                maxIndcludeIndex = token.inChunk.getEndTokenIndex();
-            } else {
+//            if(token.inChunk != null && 
!textProcessingConfig.isIgnoreChunks()){
+//                minIncludeIndex = token.inChunk.getStartTokenIndex();
+//                maxIndcludeIndex = token.inChunk.getEndTokenIndex();
+//                log.debug("  - restrict context to chunk[{}, {}]",
+//                    minIncludeIndex, maxIndcludeIndex);
+//            } else {
                 maxIndcludeIndex = state.getTokens().size() - 1;
-//                minIncludeIndex = state.getConsumedIndex() + 1;
                 minIncludeIndex = 0;
-            }
+//            }
             int prevIndex = token.index;
             int pastIndex = token.index;
             int pastNonMatchable = 0;
@@ -766,12 +763,19 @@ public class EntityLinker {
             PlainLiteral label = labels.next();
             numLabels++;
             String lang = label.getLanguage() != null ? 
label.getLanguage().toString() : null;
+            String text = label.getLexicalForm();
+            //if case-insensitive matching ... compare lower case versions
+            if(!linkerConfig.isCaseSensitiveMatching()){
+                text = text.toLowerCase(Locale.ROOT);
+            }
             if((lang == null && curLang == null) ||
                     (lang != null && curLang != null && 
lang.equalsIgnoreCase(curLang))){
-                if(!matchedLabels.contains(label.getLexicalForm())){
+                if(!matchedLabels.contains(text)){
                     matchLabel(searchTokens, match, label);
-                    matchedLabels.add(label.getLexicalForm());
+                    matchedLabels.add(text);
                     matchedLangLabel = true;
+                } else if(!matchedLangLabel){
+                    matchedLangLabel = true; //found a equivalent label in the 
matchlang
                 }
             } else if((lang == null && mainLang == null) ||
                     (lang != null && mainLang != null && 
lang.equalsIgnoreCase(mainLang))){
@@ -1043,6 +1047,43 @@ public class EntityLinker {
             final LabelMatch labelMatch;
             int coveredTokens = lastFoundIndex-firstFoundIndex+1;
             int coveredProcessableTokens = 
lastProcessableFoundIndex-firstProcessableFoundIndex+1;
+            //check if we lookup Entities within a processable chunk
+            final float chunkMatchScore;
+            if(!textProcessingConfig.isIgnoreChunks() &&
+                    state.getToken().inChunk != null &&  //there is a chunk
+                    state.getToken().inChunk.isProcessable){ //the chunk is 
processable
+                ChunkData cd = state.getToken().inChunk;
+                List<TokenData> tokens = state.getTokens();
+                if(log.isTraceEnabled()){
+                    log.trace("  ... checking match with chunk {}: {}", 
+                        cd.chunk, cd.chunk.getSpan());
+                }
+                int cstart = cd.getMatchableStart() >= 0 ? 
cd.getMatchableStart() :
+                    firstProcessableFoundIndex;
+                int cend = cd.getMatchableEndChar();
+                //if the match does not cover the whole chunk
+                if(cstart < firstProcessableFoundIndex || cend > 
lastProcessableFoundIndex){ 
+                    int foundInChunk = 0;
+                    int numInChunk = 0;
+                    for(int i = cd.matchableStart; i <= cd.matchableEnd ; i++){
+                        TokenData td = tokens.get(i);
+                        if(td.isMatchable){
+                            numInChunk++;
+                            if(i >= firstProcessableFoundIndex &&
+                                    i <= lastProcessableFoundIndex){
+                                foundInChunk++;
+                            }
+                        }
+                    }
+                    chunkMatchScore = (float) foundInChunk / (float) 
numInChunk;
+                    log.trace("  ... label matches {} of {} matchable token in 
Chunk", 
+                        foundInChunk, numInChunk);
+                } else { //matches the whole chunk
+                    chunkMatchScore = 1f;
+                }
+            } else { //no chunk (or ignoreChuncks == true) .. set 
chunkMatchScore to 1f
+                chunkMatchScore = 1f;
+            }
             //matched tokens only within the span of the first/last 
processable token
             //Matching rules
             // - if less than config#minTokenFound() than accept only EXACT
@@ -1050,10 +1091,12 @@ public class EntityLinker {
             //   foundTokens of the PARTIAL match is > than of the FULL/EXACT
             //   match (this will be very rare
             String currentText = 
state.getTokenText(firstFoundIndex,coveredTokens);
-            if(linkerConfig.isCaseSensitiveMatching() ? 
currentText.equals(text) : currentText.equalsIgnoreCase(text)){ 
+            if(chunkMatchScore == 1f && //the whole chunk matches
+                    (linkerConfig.isCaseSensitiveMatching() ? 
currentText.equals(text) : currentText.equalsIgnoreCase(text))){ 
                 labelMatch = new LabelMatch(firstFoundIndex, coveredTokens, 
label);
-            } else {
-                int coveredLabelTokens = 
matchedLabelTokens.lastKey().intValue()-matchedLabelTokens.firstKey().intValue()+1;
+            } else if(chunkMatchScore >= linkerConfig.getMinChunkMatchScore()){
+                int coveredLabelTokens = 
matchedLabelTokens.lastKey().intValue() -
+                        matchedLabelTokens.firstKey().intValue() + 1;
                 if(foundTokens == labelTokens.length && foundTokens == 
coveredTokens){
                     //if all token matched set found to covered: May be lower 
because only
                     //processable tokens are counted, but FULL also checks
@@ -1064,10 +1107,30 @@ public class EntityLinker {
                 labelMatch = new LabelMatch(firstProcessableFoundIndex, 
coveredProcessableTokens, 
                     
foundProcessableTokens,foundTokensWithinCoveredProcessableTokens,
                     
foundTokenMatch/(float)foundTokens,label,labelTokens.length, 
coveredLabelTokens);
+            } else {
+                if(log.isTraceEnabled()){ //trace level logging for 
STANBOL-1211
+                    List<TokenData> tokens = state.getTokens();
+                    int start = 
tokens.get(firstProcessableFoundIndex).token.getStart();
+                    int end = 
tokens.get(lastProcessableFoundIndex).token.getEnd();
+                    CharSequence content = 
state.getToken().token.getContext().getText();
+                    CharSequence match = content.subSequence(start, end);
+                    ChunkData cd = state.getToken().inChunk;
+                    int cStart = 
tokens.get(cd.matchableStart).token.getStart();
+                    int cEnd = tokens.get(cd.matchableEnd).token.getEnd();
+                    CharSequence context = content.subSequence(cStart, cEnd);
+                    log.trace(" - filter match '{}'@[{},{}] because it does 
only match "
+                            + "{}% (min: {}%) of the matchable Tokens in Chunk 
'{}'@[{},{}]",
+                            new Object[]{match, start, end, 
Math.round(chunkMatchScore*100),
+                                    
Math.round(linkerConfig.getMinChunkMatchScore()*100),
+                                    context, cStart, cEnd});
+                }
+                labelMatch = null;
             }
-            if(labelMatch.getLabelScore() >= linkerConfig.getMinLabelScore() 
&& 
+            if(labelMatch != null &&
+                    labelMatch.getLabelScore() >= 
linkerConfig.getMinLabelScore() && 
                     labelMatch.getTextScore() >= 
linkerConfig.getMinTextScore() && 
                     labelMatch.getMatchScore() >= 
linkerConfig.getMinMatchScore()){
+                log.trace(" + add suggestion {}", labelMatch);
                 suggestion.addLabelMatch(labelMatch);
             }
         } //else NO tokens found -> nothing to do

Modified: 
stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/SectionData.java
URL: 
http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/SectionData.java?rev=1543431&r1=1543430&r2=1543431&view=diff
==============================================================================
--- 
stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/SectionData.java
 (original)
+++ 
stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/SectionData.java
 Tue Nov 19 13:51:46 2013
@@ -162,6 +162,17 @@ public class SectionData {
                     } else if(tokenData.isMatchable){
                         activeChunk.matchableCount++;
                     }
+                    if(tokenData.isMatchable){ //for matchable tokens
+                        //update the matchable span within the active chunk
+                        if(activeChunk.matchableStart < 0){
+                            activeChunk.matchableStart = tokenData.index;
+                            activeChunk.matchableStartCharIndex = 
tokenData.token.getStart();
+                        }
+                        if(activeChunk.matchableStart >= 0){ //if start is set 
also set end
+                            activeChunk.matchableEnd = tokenData.index;
+                            activeChunk.matchableEndCharIndex = 
tokenData.token.getEnd();
+                        }
+                    }
                     if (span.getEnd() >= activeChunk.getEndChar()){
                         //this is the last token in the current chunk
                         activeChunk.endToken = tokens.size()-1;

Modified: 
stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/test/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngineTest.java
URL: 
http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/test/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngineTest.java?rev=1543431&r1=1543430&r2=1543431&view=diff
==============================================================================
--- 
stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/test/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngineTest.java
 (original)
+++ 
stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/test/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngineTest.java
 Tue Nov 19 13:51:46 2013
@@ -295,6 +295,7 @@ public class EntityLinkingEngineTest {
         LanguageProcessingConfig tpc = new LanguageProcessingConfig();
         
tpc.setLinkedLexicalCategories(LanguageProcessingConfig.DEFAULT_LINKED_LEXICAL_CATEGORIES);
         tpc.setLinkedPos(Collections.EMPTY_SET);
+        tpc.setIgnoreChunksState(true); //to emulate pre STANBOL-1211
         EntityLinkerConfig config = new EntityLinkerConfig();
         config.setMinFoundTokens(2);//this is assumed by this test
         config.setRedirectProcessingMode(RedirectProcessingMode.FOLLOW);

Modified: 
stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java
URL: 
http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java?rev=1543431&r1=1543430&r2=1543431&view=diff
==============================================================================
--- 
stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java
 (original)
+++ 
stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java
 Tue Nov 19 13:51:46 2013
@@ -269,9 +269,14 @@ public class FstLinkingEngine implements
                         double length = Math.max(alength, 
matchLabel.getLexicalForm().length());
                         match.setMatch(1d - 
((double)distance/length),matchLabel);
                     }
-                    log.trace(" ... add suggestion: label: '{}'; conf: {}", 
+                    if(match.getScore() >= elConfig.getMinMatchScore()){
+                        log.trace(" ... add suggestion: label: '{}'; conf: 
{}", 
                             matchLabel, match.getScore());
-                    suggestions.add(match);
+                        suggestions.add(match);
+                    } else {
+                        log.trace(" ... filtered because match score < {}", 
+                            elConfig.getMinMatchScore());
+                    }
                 } else { //the type of the current Entity is blacklisted
                     log.trace("  ... filtered because of entity types");
                 }
@@ -356,7 +361,8 @@ public class FstLinkingEngine implements
         TokenStream baseTokenStream = 
corpus.getTaggingAnalyzer().tokenStream("", 
             new CharSequenceReader(at.getText()));
         LinkableTokenFilter linkableTokenFilter = new 
LinkableTokenFilter(baseTokenStream, 
-            at, session.getLanguage(), 
tpConfig.getConfiguration(session.getLanguage()));
+            at, session.getLanguage(), 
tpConfig.getConfiguration(session.getLanguage()),
+            elConfig.getMinChunkMatchScore());
         //we use two TagClusterReducer implementations.
         // (1) the linkableTokenFilter filters all tags that do not overlap any
         //     linkable Token

Modified: 
stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java
URL: 
http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java?rev=1543431&r1=1543430&r2=1543431&view=diff
==============================================================================
--- 
stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java
 (original)
+++ 
stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java
 Tue Nov 19 13:51:46 2013
@@ -55,6 +55,7 @@ import org.apache.felix.scr.annotations.
 import org.apache.felix.scr.annotations.Property;
 import org.apache.felix.scr.annotations.PropertyOption;
 import org.apache.felix.scr.annotations.Reference;
+import org.apache.felix.scr.annotations.ReferenceCardinality;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.index.AtomicReader;
 import org.apache.lucene.index.FieldInfo;
@@ -222,7 +223,7 @@ public class FstLinkingEngineComponent {
     /**
      * used to resolve '{prefix}:{local-name}' used within the engines 
configuration
      */
-    @Reference
+    @Reference(cardinality=ReferenceCardinality.OPTIONAL_UNARY)
     protected NamespacePrefixService prefixService;    
 
     /**

Modified: 
stanbol/branches/release-0.12/integration-tests/src/test/java/org/apache/stanbol/enhancer/it/FstLinkingTest.java
URL: 
http://svn.apache.org/viewvc/stanbol/branches/release-0.12/integration-tests/src/test/java/org/apache/stanbol/enhancer/it/FstLinkingTest.java?rev=1543431&r1=1543430&r2=1543431&view=diff
==============================================================================
--- 
stanbol/branches/release-0.12/integration-tests/src/test/java/org/apache/stanbol/enhancer/it/FstLinkingTest.java
 (original)
+++ 
stanbol/branches/release-0.12/integration-tests/src/test/java/org/apache/stanbol/enhancer/it/FstLinkingTest.java
 Tue Nov 19 13:51:46 2013
@@ -20,12 +20,14 @@ import org.junit.Test;
 
 public class FstLinkingTest extends EnhancerTestBase {
 
-    
+    //NOTE: adapted text as part of STANBOL-1211 to avoid a single noun phrase 
+    //"SPD candidate Peer Steinbrueck" avoiding the linking of SPD in this
+    //Text.
     public static final String TEST_TEXT = "There has been a worried response 
in "
             + "Greece to the Sunday's election in Germany. The win of 
Chancellor "
             + "Angela Merkel means that there will not be a radical change in "
-            + "European policy. Greeks would have preferred SPD candidate Peer 
"
-            + "Steinbrueck, whose party lost Sunday.";
+            + "European policy. Greeks would have preferred Peer Steinbrueck 
the"
+            + "candidate of the SPD, whose party lost Sunday.";
     
     /**
      * 
@@ -54,17 +56,20 @@ public class FstLinkingTest extends Enha
                 //and the entityLinkingEngine
                 "http://purl.org/dc/terms/creator.*FstLinkingEngine",
                 //needs to suggest the following Entities
-                
"http://fise.iks-project.eu/ontology/entity-reference.*http://dbpedia.org/resource/Chancellor",
                 
"http://fise.iks-project.eu/ontology/entity-reference.*http://dbpedia.org/resource/Angela_Merkel",
                 
"http://fise.iks-project.eu/ontology/entity-reference.*http://dbpedia.org/resource/Greece",
                 
"http://fise.iks-project.eu/ontology/entity-reference.*http://dbpedia.org/resource/Germany",
                 
"http://fise.iks-project.eu/ontology/entity-reference.*http://dbpedia.org/resource/Social_Democratic_Party_of_Germany",
                 //for the following sections within the text
-                
"http://fise.iks-project.eu/ontology/selected-text.*Chancellor",
                 "http://fise.iks-project.eu/ontology/selected-text.*Angela 
Merkel",
                 "http://fise.iks-project.eu/ontology/selected-text.*Greece",
                 "http://fise.iks-project.eu/ontology/selected-text.*Germany",
-                "http://fise.iks-project.eu/ontology/selected-text.*SPD");
+                "http://fise.iks-project.eu/ontology/selected-text.*SPD")
+         //with STANBOL-1211 Chancellor MUST NOT be found as "Chancellor" does 
not
+         //select more as 50% of the tokens of the chunk "Chancellor Angela 
Merkel"
+         .assertContentRegexp(false, 
+                 
"http://fise.iks-project.eu/ontology/entity-reference.*http://dbpedia.org/resource/Chancellor",
+                 
"http://fise.iks-project.eu/ontology/selected-text.*Chancellor");
     }
 
     


Reply via email to