enhancer...

rwesten Tue, 19 Nov 2013 04:45:49 -0800

Author: rwesten
Date: Tue Nov 19 12:44:40 2013
New Revision: 1543405

URL: http://svn.apache.org/r1543405
Log:
STANBOL-1211: fixing an issue with the FST linking engine implementation; 
adding opennlp-chunker to the dbpedia-fst linking chain configuration; adapting 
integration test to include the new feature


Modified:
    
stanbol/trunk/data/defaultconfig/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-dbpedia_fst.config
    
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java
    
stanbol/trunk/integration-tests/src/test/java/org/apache/stanbol/enhancer/it/FstLinkingTest.java

Modified: 
stanbol/trunk/data/defaultconfig/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-dbpedia_fst.config
URL: 
http://svn.apache.org/viewvc/stanbol/trunk/data/defaultconfig/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-dbpedia_fst.config?rev=1543405&r1=1543404&r2=1543405&view=diff
==============================================================================
--- 
stanbol/trunk/data/defaultconfig/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-dbpedia_fst.config
 (original)
+++ 
stanbol/trunk/data/defaultconfig/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-dbpedia_fst.config
 Tue Nov 19 12:44:40 2013
@@ -1,3 +1,3 @@
 stanbol.enhancer.chain.name="dbpedia-fst-linking"
-stanbol.enhancer.chain.weighted.chain=["tika;optional","langdetect","opennlp-sentence","opennlp-token","opennlp-pos","dbpedia-fst"]
+stanbol.enhancer.chain.weighted.chain=["tika;optional","langdetect","opennlp-sentence","opennlp-token","opennlp-pos","opennlp-chunker","dbpedia-fst"]
 service.ranking=I"0"
\ No newline at end of file

Modified: 
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java
URL: 
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java?rev=1543405&r1=1543404&r2=1543405&view=diff
==============================================================================
--- 
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java
 (original)
+++ 
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java
 Tue Nov 19 12:44:40 2013
@@ -155,7 +155,7 @@ public final class LinkableTokenFilter e
      * the {@link #reduce(TagLL[])} method to check if {@link TagLL tags} 
      * do overlap with any linkable token.
      */
-    private final List<TokenData> linkableTokens = new LinkedList<TokenData>();
+    private final List<LinkableTokenContext> linkableTokens = new 
LinkedList<LinkableTokenContext>();
     /**
      * The minimum score a tag needs to match processable tokens within a
      * {@link Chunk} so that is is not omitted. 
@@ -328,11 +328,12 @@ public final class LinkableTokenFilter e
         tokens.add(token);
         if(token.isLinkable){
             //add to the list of linkable for #reduce(TagLL[])
-            linkableTokens.add(token);
-        } else if(token.isMatchable && !lpc.isIgnoreChunks() //matchable token
-                && token.inChunk != null && //in chunks with two ore more
+            linkableTokens.add(new 
LinkableTokenContext(token,sectionData.getTokens()));
+        } else if(token.isMatchable && !lpc.isIgnoreChunks() &&//matchable 
token
+                token.inChunk != null && //in processable chunks with more
+                token.inChunk.isProcessable && //as two matchable tokens
                 token.inChunk.getMatchableCount() > 1){ //matchable tokens
-            linkableTokens.add(token);
+            linkableTokens.add(new LinkableTokenContext(token, 
sectionData.getTokens()));
         }
     }
     /**
@@ -342,19 +343,18 @@ public final class LinkableTokenFilter e
     private TokenData getToken(){
         return tokens.isEmpty() ? null : tokens.get(tokensCursor);
     }
-
     @Override
     public void reduce(TagLL[] head) {
-        TokenData linkableToken;
+        LinkableTokenContext linkableTokenContext;
         for(TagLL tag = head[0]; tag != null; tag = tag.getNextTag()) {
             int start = tag.getStartOffset();
             int end = tag.getEndOffset();
-            linkableToken = linkableTokens.isEmpty() ? null : 
linkableTokens.get(0);
-            while(linkableToken != null && linkableToken.token.getEnd() <= 
start){
+            linkableTokenContext = linkableTokens.isEmpty() ? null : 
linkableTokens.get(0);
+            while(linkableTokenContext != null && 
linkableTokenContext.linkableToken.token.getEnd() <= start){
                 linkableTokens.remove(0);
-                linkableToken = linkableTokens.isEmpty() ? null : 
linkableTokens.get(0);
+                linkableTokenContext = linkableTokens.isEmpty() ? null : 
linkableTokens.get(0);
             }
-            if(linkableToken == null || linkableToken.token.getStart() >= end){
+            if(linkableTokenContext == null || 
linkableTokenContext.linkableToken.token.getStart() >= end){
                 //does not overlap any linkable token
                 tag.removeLL(); //remove the tag from the cluster
                 if(log.isTraceEnabled()){
@@ -362,6 +362,8 @@ public final class LinkableTokenFilter e
                     log.trace(" > reduce tag {}", tagSequence);
                 }
             } else { //if the tag overlaps a linkable token 
+                TokenData linkableToken = linkableTokenContext.linkableToken;
+                List<TokenData> tokens = linkableTokenContext.context;
                 ChunkData cd = linkableToken.inChunk; //check if it maches > 
50% of the chunk
                 if(!lpc.isIgnoreChunks() && cd != null &&
                         cd.isProcessable){
@@ -371,7 +373,6 @@ public final class LinkableTokenFilter e
                     if(cstart < start || cend > end){ //if the tag does not 
cover the whole chunk
                         int num = 0;
                         int match = 0;
-                        List<TokenData> tokens = sectionData.getTokens();
                         for(int i = cd.getMatchableStart(); i <= 
cd.getMatchableEnd(); i++){
                             TokenData td = tokens.get(i);
                             if(td.isMatchable){
@@ -386,34 +387,54 @@ public final class LinkableTokenFilter e
                         //tokens in the Chunk are matched!
                         if(((float)match/(float)num) < minChunkMatchScore){
                             tag.removeLL(); //ignore
-                            if(log.isDebugEnabled()){
+                            if(log.isTraceEnabled()){
                                 CharSequence text = at.getText();
-                                log.debug(" - reduce tag {}[{},{}] because it 
does only match "
+                                log.trace(" - reduce tag {}[{},{}] because it 
does only match "
                                     + "{} of {} of matchable Chunk {}[{},{}]", 
                                     new Object[]{text.subSequence(start, end), 
start, end, match,  
                                             num, text.subSequence(cstart, 
cend), cstart, cend});
                             }
-                        } else if(log.isDebugEnabled()){
+                        } else if(log.isTraceEnabled()){
                             CharSequence text = at.getText();
-                            log.debug(" + keep tag {}[{},{}] matching {} of {} 
"
+                            log.trace(" + keep tag {}[{},{}] matching {} of {} 
"
                                 + "matchable Tokens for matchable Chunk 
{}[{},{}]", 
                                 new Object[]{text.subSequence(start, end), 
start, end, match,
                                         num, text.subSequence(cstart, cend), 
cstart, cend});
                         }
-                    } else if(log.isDebugEnabled()){
+                    } else if(log.isTraceEnabled()){
                         CharSequence text = at.getText();
-                        log.debug(" + keep tag {}[{},{}] for matchable Chunk 
{}[{},{}]", 
+                        log.trace(" + keep tag {}[{},{}] for matchable Chunk 
{}[{},{}]", 
                             new Object[]{text.subSequence(start, end), start, 
end, 
                                  text.subSequence(cstart, cend), cstart, 
cend});
                     }
                 }
-                if(log.isDebugEnabled()){
+                if(log.isTraceEnabled()){
                     CharSequence tagSequence = at.getText().subSequence(start, 
end);
-                    log.debug(" + keep tag {}", tagSequence);
+                    log.trace(" + keep tag {}", tagSequence);
                 }
             }
         }
         
     }
+    /**
+     * Holds the context for a linkable {@link Token}s. This ensures that the
+     * list of Tokens of the current {@link Section} (typically a {@link 
Sentence}) 
+     * is still available even if the {@link LinkableTokenFilter#sectionData} 
does hold
+     * already tokens for the next section.<p>
+     * This is necessary as {@link LinkableTokenFilter#reduce(TagLL[])} can
+     * be called for the previous sentence in cases where a Tag cluster 
includes
+     * the last {@link Token} of a {@link Section}.
+     * @author Rupert Westenthaler
+     *
+     */
+    private static class LinkableTokenContext {
+        final TokenData linkableToken;
+        final List<TokenData> context;
+        
+        LinkableTokenContext(TokenData linkableToken, List<TokenData> context){
+            this.linkableToken = linkableToken;
+            this.context = context;
+        }
+    }
     
 }

Modified: 
stanbol/trunk/integration-tests/src/test/java/org/apache/stanbol/enhancer/it/FstLinkingTest.java
URL: 
http://svn.apache.org/viewvc/stanbol/trunk/integration-tests/src/test/java/org/apache/stanbol/enhancer/it/FstLinkingTest.java?rev=1543405&r1=1543404&r2=1543405&view=diff
==============================================================================
--- 
stanbol/trunk/integration-tests/src/test/java/org/apache/stanbol/enhancer/it/FstLinkingTest.java
 (original)
+++ 
stanbol/trunk/integration-tests/src/test/java/org/apache/stanbol/enhancer/it/FstLinkingTest.java
 Tue Nov 19 12:44:40 2013
@@ -20,12 +20,14 @@ import org.junit.Test;
 
 public class FstLinkingTest extends EnhancerTestBase {
 
-    
+    //NOTE: adapted text as part of STANBOL-1211 to avoid a single noun phrase 
+    //"SPD candidate Peer Steinbrueck" avoiding the linking of SPD in this
+    //Text.
     public static final String TEST_TEXT = "There has been a worried response 
in "
             + "Greece to the Sunday's election in Germany. The win of 
Chancellor "
             + "Angela Merkel means that there will not be a radical change in "
-            + "European policy. Greeks would have preferred SPD candidate Peer 
"
-            + "Steinbrueck, whose party lost Sunday.";
+            + "European policy. Greeks would have preferred Peer Steinbrueck 
the"
+            + "candidate of the SPD, whose party lost Sunday.";
     
     /**
      * 
@@ -54,17 +56,20 @@ public class FstLinkingTest extends Enha
                 //and the entityLinkingEngine
                 "http://purl.org/dc/terms/creator.*FstLinkingEngine",
                 //needs to suggest the following Entities
-                "http://fise.iks-project.eu/ontology/entity-reference.*http://dbpedia.org/resource/Chancellor",
                 "http://fise.iks-project.eu/ontology/entity-reference.*http://dbpedia.org/resource/Angela_Merkel",
                 "http://fise.iks-project.eu/ontology/entity-reference.*http://dbpedia.org/resource/Greece",
                 "http://fise.iks-project.eu/ontology/entity-reference.*http://dbpedia.org/resource/Germany",
                 "http://fise.iks-project.eu/ontology/entity-reference.*http://dbpedia.org/resource/Social_Democratic_Party_of_Germany",
                 //for the following sections within the text
-                "http://fise.iks-project.eu/ontology/selected-text.*Chancellor",
                 "http://fise.iks-project.eu/ontology/selected-text.*Angela Merkel",
                 "http://fise.iks-project.eu/ontology/selected-text.*Greece",
                 "http://fise.iks-project.eu/ontology/selected-text.*Germany",
-                "http://fise.iks-project.eu/ontology/selected-text.*SPD");
+                "http://fise.iks-project.eu/ontology/selected-text.*SPD")
+         //with STANBOL-1211 Chancellor MUST NOT be found as "Chancellor" does not
+         //select more as 50% of the tokens of the chunk "Chancellor Angela Merkel"
+         .assertContentRegexp(false, 
+                 "http://fise.iks-project.eu/ontology/entity-reference.*http://dbpedia.org/resource/Chancellor",
+                 "http://fise.iks-project.eu/ontology/selected-text.*Chancellor");
     }

svn commit: r1543405 - in /stanbol/trunk: data/defaultconfig/src/main/resources/config/ enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/ integration-tests/src/test/java/org/apache/stanbol/enhancer...

Reply via email to