Author: rwesten
Date: Tue Nov 19 12:44:40 2013
New Revision: 1543405
URL: http://svn.apache.org/r1543405
Log:
STANBOL-1211: fixing an issue with the FST linking engine implementation;
adding opennlp-chunker to the dbpedia-fst linking chain configuration; adapting
integration test to include the new feature
Modified:
stanbol/trunk/data/defaultconfig/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-dbpedia_fst.config
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java
stanbol/trunk/integration-tests/src/test/java/org/apache/stanbol/enhancer/it/FstLinkingTest.java
Modified:
stanbol/trunk/data/defaultconfig/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-dbpedia_fst.config
URL:
http://svn.apache.org/viewvc/stanbol/trunk/data/defaultconfig/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-dbpedia_fst.config?rev=1543405&r1=1543404&r2=1543405&view=diff
==============================================================================
---
stanbol/trunk/data/defaultconfig/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-dbpedia_fst.config
(original)
+++
stanbol/trunk/data/defaultconfig/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-dbpedia_fst.config
Tue Nov 19 12:44:40 2013
@@ -1,3 +1,3 @@
stanbol.enhancer.chain.name="dbpedia-fst-linking"
-stanbol.enhancer.chain.weighted.chain=["tika;optional","langdetect","opennlp-sentence","opennlp-token","opennlp-pos","dbpedia-fst"]
+stanbol.enhancer.chain.weighted.chain=["tika;optional","langdetect","opennlp-sentence","opennlp-token","opennlp-pos","opennlp-chunker","dbpedia-fst"]
service.ranking=I"0"
\ No newline at end of file
Modified:
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java?rev=1543405&r1=1543404&r2=1543405&view=diff
==============================================================================
---
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java
(original)
+++
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java
Tue Nov 19 12:44:40 2013
@@ -155,7 +155,7 @@ public final class LinkableTokenFilter e
* the {@link #reduce(TagLL[])} method to check if {@link TagLL tags}
* do overlap with any linkable token.
*/
- private final List<TokenData> linkableTokens = new LinkedList<TokenData>();
+ private final List<LinkableTokenContext> linkableTokens = new
LinkedList<LinkableTokenContext>();
/**
* The minimum score a tag needs to match processable tokens within a
* {@link Chunk} so that is is not omitted.
@@ -328,11 +328,12 @@ public final class LinkableTokenFilter e
tokens.add(token);
if(token.isLinkable){
//add to the list of linkable for #reduce(TagLL[])
- linkableTokens.add(token);
- } else if(token.isMatchable && !lpc.isIgnoreChunks() //matchable token
- && token.inChunk != null && //in chunks with two ore more
+ linkableTokens.add(new
LinkableTokenContext(token,sectionData.getTokens()));
+ } else if(token.isMatchable && !lpc.isIgnoreChunks() &&//matchable
token
+ token.inChunk != null && //in processable chunks with more
+ token.inChunk.isProcessable && //as two matchable tokens
token.inChunk.getMatchableCount() > 1){ //matchable tokens
- linkableTokens.add(token);
+ linkableTokens.add(new LinkableTokenContext(token,
sectionData.getTokens()));
}
}
/**
@@ -342,19 +343,18 @@ public final class LinkableTokenFilter e
private TokenData getToken(){
return tokens.isEmpty() ? null : tokens.get(tokensCursor);
}
-
@Override
public void reduce(TagLL[] head) {
- TokenData linkableToken;
+ LinkableTokenContext linkableTokenContext;
for(TagLL tag = head[0]; tag != null; tag = tag.getNextTag()) {
int start = tag.getStartOffset();
int end = tag.getEndOffset();
- linkableToken = linkableTokens.isEmpty() ? null :
linkableTokens.get(0);
- while(linkableToken != null && linkableToken.token.getEnd() <=
start){
+ linkableTokenContext = linkableTokens.isEmpty() ? null :
linkableTokens.get(0);
+ while(linkableTokenContext != null &&
linkableTokenContext.linkableToken.token.getEnd() <= start){
linkableTokens.remove(0);
- linkableToken = linkableTokens.isEmpty() ? null :
linkableTokens.get(0);
+ linkableTokenContext = linkableTokens.isEmpty() ? null :
linkableTokens.get(0);
}
- if(linkableToken == null || linkableToken.token.getStart() >= end){
+ if(linkableTokenContext == null ||
linkableTokenContext.linkableToken.token.getStart() >= end){
//does not overlap any linkable token
tag.removeLL(); //remove the tag from the cluster
if(log.isTraceEnabled()){
@@ -362,6 +362,8 @@ public final class LinkableTokenFilter e
log.trace(" > reduce tag {}", tagSequence);
}
} else { //if the tag overlaps a linkable token
+ TokenData linkableToken = linkableTokenContext.linkableToken;
+ List<TokenData> tokens = linkableTokenContext.context;
ChunkData cd = linkableToken.inChunk; //check if it maches >
50% of the chunk
if(!lpc.isIgnoreChunks() && cd != null &&
cd.isProcessable){
@@ -371,7 +373,6 @@ public final class LinkableTokenFilter e
if(cstart < start || cend > end){ //if the tag does not
cover the whole chunk
int num = 0;
int match = 0;
- List<TokenData> tokens = sectionData.getTokens();
for(int i = cd.getMatchableStart(); i <=
cd.getMatchableEnd(); i++){
TokenData td = tokens.get(i);
if(td.isMatchable){
@@ -386,34 +387,54 @@ public final class LinkableTokenFilter e
//tokens in the Chunk are matched!
if(((float)match/(float)num) < minChunkMatchScore){
tag.removeLL(); //ignore
- if(log.isDebugEnabled()){
+ if(log.isTraceEnabled()){
CharSequence text = at.getText();
- log.debug(" - reduce tag {}[{},{}] because it
does only match "
+ log.trace(" - reduce tag {}[{},{}] because it
does only match "
+ "{} of {} of matchable Chunk {}[{},{}]",
new Object[]{text.subSequence(start, end),
start, end, match,
num, text.subSequence(cstart,
cend), cstart, cend});
}
- } else if(log.isDebugEnabled()){
+ } else if(log.isTraceEnabled()){
CharSequence text = at.getText();
- log.debug(" + keep tag {}[{},{}] matching {} of {}
"
+ log.trace(" + keep tag {}[{},{}] matching {} of {}
"
+ "matchable Tokens for matchable Chunk
{}[{},{}]",
new Object[]{text.subSequence(start, end),
start, end, match,
num, text.subSequence(cstart, cend),
cstart, cend});
}
- } else if(log.isDebugEnabled()){
+ } else if(log.isTraceEnabled()){
CharSequence text = at.getText();
- log.debug(" + keep tag {}[{},{}] for matchable Chunk
{}[{},{}]",
+ log.trace(" + keep tag {}[{},{}] for matchable Chunk
{}[{},{}]",
new Object[]{text.subSequence(start, end), start,
end,
text.subSequence(cstart, cend), cstart,
cend});
}
}
- if(log.isDebugEnabled()){
+ if(log.isTraceEnabled()){
CharSequence tagSequence = at.getText().subSequence(start,
end);
- log.debug(" + keep tag {}", tagSequence);
+ log.trace(" + keep tag {}", tagSequence);
}
}
}
}
+ /**
+ * Holds the context for a linkable {@link Token}. This ensures that the
+ * list of Tokens of the current {@link Section} (typically a {@link Sentence})
+ * is still available even if the {@link LinkableTokenFilter#sectionData}
+ * already holds tokens for the next section.<p>
+ * This is necessary as {@link LinkableTokenFilter#reduce(TagLL[])} can
+ * be called for the previous sentence in cases where a Tag cluster includes
+ * the last {@link Token} of a {@link Section}.
+ * @author Rupert Westenthaler
+ *
+ */
+ private static class LinkableTokenContext {
+ final TokenData linkableToken;
+ final List<TokenData> context;
+
+ LinkableTokenContext(TokenData linkableToken, List<TokenData> context){
+ this.linkableToken = linkableToken;
+ this.context = context;
+ }
+ }
}
Modified:
stanbol/trunk/integration-tests/src/test/java/org/apache/stanbol/enhancer/it/FstLinkingTest.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/integration-tests/src/test/java/org/apache/stanbol/enhancer/it/FstLinkingTest.java?rev=1543405&r1=1543404&r2=1543405&view=diff
==============================================================================
---
stanbol/trunk/integration-tests/src/test/java/org/apache/stanbol/enhancer/it/FstLinkingTest.java
(original)
+++
stanbol/trunk/integration-tests/src/test/java/org/apache/stanbol/enhancer/it/FstLinkingTest.java
Tue Nov 19 12:44:40 2013
@@ -20,12 +20,14 @@ import org.junit.Test;
public class FstLinkingTest extends EnhancerTestBase {
-
+ //NOTE: adapted the text as part of STANBOL-1211 to avoid the single noun
+ //phrase "SPD candidate Peer Steinbrueck", which would prevent the linking
+ //of SPD in this text.
public static final String TEST_TEXT = "There has been a worried response
in "
+ "Greece to the Sunday's election in Germany. The win of
Chancellor "
+ "Angela Merkel means that there will not be a radical change in "
- + "European policy. Greeks would have preferred SPD candidate Peer
"
- + "Steinbrueck, whose party lost Sunday.";
+ + "European policy. Greeks would have preferred Peer Steinbrueck
the"
+ + "candidate of the SPD, whose party lost Sunday.";
/**
*
@@ -54,17 +56,20 @@ public class FstLinkingTest extends Enha
//and the entityLinkingEngine
"http://purl.org/dc/terms/creator.*FstLinkingEngine",
//needs to suggest the following Entities
-
"http://fise.iks-project.eu/ontology/entity-reference.*http://dbpedia.org/resource/Chancellor",
"http://fise.iks-project.eu/ontology/entity-reference.*http://dbpedia.org/resource/Angela_Merkel",
"http://fise.iks-project.eu/ontology/entity-reference.*http://dbpedia.org/resource/Greece",
"http://fise.iks-project.eu/ontology/entity-reference.*http://dbpedia.org/resource/Germany",
"http://fise.iks-project.eu/ontology/entity-reference.*http://dbpedia.org/resource/Social_Democratic_Party_of_Germany",
//for the following sections within the text
-
"http://fise.iks-project.eu/ontology/selected-text.*Chancellor",
"http://fise.iks-project.eu/ontology/selected-text.*Angela
Merkel",
"http://fise.iks-project.eu/ontology/selected-text.*Greece",
"http://fise.iks-project.eu/ontology/selected-text.*Germany",
- "http://fise.iks-project.eu/ontology/selected-text.*SPD");
+ "http://fise.iks-project.eu/ontology/selected-text.*SPD")
+ //with STANBOL-1211 "Chancellor" MUST NOT be found, as it does not select
+ //more than 50% of the tokens of the chunk "Chancellor Angela Merkel"
+ .assertContentRegexp(false,
+
"http://fise.iks-project.eu/ontology/entity-reference.*http://dbpedia.org/resource/Chancellor",
+
"http://fise.iks-project.eu/ontology/selected-text.*Chancellor");
}