Author: rwesten
Date: Fri Oct 7 20:18:41 2011
New Revision: 1180199
URL: http://svn.apache.org/viewvc?rev=1180199&view=rev
Log:
Further improvements related to keyword linking.
These changes should complete the initial version of the KeywordLinkingEngine
(function wise). Regarding STANBOL-303 (making EntityFetching -> EntityLinker
component) there is still some additional work (like a plug-able label
matching) to do.
Changes in detail:
Language Processing:
* Some adaptations to the configuration of POS tags
* The POSTypeChunker now (again) follows tokens with POS tags marked as follow
backwards. This is mainly to include Adjectives like "10th European Day of
Languages".
* Tokens as used by the TextAnalyzer now have a boolean property if they
contain at least a single Alpha-Numerical char. This makes it more performant
to filter tokens that represent punctuation and so on.
* Deactivated language specific Tokenizer for Danish and Swedish
KeywordExtractionEngine
* Processing state now also holds the last "consumed" item. This is intended to
allow backwards search for matching words until the last "consumed" word
(already linked with an entity).
* The label matching is now also able to search backwards for matching tokens.
(e.g. to correctly match the "10th {event name}" or the "European {role name}"
...
* Matching now ignores Tokens without any alpha-numerical char
* Matching now again counts non-processable tokens. This has advantages and
disadvantages. The best solution would be to exclude stop-words however
currently there are no stop word lists available.
other changes
* reactivate default values for the Persons, Organization and Place states for the
NamedEntityLinkingEngine so that in the Apache Felix Webconsole they are
correctly presented as boolean properties.
* added the "http://www.opengis.net/gml/" to the NamespaceEnum of the Entityhub
as this namespace is used by DBpedia
Modified:
incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java
incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTypeChunker.java
incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/TextAnalyzer.java
incubator/stanbol/trunk/data/opennlp/lang/da/download_models.xml
incubator/stanbol/trunk/data/opennlp/lang/sv/download_models.xml
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/NamedEntityTaggingEngine.java
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/ProcessingState.java
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinkerConfig.java
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/Suggestion.java
incubator/stanbol/trunk/entityhub/generic/servicesapi/src/main/java/org/apache/stanbol/entityhub/servicesapi/defaults/NamespaceEnum.java
Modified:
incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java?rev=1180199&r1=1180198&r2=1180199&view=diff
==============================================================================
---
incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java
(original)
+++
incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java
Fri Oct 7 20:18:41 2011
@@ -20,9 +20,12 @@ public enum PosTagsCollectionEnum {
/**
* Nouns related POS types for English based on the
* <a
href="http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html">
- * Penn Treebank</a> tag set
+ * Penn Treebank</a> tag set.
+ * <p>
+ * NOTE the "``" tag is also added as noun, because it can not be found in
+ * the official tag set and is sometimes used to tag nouns.
*/
- EN_NOUN("en",PosTypeCollectionType.NOUN,"NN","NNP","NNPS","NNS","FW","CD"),
+
EN_NOUN("en",PosTypeCollectionType.NOUN,"NN","NNP","NNPS","NNS","FW","CD","``"),
/**
* Verb related POS types for English based on the
* <a
href="http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html">
@@ -36,7 +39,7 @@ public enum PosTagsCollectionEnum {
* <a
href="http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html">
* Penn Treebank</a> tag set
*/
- EN_FOLLOW("en",PosTypeCollectionType.FOLLOW,"#","$","
","(",")",",",".",":","``","POS","IN"),
+ EN_FOLLOW("en",PosTypeCollectionType.FOLLOW,"#","$","
","(",")",",",".",":","POS","IN","JJ"),
/**
* Noun related POS types for German based on the
* <a
href="http://www.ims.uni-stuttgart.de/projekte/corplex/TagSets/stts-table.html">
@@ -57,7 +60,7 @@ public enum PosTagsCollectionEnum {
* <a
href="http://www.ims.uni-stuttgart.de/projekte/corplex/TagSets/stts-table.html">
* STTS Tag Set</a>
*/
- DE_FOLLOW("de",PosTypeCollectionType.FOLLOW,"$","$.","$("),
+ DE_FOLLOW("de",PosTypeCollectionType.FOLLOW,"$.","$,","$(","APPR"),
/**
* POS types representing Nouns for Danish based on the PAROLE Tagset as
* described by <a href="http://korpus.dsl.dk/paroledoc_en.pdf">this
paper</a>
Modified:
incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTypeChunker.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTypeChunker.java?rev=1180199&r1=1180198&r2=1180199&view=diff
==============================================================================
---
incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTypeChunker.java
(original)
+++
incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTypeChunker.java
Fri Oct 7 20:18:41 2011
@@ -161,15 +161,14 @@ public class PosTypeChunker {
* @return the chunks as spans over the parsed tokens
*/
public Span[] chunkAsSpans(String[] tokens, String[] tags) {
-// int consumed = -1;
+ int consumed = -1;
List<Span> chunks = new ArrayList<Span>();
for(int i=0;i<tokens.length;i++){
if(includePOS(null,tags[i])){
int start = i;
- //do not follow backwards!
-// while(start-1 > consumed && followPOS(tags[start-1])){
-// start--; //follow backwards until consumed
-// }
+ while(start-1 > consumed && followPOS(null,tags[start-1])){
+ start--; //follow backwards until consumed
+ }
int followEnd = i;
int end = i;
while(followEnd+1 < tokens.length &&
followPOS(null,tags[followEnd+1])){
@@ -199,15 +198,15 @@ public class PosTypeChunker {
// used by this one :(
// If someone has a better Idea feel free to change!
// Rupert Westenthaler (28.Sep.2011)
-// int consumed = -1;
+ int consumed = -1;
List<Span> chunks = new ArrayList<Span>();
for(int i=0;i<tokens.length;i++){
if(includePOS(props[i],tags[i])){
int start = i;
//do not follow backwards!
-// while(start-1 > consumed && followPOS(tags[start-1])){
-// start--; //follow backwards until consumed
-// }
+ while(start-1 > consumed &&
followPOS(props[start-1],tags[start-1])){
+ start--; //follow backwards until consumed
+ }
int followEnd = i;
int end = i;
while(followEnd+1 < tokens.length &&
followPOS(props[followEnd+1],tags[followEnd+1])){
Modified:
incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/TextAnalyzer.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/TextAnalyzer.java?rev=1180199&r1=1180198&r2=1180199&view=diff
==============================================================================
---
incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/TextAnalyzer.java
(original)
+++
incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/TextAnalyzer.java
Fri Oct 7 20:18:41 2011
@@ -474,6 +474,7 @@ public class TextAnalyzer {
protected String token;
protected final String[] posTags;
protected final double[] posProbabilities;
+ protected final boolean hasAlphaNumeric;
private Token(Span span,String token,String pos,double
posProbability){
this(span,token,new String[]{pos},new double[]
{posProbability});
@@ -493,6 +494,11 @@ public class TextAnalyzer {
} else {
this.posProbabilities = posProbabilities;
}
+ boolean foundAlphaNumericCahr = false;
+ for(int i = 0;!foundAlphaNumericCahr &&i<token.length();i++){
+ foundAlphaNumericCahr =
Character.isLetterOrDigit(token.charAt(i));
+ }
+ hasAlphaNumeric = foundAlphaNumericCahr;
}
public int getStart(){
@@ -541,6 +547,9 @@ public class TextAnalyzer {
}
return token;
}
+ public boolean hasAplhaNumericChar(){
+ return hasAlphaNumeric;
+ }
@Override
public String toString() {
return getText()+(posTags != null?
Modified: incubator/stanbol/trunk/data/opennlp/lang/da/download_models.xml
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/data/opennlp/lang/da/download_models.xml?rev=1180199&r1=1180198&r2=1180199&view=diff
==============================================================================
--- incubator/stanbol/trunk/data/opennlp/lang/da/download_models.xml (original)
+++ incubator/stanbol/trunk/data/opennlp/lang/da/download_models.xml Fri Oct 7
20:18:41 2011
@@ -22,9 +22,15 @@
</description>
<target name="download">
- <copy todir="${target.directory}" flatten="true">
+ <!-- ensure the delition of the no longer used Danish tokenizer -->
+ <delete dir="${target.directory}" includes="da-token.bin" />
+ <copy todir="${target.directory}" flatten="true">
<resources>
+ <!--
+ After some testing the decision was to use the simple tokenizer
+ for the Danish language
<url url="${model.url}/da-token.bin"/>
+ -->
<url url="${model.url}/da-sent.bin"/>
<url url="${model.url}/da-pos-perceptron.bin"/>
<!-- no Chunker for german
Modified: incubator/stanbol/trunk/data/opennlp/lang/sv/download_models.xml
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/data/opennlp/lang/sv/download_models.xml?rev=1180199&r1=1180198&r2=1180199&view=diff
==============================================================================
--- incubator/stanbol/trunk/data/opennlp/lang/sv/download_models.xml (original)
+++ incubator/stanbol/trunk/data/opennlp/lang/sv/download_models.xml Fri Oct 7
20:18:41 2011
@@ -24,14 +24,19 @@
"se-*" to "sv-*"
-->
<mapper type="merge" />
+ <!-- ensure the delition of the no longer used Swedish tokenizer -->
+ <delete dir="${target.directory}" includes="sv-token.bin" />
<target name="download">
+<!-- Based on some testing the decision was to use the SimpleTokenizer for
+ the Swidish language
<copy toDir="${target.directory}/">
<resources>
<url url="${model.url}/se-token.bin"/>
</resources>
<mergemapper to="sv-token.bin"/>
</copy>
+ -->
<copy toDir="${target.directory}/">
<resources>
<url url="${model.url}/se-sent.bin"/>
Modified:
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/NamedEntityTaggingEngine.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/NamedEntityTaggingEngine.java?rev=1180199&r1=1180198&r2=1180199&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/NamedEntityTaggingEngine.java
(original)
+++
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/NamedEntityTaggingEngine.java
Fri Oct 7 20:18:41 2011
@@ -91,19 +91,19 @@ public class NamedEntityTaggingEngine im
@Property//(value = "dbpedia")
public static final String REFERENCED_SITE_ID =
"org.apache.stanbol.enhancer.engines.entitytagging.referencedSiteId";
- @Property//(boolValue = true)
+ @Property(boolValue = false)
public static final String PERSON_STATE =
"org.apache.stanbol.enhancer.engines.entitytagging.personState";
@Property//(value = "dbp-ont:Person")
public static final String PERSON_TYPE =
"org.apache.stanbol.enhancer.engines.entitytagging.personType";
- @Property//(boolValue = true)
+ @Property(boolValue = false)
public static final String ORG_STATE =
"org.apache.stanbol.enhancer.engines.entitytagging.organisationState";
@Property//(value = "dbp-ont:Organisation")
public static final String ORG_TYPE =
"org.apache.stanbol.enhancer.engines.entitytagging.organisationType";
- @Property//(boolValue = true)
+ @Property(boolValue = false)
public static final String PLACE_STATE =
"org.apache.stanbol.enhancer.engines.entitytagging.placeState";
@Property//(value = "dbp-ont:Place")
Modified:
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/ProcessingState.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/ProcessingState.java?rev=1180199&r1=1180198&r2=1180199&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/ProcessingState.java
(original)
+++
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/ProcessingState.java
Fri Oct 7 20:18:41 2011
@@ -5,6 +5,7 @@ package org.apache.stanbol.enhancer.engi
import java.util.HashMap;
import java.util.Iterator;
+import java.util.LinkedHashMap;
import java.util.Map;
import org.apache.stanbol.commons.opennlp.TextAnalyzer.AnalysedText;
@@ -35,16 +36,26 @@ public class ProcessingState {
* The current {@link Chunk}
*/
private Chunk chunk;
+ private static final int MAX_TEXT_CACHE_SIZE = 32;
/**
- * This is a cache over the exact labels over the following 'n' tokens
- * relative {@link #tokenIndex}. It is cleared each time {@link #next()}
- * is called.
+ * This is a cache over the last {@link #MAX_TEXT_CACHE_SIZE} token texts
+ * requested by {@link #getTokenText(int, int)}
*/
- private Map<Integer,String> textCache = new HashMap<Integer,String>();
+ private Map<String,String> textCache = new LinkedHashMap<String,String>(
+ MAX_TEXT_CACHE_SIZE, 0.75f, true){
+ private static final long serialVersionUID = 1L;
+ protected boolean removeEldestEntry(Map.Entry<String,String> eldest) {
+ return size() > MAX_TEXT_CACHE_SIZE;
+ };
+ };
/**
* The position for the next token
*/
private int nextToken = -1;
+ /**
+ * The position of the last consumed position
+ */
+ private int consumedIndex = -1;
public ProcessingState(Iterator<AnalysedText> sentences){
this.sentences = sentences;
@@ -68,6 +79,13 @@ public class ProcessingState {
return tokenIndex;
}
/**
+ * Getter for the last consumed index
+ * @return the index of the last consumed token
+ */
+ public final int getConsumedIndex() {
+ return consumedIndex;
+ }
+ /**
* The currently active token
* @return the token
*/
@@ -103,21 +121,37 @@ public class ProcessingState {
public final int getNextToken() {
return nextToken;
}
- /**
- * Allows to manually set to position of the next token to process.
- * This can be used to skip some tokens within (e.g. if a Concept
- * matching multiple Tokens where found.<p>
- * The set token may be greater than the number of tokens in
- * {@link #sentence}. This will simple cause the next sentence to be
- * activated on the next call to {@link #next()}
- * @param pos the position of the next token to process.
- */
- public void setNextToken(int pos){
- if(pos > tokenIndex){
- this.nextToken = pos;
+// /**
+// * Allows to manually set to position of the next token to process.
+// * This can be used to skip some tokens within (e.g. if a Concept
+// * matching multiple Tokens where found.<p>
+// * The set token may be greater than the number of tokens in
+// * {@link #sentence}. This will simple cause the next sentence to be
+// * activated on the next call to {@link #next()}
+// * @param pos the position of the next token to process.
+// */
+// public void setNextToken(int pos){
+// if(pos > tokenIndex){
+// this.nextToken = pos;
+// } else {
+// throw new IllegalArgumentException("The nextTokenPos "+pos+
+// " MUST BE greater than the current "+tokenIndex);
+// }
+// }
+ /**
+ * The index of an consumed Token. The consumed index MUST BE equals or
+ * greater as {@link #getTokenIndex()}. If the consumed index is set to a
+ * value greater that {@link #getTokenIndex()} than consumed tokens are
+ * skipped on the next call to {@link #next()}
+ * @param pos the position of the last consumed token.
+ */
+ public void setConsumed(int pos){
+ if(pos >= tokenIndex){
+ this.consumedIndex = pos;
+ this.nextToken = pos+1;
} else {
- throw new IllegalArgumentException("The nextTokenPos "+pos+
- " MUST BE greater than the current "+tokenIndex);
+ throw new IllegalArgumentException("The lastConsumedPos "+pos+
+ " MUST BE equals or gerater than the current Pos "+tokenIndex);
}
}
/**
@@ -127,8 +161,6 @@ public class ProcessingState {
* <code>false</code> if there are no further elements to process.
*/
public boolean next() {
- //first clear caches for the current element
- textCache.clear();
//switch to the next token
if(nextToken > tokenIndex){
tokenIndex = nextToken;
@@ -145,6 +177,9 @@ public class ProcessingState {
if(chunk.getStart() > tokenIndex) { //skip tokens outside
chunks
tokenIndex = chunk.getStart();
}
+ if(chunk.getStart() > consumedIndex){
+ consumedIndex = chunk.getStart()-1;
+ }
hasNext = true;
} else { //no more valid chunks in this sentence
hasNext = initNextSentence();
@@ -172,6 +207,7 @@ public class ProcessingState {
* {@link #chunks}, {@link #chunk} and {@link #tokenIndex} to
<code>null</code>
*/
private boolean initNextSentence() {
+ textCache.clear();
sentence = null;
while(sentence == null && sentences.hasNext()){
sentence = sentences.next();
@@ -180,6 +216,7 @@ public class ProcessingState {
if(chunks.hasNext()){
chunk = chunks.next();
tokenIndex = chunk.getStart();
+ consumedIndex = tokenIndex-1;
nextToken = tokenIndex;
} else { //no chunks in this sentence
sentence = null; //skip this sentence
@@ -191,6 +228,7 @@ public class ProcessingState {
chunks = null;
chunk = null;
tokenIndex = 0;
+ consumedIndex = -1;
nextToken = 0;
}
}
@@ -213,12 +251,13 @@ public class ProcessingState {
* @return the text covered by the span start of {@link #token} to end of
* token at <code>{@link #tokenIndex}+tokenCount</code>.
*/
- public String getTokenText(int tokenCount){
- Integer pos = Integer.valueOf(tokenCount-1);
- String text = textCache.get(Integer.valueOf(tokenCount-1));
+ public String getTokenText(int start, int tokenCount){
+ String pos = start+","+tokenCount;
+ String text = textCache.get(pos);
if(text == null){
- text = sentence.getText().substring(token.getStart(),
- sentence.getTokens().get(tokenIndex+pos.intValue()).getEnd());
+ text = sentence.getText().substring(
+ sentence.getTokens().get(start).getStart(),
+ sentence.getTokens().get(start+tokenCount-1).getEnd());
textCache.put(pos, text);
}
return text;
Modified:
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java?rev=1180199&r1=1180198&r2=1180199&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java
(original)
+++
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java
Fri Oct 7 20:18:41 2011
@@ -7,15 +7,11 @@ import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
-import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
-import opennlp.tools.util.Span;
-
import org.apache.clerezza.rdf.core.UriRef;
-import org.apache.commons.lang.StringUtils;
import org.apache.stanbol.commons.opennlp.TextAnalyzer.AnalysedText.Token;
import
org.apache.stanbol.enhancer.engines.keywordextraction.impl.ProcessingState;
import
org.apache.stanbol.enhancer.engines.keywordextraction.linking.EntityLinkerConfig.RedirectProcessingMode;
@@ -123,7 +119,8 @@ public class EntityLinker {
//TODO: change this to a warning (like to have
exceptions during debugging)
throw new IllegalStateException(String.format(
"The match count for the top Ranked Suggestion for
%s changed after resorting based on Scores! (original: %s, currnet %s)",
-
state.getTokenText(bestMatchCount),oldBestRanked,suggestions));
+
state.getTokenText(suggestions.get(0).getStart(),bestMatchCount),
+ oldBestRanked,suggestions));
}
//remove all suggestions > config.maxSuggestions
if(suggestions.size() > config.getMaxSuggestions()){
@@ -136,9 +133,10 @@ public class EntityLinker {
processRedirects(suggestion);
}
}
+ int start = suggestions.get(0).getStart();
int span = suggestions.get(0).getSpan();
//Store the linking results
- String selectedText = state.getTokenText(span);
+ String selectedText = state.getTokenText(start,span);
//float score;
LinkedEntity linkedEntity =
linkedEntities.get(selectedText);
if(linkedEntity == null){
@@ -150,7 +148,7 @@ public class EntityLinker {
state.getSentence(), state.getTokenIndex(), span);
//set the next token to process to the next word after the
//currently found suggestion
- state.setNextToken(state.getTokenIndex()+span);
+ state.setConsumed(start+span-1);
}
} //else do not process this token
@@ -255,7 +253,7 @@ public class EntityLinker {
config.getNameField(),config.getSelectedFields(),
searchStrings,
state.getSentence().getLanguage(),config.getDefaultLanguage());
List<Suggestion> suggestions = new ArrayList<Suggestion>();
- for(Representation result : results){
+ for(Representation result : results){
Suggestion match = matchLabels(result);
if(match.getMatch() != MATCH.NONE){
suggestions.add(match);
@@ -335,89 +333,162 @@ public class EntityLinker {
*/
private void matchLabel(Suggestion match, Text label) {
String text = label.getText().toLowerCase();
- String[] labelTokens = content.tokenize(text);
+ //Tokenize the label and remove remove tokens without alpha numerical
chars
+ String[] unprocessedLabelTokens = content.tokenize(text);
+ int offset = 0;
+ for(int i=0;i<unprocessedLabelTokens.length;i++){
+ boolean hasAlpha = false;
+ for(int j=0;!hasAlpha && j<unprocessedLabelTokens[i].length();j++){
+ hasAlpha =
Character.isLetterOrDigit(unprocessedLabelTokens[i].charAt(j));
+ }
+ if(!hasAlpha){
+ offset++;
+ } else if(offset > 0){
+ unprocessedLabelTokens[i-offset] = unprocessedLabelTokens[i];
+ }
+ }
+ String[] labelTokens;
+ if(offset == 0){
+ labelTokens = unprocessedLabelTokens;
+ } else {
+ labelTokens = new String[unprocessedLabelTokens.length-offset];
+ System.arraycopy(unprocessedLabelTokens, 0, labelTokens, 0,
labelTokens.length);
+ }
Set<String> labelTokenSet = new HashSet<String>(
Arrays.asList(labelTokens));
+ int foundProcessableTokens = 0;
int foundTokens = 0;
float foundTokenMatch = 0;
//ensure the correct order of the tokens in the suggested entity
boolean search = true;
+ int firstFoundIndex = -1;
int lastFoundIndex = -1;
+ int firstFoundLabelIndex = -1;
int lastfoundLabelIndex = -1;
Token currentToken;
String currentTokenText;
int currentTokenLength;
int notFound = 0;
//search for matches within the correct order
- for(int currentIndex = state.getTokenIndex();currentIndex <
state.getSentence().getTokens().size() && search;currentIndex++){
+ for(int currentIndex = state.getTokenIndex();
+ currentIndex < state.getSentence().getTokens().size()
+ && search ;currentIndex++){
currentToken = state.getSentence().getTokens().get(currentIndex);
- currentTokenText = currentToken.getText().toLowerCase();
- currentTokenLength = currentTokenText.length();
- boolean isProcessable = isProcessableToken(currentToken);
- boolean found = false;
- float matchFactor = 0f;
- //iteration starts at the next token after the last matched one
- //so it is OK to skip tokens in the label, but not within the text
- for(int i = lastfoundLabelIndex+1;!found && i <
labelTokens.length;i ++){
- String labelTokenText = labelTokens[i];
+ if(currentToken.hasAplhaNumericChar()){
+ currentTokenText = currentToken.getText().toLowerCase();
+ currentTokenLength = currentTokenText.length();
+ boolean isProcessable = isProcessableToken(currentToken);
+ boolean found = false;
+ float matchFactor = 0f;
+ //iteration starts at the next token after the last matched one
+ //so it is OK to skip tokens in the label, but not within the
text
+ for(int i = lastfoundLabelIndex+1;!found && i <
labelTokens.length;i ++){
+ String labelTokenText = labelTokens[i];
+ int labelTokenLength = labelTokenText.length();
+ float maxLength = currentTokenLength > labelTokenLength ?
currentTokenLength : labelTokenLength;
+ float lengthDif = Math.abs(currentTokenLength -
labelTokenLength);
+ if((lengthDif/maxLength)<=0.3f){ //this prevents
unnecessary string comparison
+ int matchCount = compairTokens(currentTokenText,
labelTokenText);
+ if(matchCount/maxLength >= 0.7f){
+ lastfoundLabelIndex = i; //set the last found
index to the current position
+ found = true; //set found to true -> stops
iteration
+ matchFactor = matchCount/maxLength; //how good is
the match
+ //remove matched labels from the set to disable
them for
+ //a later random oder search
+ labelTokenSet.remove(labelTokenText);
+ }
+ }
+ }
+ if(!found){
+ //search for a match in the wrong order
+ //currently only exact matches (for testing)
+ if(found = labelTokenSet.remove(currentTokenText)){
+ matchFactor = 0.7f;
+ }
+ }
+ //int found =
text.indexOf(currentToken.getText().toLowerCase());
+ if(found){ //found
+ if(isProcessable){
+ foundProcessableTokens++; //only count processable
Tokens
+ }
+ foundTokens++;
+ foundTokenMatch = foundTokenMatch + matchFactor; //sum up
the matches
+ if(firstFoundIndex < 0){
+ firstFoundIndex = currentIndex;
+ firstFoundLabelIndex = lastfoundLabelIndex;
+ }
+ lastFoundIndex = currentIndex;
+ } else { //not found
+ notFound++;
+ if(isProcessable || notFound > maxNotFound){
+ //stop as soon as a token that needs to be processed is
+ //not found in the label or the maximum number of
tokens
+ //that are not processable are not found
+ search = false;
+ }
+ }
+ } // else token without alpha or numeric characters are not
processed
+ }
+ //search backwards for label tokens until firstFoundLabelIndex if there
+ //are unconsumed Tokens in the sentence before state.getTokenIndex
+ int currentIndex = state.getTokenIndex()-1;
+ int labelIndex = firstFoundLabelIndex-1;
+ notFound = 0;
+ search = true;
+ while(search && labelIndex >= 0 && currentIndex >
state.getConsumedIndex()){
+ String labelTokenText = labelTokens[labelIndex];
+ if(labelTokenSet.remove(labelTokenText)){ //still not matched
+ currentToken =
state.getSentence().getTokens().get(currentIndex);
+ currentTokenText = currentToken.getText().toLowerCase();
+ currentTokenLength = currentTokenText.length();
+ boolean found = false;
+ float matchFactor = 0f;
int labelTokenLength = labelTokenText.length();
float maxLength = currentTokenLength > labelTokenLength ?
currentTokenLength : labelTokenLength;
float lengthDif = Math.abs(currentTokenLength -
labelTokenLength);
if((lengthDif/maxLength)<=0.3f){ //this prevents unnecessary
string comparison
- int matchCount = compairTokens(currentTokenText,
labelTokens[i]);
+ int matchCount = compairTokens(currentTokenText,
labelTokenText);
if(matchCount/maxLength >= 0.7f){
- lastfoundLabelIndex = i; //set the last found index to
the current position
found = true; //set found to true -> stops iteration
matchFactor = matchCount/maxLength; //how good is the
match
- //remove matched labels from the set to disable them
for
- //a later random oder search
- labelTokenSet.remove(labelTokenText);
}
}
- }
- if(!found){
- //search for a match in the wrong order
- //currently only exact matches (for testing)
- if(found = labelTokenSet.remove(currentTokenText)){
- matchFactor = 0.7f;
- }
- }
- //int found = text.indexOf(currentToken.getText().toLowerCase());
- if(found){ //found
- if(isProcessable){
- foundTokens++; //only count processable Tokens
+ if(found){ //found
+ foundTokens++;
foundTokenMatch = foundTokenMatch + matchFactor; //sum up
the matches
- }
- lastFoundIndex = currentIndex;
- } else { //not found
- notFound++;
- if(isProcessable || notFound > maxNotFound){
- //stop as soon as a token that needs to be processed is
- //not found in the label or the maximum number of tokens
- //that are not processable are not found
- search = false;
+ firstFoundIndex = currentIndex;
+ currentIndex --;
+ } else {
+ notFound++;
+ if(notFound > maxNotFound){
+ //stop as soon as a token that needs to be processed is
+ //not found in the label or the maximum number of
tokens
+ //that are not processable are not found
+ search = false;
+ }
}
}
+ labelIndex--;
}
//Now we make a second round to search tokens that match in the wrong
order
//e.g. if given and family name of persons are switched
MATCH labelMatch;
- int coveredTokens = lastFoundIndex-state.getTokenIndex()+1;
+ int coveredTokens = lastFoundIndex-firstFoundIndex+1;
float labelMatchScore = (foundTokenMatch/(float)labelTokens.length);
//Matching rules
// - if less than config#minTokenFound() than accept only EXACT
// - override PARTIAL matches with FULL/EXACT matches only if
// foundTokens of the PARTIAL match is > than of the FULL/EXACT
// match (this will be very rare
- if(foundTokens > 0 && match.getMatchCount() <= foundTokens) {
- String currentText = state.getTokenText(coveredTokens);
- if(currentText.equalsIgnoreCase(label.getText())){
+ if(foundProcessableTokens > 0 && match.getMatchCount() <=
foundProcessableTokens) {
+ String currentText =
state.getTokenText(firstFoundIndex,coveredTokens);
+ if(currentText.equalsIgnoreCase(text)){
labelMatch = MATCH.EXACT;
//set found to covered: May be lower because only
//processable tokens are counted, but Exact also checks
//of non-processable!
foundTokens = coveredTokens;
- } else if(foundTokens >= config.getMinFoundTokens() &&
+ } else if(foundProcessableTokens >= config.getMinFoundTokens() &&
labelMatchScore >= 0.6f){
if(foundTokens == coveredTokens){
labelMatch = MATCH.FULL;
@@ -428,10 +499,10 @@ public class EntityLinker {
labelMatch = MATCH.NONE;
}
if(labelMatch != MATCH.NONE){
- if(match.getMatchCount() < foundTokens ||
- match.getMatchCount() < foundTokens &&
+ if(match.getMatchCount() < foundProcessableTokens ||
+ match.getMatchCount() == foundProcessableTokens &&
labelMatch.ordinal() > match.getMatch().ordinal()){
- match.updateMatch(labelMatch, coveredTokens, foundTokens,
+ match.updateMatch(labelMatch, firstFoundIndex,
coveredTokens, foundTokens,
foundTokenMatch/foundTokens,label,labelTokens.length);
} //else this match is not better as the existing one
} //else ignore labels with MATCH.NONE
Modified:
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinkerConfig.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinkerConfig.java?rev=1180199&r1=1180198&r2=1180199&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinkerConfig.java
(original)
+++
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinkerConfig.java
Fri Oct 7 20:18:41 2011
@@ -89,6 +89,7 @@ public class EntityLinkerConfig {
mappings.put(OntologicalClasses.DBPEDIA_PLACE.getUnicodeString(),
OntologicalClasses.DBPEDIA_PLACE);
mappings.put(NamespaceEnum.schema+"Place",
OntologicalClasses.DBPEDIA_PLACE);
+ mappings.put(NamespaceEnum.gml+"_Feature",
OntologicalClasses.DBPEDIA_PLACE);
mappings.put(OntologicalClasses.SKOS_CONCEPT.getUnicodeString(),
OntologicalClasses.SKOS_CONCEPT);
DEFAULT_ENTITY_TYPE_MAPPINGS = Collections.unmodifiableMap(mappings);
Modified:
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/Suggestion.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/Suggestion.java?rev=1180199&r1=1180198&r2=1180199&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/Suggestion.java
(original)
+++
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/Suggestion.java
Fri Oct 7 20:18:41 2011
@@ -23,6 +23,7 @@ import org.apache.stanbol.entityhub.serv
*/
public class Suggestion implements Comparable<Suggestion>{
private MATCH match = MATCH.NONE;
+ private int start = 0;
private int span = 0;
private int matchCount = 0;
private Text label;
@@ -69,6 +70,7 @@ public class Suggestion implements Compa
/**
* Updates this suggestion
* @param match the match type
+ * @param start the start position of this suggestion
* @param span the number of token this suggestion spans
* @param count the number of token that match with the suggestion within
the span
* @param matchScore the score of the match. MUST BE in the range between
@@ -78,7 +80,7 @@ public class Suggestion implements Compa
* @param label the label that matches the tokens
* @param labelTokenCount the number of tokens of the label
*/
- protected void updateMatch(MATCH match,int span,int count,float
matchScore,Text label,int labelTokenCount){
+ protected void updateMatch(MATCH match,int start, int span,int count,float
matchScore,Text label,int labelTokenCount){
this.match = match;
//check the validity of the parameters to avoid later errors that are
//then hard to debug
@@ -101,6 +103,7 @@ public class Suggestion implements Compa
}
}
}
+ this.start = start;
this.span = span;
this.label = label;
if(match == MATCH.EXACT){ //for exact matches the matchScore needs to
be
@@ -154,6 +157,13 @@ public class Suggestion implements Compa
return matchScore;
}
/**
+ * Getter for the start index of this Suggestion
+ * @return the start token index for this suggestion
+ */
+ public int getStart() {
+ return start;
+ }
+ /**
* Getter for the number of the token matched by this suggestion
* @return The number of the token matched by this suggestion
*/
@@ -186,7 +196,7 @@ public class Suggestion implements Compa
* @return the best match or {@link Suggestion#getMatchedLabel()} if non
is found
*/
public Text getBestLabel(String nameField, String language){
- Representation rep = getRepresentation();
+ Representation rep = getRepresentation();
// 1. check if the returned Entity has a label -> if not return
null
// add labels (set only a single label; use "en" if available!)
Text label = null;
Modified:
incubator/stanbol/trunk/entityhub/generic/servicesapi/src/main/java/org/apache/stanbol/entityhub/servicesapi/defaults/NamespaceEnum.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/entityhub/generic/servicesapi/src/main/java/org/apache/stanbol/entityhub/servicesapi/defaults/NamespaceEnum.java?rev=1180199&r1=1180198&r2=1180199&view=diff
==============================================================================
---
incubator/stanbol/trunk/entityhub/generic/servicesapi/src/main/java/org/apache/stanbol/entityhub/servicesapi/defaults/NamespaceEnum.java
(original)
+++
incubator/stanbol/trunk/entityhub/generic/servicesapi/src/main/java/org/apache/stanbol/entityhub/servicesapi/defaults/NamespaceEnum.java
Fri Oct 7 20:18:41 2011
@@ -55,6 +55,7 @@ public enum NamespaceEnum {
//Some well known Namespaces of Ontologies
geo("http://www.w3.org/2003/01/geo/wgs84_pos#"),
georss("http://www.georss.org/georss/"),
+ gml("http://www.opengis.net/gml/"),
dcElements("dc-elements","http://purl.org/dc/elements/1.1/"),
dcTerms("dc","http://purl.org/dc/terms/"), // Entityhub prefers DC-Terms,
therefore use the "dc" prefix for the terms name space
foaf("http://xmlns.com/foaf/0.1/"),