Author: rwesten
Date: Fri Oct 7 20:18:41 2011
New Revision: 1180199
URL: http://svn.apache.org/viewvc?rev=1180199&view=rev
Log:
Further improvements related to keyword linking.
These changes should complete the initial version of the KeywordLinkingEngine
(function wise). Regarding STANBOL-303 (making EntityFetching -> EntityLinker
component) there is still some additional work (like a plug-able label
matching) to do.
Changes in detail:
Language Processing:
* Some adaptations to the configuration of POS tags
* The POSTypeChunker now (again) follows tokens with POS tags marked as follow
backwards. This is mainly to include Adjectives like "10th European Day of
Languages".
* Tokens as used by the TextAnalyzer now have a boolean property if they
contain at least a single Alpha-Numerical char. This makes it more performant
to filter tokens that represent punctuation and so on.
* Deactivated language specific Tokenizer for Danish and Swedish
KeywordExtractionEngine
* Processing state now also holds the last "consumed" item. This is intended to
allow backwards search for matching words until the last "consumed" word
(already linked with an entity).
* The label matching is now also able to search backwards for matching tokens.
(e.g. to correctly match the "10th {event name}" or the "European {role name}"
...
* Matching now ignores Tokens without any alpha-numerical char
* Matching now again counts non-processable tokens. This has advantages and
disadvantages. The best solution would be to exclude stop-words however
currently there are no stop word lists available.
other changes
* reactivate default values for the Persons, Organization and Place states for the
NamedEntityLinkingEngine so that in the Apache Felix Webconsole they are
correctly presented as boolean properties.
* added the "http://www.opengis.net/gml/" to the NamespaceEnum of the Entityhub
as this namespace is used by DBpedia
Modified:
incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java
incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTypeChunker.java
incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/TextAnalyzer.java
incubator/stanbol/trunk/data/opennlp/lang/da/download_models.xml
incubator/stanbol/trunk/data/opennlp/lang/sv/download_models.xml
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/NamedEntityTaggingEngine.java
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/ProcessingState.java
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinkerConfig.java
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/Suggestion.java
incubator/stanbol/trunk/entityhub/generic/servicesapi/src/main/java/org/apache/stanbol/entityhub/servicesapi/defaults/NamespaceEnum.java
Modified:
incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java?rev=1180199&r1=1180198&r2=1180199&view=diff
==============================================================================
---
incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java
(original)
+++
incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTagsCollectionEnum.java
Fri Oct 7 20:18:41 2011
@@ -20,9 +20,12 @@ public enum PosTagsCollectionEnum {
/**
* Nouns related POS types for English based on the
* <a
href="http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html">
- * Penn Treebank</a> tag set
+ * Penn Treebank</a> tag set.
+ * <p>
+ * NOTE the "``" tag is also added as noun, because it can not be found in
+ * the official tag set and is sometimes used to tag nouns.
*/
- EN_NOUN("en",PosTypeCollectionType.NOUN,"NN","NNP","NNPS","NNS","FW","CD"),
+
EN_NOUN("en",PosTypeCollectionType.NOUN,"NN","NNP","NNPS","NNS","FW","CD","``"),
/**
* Verb related POS types for English based on the
* <a
href="http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html">
@@ -36,7 +39,7 @@ public enum PosTagsCollectionEnum {
* <a
href="http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html">
* Penn Treebank</a> tag set
*/
- EN_FOLLOW("en",PosTypeCollectionType.FOLLOW,"#","$","
","(",")",",",".",":","``","POS","IN"),
+ EN_FOLLOW("en",PosTypeCollectionType.FOLLOW,"#","$","
","(",")",",",".",":","POS","IN","JJ"),
/**
* Noun related POS types for German based on the
* <a
href="http://www.ims.uni-stuttgart.de/projekte/corplex/TagSets/stts-table.html">
@@ -57,7 +60,7 @@ public enum PosTagsCollectionEnum {
* <a
href="http://www.ims.uni-stuttgart.de/projekte/corplex/TagSets/stts-table.html">
* STTS Tag Set</a>
*/
- DE_FOLLOW("de",PosTypeCollectionType.FOLLOW,"$","$.","$("),
+ DE_FOLLOW("de",PosTypeCollectionType.FOLLOW,"$.","$,","$(","APPR"),
/**
* POS types representing Nouns for Danish based on the PAROLE Tagset as
* described by <a href="http://korpus.dsl.dk/paroledoc_en.pdf">this
paper</a>
Modified:
incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTypeChunker.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTypeChunker.java?rev=1180199&r1=1180198&r2=1180199&view=diff
==============================================================================
---
incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTypeChunker.java
(original)
+++
incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/PosTypeChunker.java
Fri Oct 7 20:18:41 2011
@@ -161,15 +161,14 @@ public class PosTypeChunker {
* @return the chunks as spans over the parsed tokens
*/
public Span[] chunkAsSpans(String[] tokens, String[] tags) {
-// int consumed = -1;
+ int consumed = -1;
List<Span> chunks = new ArrayList<Span>();
for(int i=0;i<tokens.length;i++){
if(includePOS(null,tags[i])){
int start = i;
- //do not follow backwards!
-// while(start-1 > consumed && followPOS(tags[start-1])){
-// start--; //follow backwards until consumed
-// }
+ while(start-1 > consumed && followPOS(null,tags[start-1])){
+ start--; //follow backwards until consumed
+ }
int followEnd = i;
int end = i;
while(followEnd+1 < tokens.length &&
followPOS(null,tags[followEnd+1])){
@@ -199,15 +198,15 @@ public class PosTypeChunker {
// used by this one :(
// If someone has a better Idea feel free to change!
// Rupert Westenthaler (28.Sep.2011)
-// int consumed = -1;
+ int consumed = -1;
List<Span> chunks = new ArrayList<Span>();
for(int i=0;i<tokens.length;i++){
if(includePOS(props[i],tags[i])){
int start = i;
//do not follow backwards!
-// while(start-1 > consumed && followPOS(tags[start-1])){
-// start--; //follow backwards until consumed
-// }
+ while(start-1 > consumed &&
followPOS(props[start-1],tags[start-1])){
+ start--; //follow backwards until consumed
+ }
int followEnd = i;
int end = i;
while(followEnd+1 < tokens.length &&
followPOS(props[followEnd+1],tags[followEnd+1])){
Modified:
incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/TextAnalyzer.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/TextAnalyzer.java?rev=1180199&r1=1180198&r2=1180199&view=diff
==============================================================================
---
incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/TextAnalyzer.java
(original)
+++
incubator/stanbol/trunk/commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/TextAnalyzer.java
Fri Oct 7 20:18:41 2011
@@ -474,6 +474,7 @@ public class TextAnalyzer {
protected String token;
protected final String[] posTags;
protected final double[] posProbabilities;
+ protected final boolean hasAlphaNumeric;
private Token(Span span,String token,String pos,double
posProbability){
this(span,token,new String[]{pos},new double[]
{posProbability});
@@ -493,6 +494,11 @@ public class TextAnalyzer {
} else {
this.posProbabilities = posProbabilities;
}
+ boolean foundAlphaNumericCahr = false;
+ for(int i = 0;!foundAlphaNumericCahr &&i<token.length();i++){
+ foundAlphaNumericCahr =
Character.isLetterOrDigit(token.charAt(i));
+ }
+ hasAlphaNumeric = foundAlphaNumericCahr;
}
public int getStart(){
@@ -541,6 +547,9 @@ public class TextAnalyzer {
}
return token;
}
+ public boolean hasAplhaNumericChar(){
+ return hasAlphaNumeric;
+ }
@Override
public String toString() {
return getText()+(posTags != null?
Modified: incubator/stanbol/trunk/data/opennlp/lang/da/download_models.xml
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/data/opennlp/lang/da/download_models.xml?rev=1180199&r1=1180198&r2=1180199&view=diff
==============================================================================
--- incubator/stanbol/trunk/data/opennlp/lang/da/download_models.xml (original)
+++ incubator/stanbol/trunk/data/opennlp/lang/da/download_models.xml Fri Oct 7
20:18:41 2011
@@ -22,9 +22,15 @@
</description>
<target name="download">
- <copy todir="${target.directory}" flatten="true">
+ <!-- ensure the delition of the no longer used Danish tokenizer -->
+ <delete dir="${target.directory}" includes="da-token.bin" />
+ <copy todir="${target.directory}" flatten="true">
<resources>
+ <!--
+ After some testing the decision was to use the simple tokenizer
+ for the Danish language
<url url="${model.url}/da-token.bin"/>
+ -->
<url url="${model.url}/da-sent.bin"/>
<url url="${model.url}/da-pos-perceptron.bin"/>
<!-- no Chunker for german
Modified: incubator/stanbol/trunk/data/opennlp/lang/sv/download_models.xml
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/data/opennlp/lang/sv/download_models.xml?rev=1180199&r1=1180198&r2=1180199&view=diff
==============================================================================
--- incubator/stanbol/trunk/data/opennlp/lang/sv/download_models.xml (original)
+++ incubator/stanbol/trunk/data/opennlp/lang/sv/download_models.xml Fri Oct 7
20:18:41 2011
@@ -24,14 +24,19 @@
"se-*" to "sv-*"
-->
<mapper type="merge" />
+ <!-- ensure the delition of the no longer used Swedish tokenizer -->
+ <delete dir="${target.directory}" includes="sv-token.bin" />
<target name="download">
+<!-- Based on some testing the decision was to use the SimpleTokenizer for
+ the Swidish language
<copy toDir="${target.directory}/">
<resources>
<url url="${model.url}/se-token.bin"/>
</resources>
<mergemapper to="sv-token.bin"/>
</copy>
+ -->
<copy toDir="${target.directory}/">
<resources>
<url url="${model.url}/se-sent.bin"/>
Modified:
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/NamedEntityTaggingEngine.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/NamedEntityTaggingEngine.java?rev=1180199&r1=1180198&r2=1180199&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/NamedEntityTaggingEngine.java
(original)
+++
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/NamedEntityTaggingEngine.java
Fri Oct 7 20:18:41 2011
@@ -91,19 +91,19 @@ public class NamedEntityTaggingEngine im
@Property//(value = "dbpedia")
public static final String REFERENCED_SITE_ID =
"org.apache.stanbol.enhancer.engines.entitytagging.referencedSiteId";
- @Property//(boolValue = true)
+ @Property(boolValue = false)
public static final String PERSON_STATE =
"org.apache.stanbol.enhancer.engines.entitytagging.personState";
@Property//(value = "dbp-ont:Person")
public static final String PERSON_TYPE =
"org.apache.stanbol.enhancer.engines.entitytagging.personType";
- @Property//(boolValue = true)
+ @Property(boolValue = false)
public static final String ORG_STATE =
"org.apache.stanbol.enhancer.engines.entitytagging.organisationState";
@Property//(value = "dbp-ont:Organisation")
public static final String ORG_TYPE =
"org.apache.stanbol.enhancer.engines.entitytagging.organisationType";
- @Property//(boolValue = true)
+ @Property(boolValue = false)
public static final String PLACE_STATE =
"org.apache.stanbol.enhancer.engines.entitytagging.placeState";
@Property//(value = "dbp-ont:Place")
Modified:
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/ProcessingState.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/ProcessingState.java?rev=1180199&r1=1180198&r2=1180199&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/ProcessingState.java
(original)
+++
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/ProcessingState.java
Fri Oct 7 20:18:41 2011
@@ -5,6 +5,7 @@ package org.apache.stanbol.enhancer.engi
import java.util.HashMap;
import java.util.Iterator;
+import java.util.LinkedHashMap;
import java.util.Map;
import org.apache.stanbol.commons.opennlp.TextAnalyzer.AnalysedText;
@@ -35,16 +36,26 @@ public class ProcessingState {
* The current {@link Chunk}
*/
private Chunk chunk;
+ private static final int MAX_TEXT_CACHE_SIZE = 32;
/**
- * This is a cache over the exact labels over the following 'n' tokens
- * relative {@link #tokenIndex}. It is cleared each time {@link #next()}
- * is called.
+ * This is a cache over the last {@link #MAX_TEXT_CACHE_SIZE} token texts
+ * requested by {@link #getTokenText(int, int)}
*/
- private Map<Integer,String> textCache = new HashMap<Integer,String>();
+ private Map<String,String> textCache = new LinkedHashMap<String,String>(
+ MAX_TEXT_CACHE_SIZE, 0.75f, true){
+ private static final long serialVersionUID = 1L;
+ protected boolean removeEldestEntry(Map.Entry<String,String> eldest) {
+ return size() > MAX_TEXT_CACHE_SIZE;
+ };
+ };
/**
* The position for the next token
*/
private int nextToken = -1;
+ /**
+ * The position of the last consumed position
+ */
+ private int consumedIndex = -1;
public ProcessingState(Iterator<AnalysedText> sentences){
this.sentences = sentences;
@@ -68,6 +79,13 @@ public class ProcessingState {
return tokenIndex;
}
/**
+ * Getter for the last consumed index
+ * @return the index of the last consumed token
+ */
+ public final int getConsumedIndex() {
+ return consumedIndex;
+ }
+ /**
* The currently active token
* @return the token
*/
@@ -103,21 +121,37 @@ public class ProcessingState {
public final int getNextToken() {
return nextToken;
}
- /**
- * Allows to manually set to position of the next token to process.
- * This can be used to skip some tokens within (e.g. if a Concept
- * matching multiple Tokens where found.<p>
- * The set token may be greater than the number of tokens in
- * {@link #sentence}. This will simple cause the next sentence to be
- * activated on the next call to {@link #next()}
- * @param pos the position of the next token to process.
- */
- public void setNextToken(int pos){
- if(pos > tokenIndex){
- this.nextToken = pos;
+// /**
+// * Allows to manually set to position of the next token to process.
+// * This can be used to skip some tokens within (e.g. if a Concept
+// * matching multiple Tokens where found.<p>
+// * The set token may be greater than the number of tokens in
+// * {@link #sentence}. This will simple cause the next sentence to be
+// * activated on the next call to {@link #next()}
+// * @param pos the position of the next token to process.
+// */
+// public void setNextToken(int pos){
+// if(pos > tokenIndex){
+// this.nextToken = pos;
+// } else {
+// throw new IllegalArgumentException("The nextTokenPos "+pos+
+// " MUST BE greater than the current "+tokenIndex);
+// }
+// }
+ /**
+ * The index of an consumed Token. The consumed index MUST BE equals or
+ * greater as {@link #getTokenIndex()}. If the consumed index is set to a
+ * value greater that {@link #getTokenIndex()} than consumed tokens are
+ * skipped on the next call to {@link #next()}
+ * @param pos the position of the last consumed token.
+ */
+ public void setConsumed(int pos){
+ if(pos >= tokenIndex){
+ this.consumedIndex = pos;
+ this.nextToken = pos+1;
} else {
- throw new IllegalArgumentException("The nextTokenPos "+pos+
- " MUST BE greater than the current "+tokenIndex);
+ throw new IllegalArgumentException("The lastConsumedPos "+pos+
+ " MUST BE equals or gerater than the current Pos "+tokenIndex);
}
}
/**
@@ -127,8 +161,6 @@ public class ProcessingState {
* <code>false</code> if there are no further elements to process.
*/
public boolean next() {
- //first clear caches for the current element
- textCache.clear();
//switch to the next token
if(nextToken > tokenIndex){
tokenIndex = nextToken;
@@ -145,6 +177,9 @@ public class ProcessingState {
if(chunk.getStart() > tokenIndex) { //skip tokens outside
chunks
tokenIndex = chunk.getStart();
}
+ if(chunk.getStart() > consumedIndex){
+ consumedIndex = chunk.getStart()-1;
+ }
hasNext = true;
} else { //no more valid chunks in this sentence
hasNext = initNextSentence();
@@ -172,6 +207,7 @@ public class ProcessingState {
* {@link #chunks}, {@link #chunk} and {@link #tokenIndex} to
<code>null</code>
*/
private boolean initNextSentence() {
+ textCache.clear();
sentence = null;
while(sentence == null && sentences.hasNext()){
sentence = sentences.next();
@@ -180,6 +216,7 @@ public class ProcessingState {
if(chunks.hasNext()){
chunk = chunks.next();
tokenIndex = chunk.getStart();
+ consumedIndex = tokenIndex-1;
nextToken = tokenIndex;
} else { //no chunks in this sentence
sentence = null; //skip this sentence
@@ -191,6 +228,7 @@ public class ProcessingState {
chunks = null;
chunk = null;
tokenIndex = 0;
+ consumedIndex = -1;
nextToken = 0;
}
}
@@ -213,12 +251,13 @@ public class ProcessingState {
* @return the text covered by the span start of {@link #token} to end of
* token at <code>{@link #tokenIndex}+tokenCount</code>.
*/
- public String getTokenText(int tokenCount){
- Integer pos = Integer.valueOf(tokenCount-1);
- String text = textCache.get(Integer.valueOf(tokenCount-1));
+ public String getTokenText(int start, int tokenCount){
+ String pos = start+","+tokenCount;
+ String text = textCache.get(pos);
if(text == null){
- text = sentence.getText().substring(token.getStart(),
- sentence.getTokens().get(tokenIndex+pos.intValue()).getEnd());
+ text = sentence.getText().substring(
+ sentence.getTokens().get(start).getStart(),
+ sentence.getTokens().get(start+tokenCount-1).getEnd());
textCache.put(pos, text);
}
return text;
Modified:
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java?rev=1180199&r1=1180198&r2=1180199&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java
(original)
+++
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java
Fri Oct 7 20:18:41 2011
@@ -7,15 +7,11 @@ import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
-import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
-import opennlp.tools.util.Span;
-
import org.apache.clerezza.rdf.core.UriRef;
-import org.apache.commons.lang.StringUtils;
import org.apache.stanbol.commons.opennlp.TextAnalyzer.AnalysedText.Token;
import
org.apache.stanbol.enhancer.engines.keywordextraction.impl.ProcessingState;
import
org.apache.stanbol.enhancer.engines.keywordextraction.linking.EntityLinkerConfig.RedirectProcessingMode;
@@ -123,7 +119,8 @@ public class EntityLinker {
//TODO: change this to a warning (like to have
exceptions during debugging)
throw new IllegalStateException(String.format(
"The match count for the top Ranked Suggestion for
%s changed after resorting based on Scores! (original: %s, currnet %s)",
-
state.getTokenText(bestMatchCount),oldBestRanked,suggestions));
+
state.getTokenText(suggestions.get(0).getStart(),bestMatchCount),
+ oldBestRanked,suggestions));
}
//remove all suggestions > config.maxSuggestions
if(suggestions.size() > config.getMaxSuggestions()){
@@ -136,9 +133,10 @@ public class EntityLinker {
processRedirects(suggestion);
}
}
+ int start = suggestions.get(0).getStart();
int span = suggestions.get(0).getSpan();
//Store the linking results
- String selectedText = state.getTokenText(span);
+ String selectedText = state.getTokenText(start,span);
//float score;
LinkedEntity linkedEntity =
linkedEntities.get(selectedText);
if(linkedEntity == null){
@@ -150,7 +148,7 @@ public class EntityLinker {
state.getSentence(), state.getTokenIndex(), span);
//set the next token to process to the next word after the
//currently found suggestion
- state.setNextToken(state.getTokenIndex()+span);
+ state.setConsumed(start+span-1);
}
} //else do not process this token
@@ -255,7 +253,7 @@ public class EntityLinker {
config.getNameField(),config.getSelectedFields(),
searchStrings,
state.getSentence().getLanguage(),config.getDefaultLanguage());
List<Suggestion> suggestions = new ArrayList<Suggestion>();
- for(Representation result : results){
+ for(Representation result : results){
Suggestion match = matchLabels(result);
if(match.getMatch() != MATCH.NONE){
suggestions.add(match);
@@ -335,89 +333,162 @@ public class EntityLinker {
*/
private void matchLabel(Suggestion match, Text label) {
String text = label.getText().toLowerCase();
- String[] labelTokens = content.tokenize(text);
+ //Tokenize the label and remove remove tokens without alpha numerical
chars
+ String[] unprocessedLabelTokens = content.tokenize(text);
+ int offset = 0;
+ for(int i=0;i<unprocessedLabelTokens.length;i++){
+ boolean hasAlpha = false;
+ for(int j=0;!hasAlpha && j<unprocessedLabelTokens[i].length();j++){
+ hasAlpha =
Character.isLetterOrDigit(unprocessedLabelTokens[i].charAt(j));
+ }
+ if(!hasAlpha){
+ offset++;
+ } else if(offset > 0){
+ unprocessedLabelTokens[i-offset] = unprocessedLabelTokens[i];
+ }
+ }
+ String[] labelTokens;
+ if(offset == 0){
+ labelTokens = unprocessedLabelTokens;
+ } else {
+ labelTokens = new String[unprocessedLabelTokens.length-offset];
+ System.arraycopy(unprocessedLabelTokens, 0, labelTokens, 0,
labelTokens.length);
+ }
Set<String> labelTokenSet = new HashSet<String>(
Arrays.asList(labelTokens));
+ int foundProcessableTokens = 0;
int foundTokens = 0;
float foundTokenMatch = 0;
//ensure the correct order of the tokens in the suggested entity
boolean search = true;
+ int firstFoundIndex = -1;
int lastFoundIndex = -1;
+ int firstFoundLabelIndex = -1;
int lastfoundLabelIndex = -1;
Token currentToken;
String currentTokenText;
int currentTokenLength;
int notFound = 0;
//search for matches within the correct order
- for(int currentIndex = state.getTokenIndex();currentIndex <
state.getSentence().getTokens().size() && search;currentIndex++){
+ for(int currentIndex = state.getTokenIndex();
+ currentIndex < state.getSentence().getTokens().size()
+ && search ;currentIndex++){
currentToken = state.getSentence().getTokens().get(currentIndex);
- currentTokenText = currentToken.getText().toLowerCase();
- currentTokenLength = currentTokenText.length();
- boolean isProcessable = isProcessableToken(currentToken);
- boolean found = false;
- float matchFactor = 0f;
- //iteration starts at the next token after the last matched one
- //so it is OK to skip tokens in the label, but not within the text
- for(int i = lastfoundLabelIndex+1;!found && i <
labelTokens.length;i ++){
- String labelTokenText = labelTokens[i];
+ if(currentToken.hasAplhaNumericChar()){
+ currentTokenText = currentToken.getText().toLowerCase();
+ currentTokenLength = currentTokenText.length();
+ boolean isProcessable = isProcessableToken(currentToken);
+ boolean found = false;
+ float matchFactor = 0f;
+ //iteration starts at the next token after the last matched one
+ //so it is OK to skip tokens in the label, but not within the
text
+ for(int i = lastfoundLabelIndex+1;!found && i <
labelTokens.length;i ++){
+ String labelTokenText = labelTokens[i];
+ int labelTokenLength = labelTokenText.length();
+ float maxLength = currentTokenLength > labelTokenLength ?
currentTokenLength : labelTokenLength;
+ float lengthDif = Math.abs(currentTokenLength -
labelTokenLength);
+ if((lengthDif/maxLength)<=0.3f){ //this prevents
unnecessary string comparison
+ int matchCount = compairTokens(currentTokenText,
labelTokenText);
+ if(matchCount/maxLength >= 0.7f){
+ lastfoundLabelIndex = i; //set the last found
index to the current position
+ found = true; //set found to true -> stops
iteration
+ matchFactor = matchCount/maxLength; //how good is
the match
+ //remove matched labels from the set to disable
them for
+ //a later random oder search
+ labelTokenSet.remove(labelTokenText);
+ }
+ }
+ }
+ if(!found){
+ //search for a match in the wrong order
+ //currently only exact matches (for testing)
+ if(found = labelTokenSet.remove(currentTokenText)){
+ matchFactor = 0.7f;
+ }
+ }
+ //int found =
text.indexOf(currentToken.getText().toLowerCase());
+ if(found){ //found
+ if(isProcessable){
+ foundProcessableTokens++; //only count processable
Tokens
+ }
+ foundTokens++;
+ foundTokenMatch = foundTokenMatch + matchFactor; //sum up
the matches
+ if(firstFoundIndex < 0){
+ firstFoundIndex = currentIndex;
+ firstFoundLabelIndex = lastfoundLabelIndex;
+ }
+ lastFoundIndex = currentIndex;
+ } else { //not found
+ notFound++;
+ if(isProcessable || notFound > maxNotFound){
+ //stop as soon as a token that needs to be processed is
+ //not found in the label or the maximum number of
tokens
+ //that are not processable are not found
+ search = false;
+ }
+ }
+ } // else token without alpha or numeric characters are not
processed
+ }
+ //search backwards for label tokens until firstFoundLabelIndex if there
+ //are unconsumed Tokens in the sentence before state.getTokenIndex
+ int currentIndex = state.getTokenIndex()-1;
+ int labelIndex = firstFoundLabelIndex-1;
+ notFound = 0;
+ search = true;
+ while(search && labelIndex >= 0 && currentIndex >
state.getConsumedIndex()){
+ String labelTokenText = labelTokens[labelIndex];
+ if(labelTokenSet.remove(labelTokenText)){ //still not matched
+ currentToken =
state.getSentence().getTokens().get(currentIndex);
+ currentTokenText = currentToken.getText().toLowerCase();
+ currentTokenLength = currentTokenText.length();
+ boolean found = false;
+ float matchFactor = 0f;
int labelTokenLength = labelTokenText.length();
float maxLength = currentTokenLength > labelTokenLength ?
currentTokenLength : labelTokenLength;
float lengthDif = Math.abs(currentTokenLength -
labelTokenLength);
if((lengthDif/maxLength)<=0.3f){ //this prevents unnecessary
string comparison
- int matchCount = compairTokens(currentTokenText,
labelTokens[i]);
+ int matchCount = compairTokens(currentTokenText,
labelTokenText);
if(matchCount/maxLength >= 0.7f){
- lastfoundLabelIndex = i; //set the last found index to
the current position
found = true; //set found to true -> stops iteration
matchFactor = matchCount/maxLength; //how good is the
match
- //remove matched labels from the set to disable them
for
- //a later random oder search
- labelTokenSet.remove(labelTokenText);
}
}
- }
- if(!found){
- //search for a match in the wrong order
- //currently only exact matches (for testing)
- if(found = labelTokenSet.remove(currentTokenText)){
- matchFactor = 0.7f;
- }
- }
- //int found = text.indexOf(currentToken.getText().toLowerCase());
- if(found){ //found
- if(isProcessable){
- foundTokens++; //only count processable Tokens
+ if(found){ //found
+ foundTokens++;
foundTokenMatch = foundTokenMatch + matchFactor; //sum up
the matches
- }
- lastFoundIndex = currentIndex;
- } else { //not found
- notFound++;
- if(isProcessable || notFound > maxNotFound){
- //stop as soon as a token that needs to be processed is
- //not found in the label or the maximum number of tokens
- //that are not processable are not found
- search = false;
+ firstFoundIndex = currentIndex;
+ currentIndex --;
+ } else {
+ notFound++;
+ if(notFound > maxNotFound){
+ //stop as soon as a token that needs to be processed is
+ //not found in the label or the maximum number of
tokens
+ //that are not processable are not found
+ search = false;
+ }
}
}
+ labelIndex--;
}
//Now we make a second round to search tokens that match in the wrong
order
//e.g. if given and family name of persons are switched
MATCH labelMatch;
- int coveredTokens = lastFoundIndex-state.getTokenIndex()+1;
+ int coveredTokens = lastFoundIndex-firstFoundIndex+1;
float labelMatchScore = (foundTokenMatch/(float)labelTokens.length);
//Matching rules
// - if less than config#minTokenFound() than accept only EXACT
// - override PARTIAL matches with FULL/EXACT matches only if
// foundTokens of the PARTIAL match is > than of the FULL/EXACT
// match (this will be very rare
- if(foundTokens > 0 && match.getMatchCount() <= foundTokens) {
- String currentText = state.getTokenText(coveredTokens);
- if(currentText.equalsIgnoreCase(label.getText())){
+ if(foundProcessableTokens > 0 && match.getMatchCount() <=
foundProcessableTokens) {
+ String currentText =
state.getTokenText(firstFoundIndex,coveredTokens);
+ if(currentText.equalsIgnoreCase(text)){
labelMatch = MATCH.EXACT;
//set found to covered: May be lower because only
//processable tokens are counted, but Exact also checks
//of non-processable!
foundTokens = coveredTokens;
- } else if(foundTokens >= config.getMinFoundTokens() &&
+ } else if(foundProcessableTokens >= config.getMinFoundTokens() &&
labelMatchScore >= 0.6f){
if(foundTokens == coveredTokens){
labelMatch = MATCH.FULL;
@@ -428,10 +499,10 @@ public class EntityLinker {
labelMatch = MATCH.NONE;
}
if(labelMatch != MATCH.NONE){
- if(match.getMatchCount() < foundTokens ||
- match.getMatchCount() < foundTokens &&
+ if(match.getMatchCount() < foundProcessableTokens ||
+ match.getMatchCount() == foundProcessableTokens &&
labelMatch.ordinal() > match.getMatch().ordinal()){
- match.updateMatch(labelMatch, coveredTokens, foundTokens,
+ match.updateMatch(labelMatch, firstFoundIndex,
coveredTokens, foundTokens,
foundTokenMatch/foundTokens,label,labelTokens.length);
} //else this match is not better as the existing one
} //else ignore labels with MATCH.NONE
Modified:
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinkerConfig.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinkerConfig.java?rev=1180199&r1=1180198&r2=1180199&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinkerConfig.java
(original)
+++
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinkerConfig.java
Fri Oct 7 20:18:41 2011
@@ -89,6 +89,7 @@ public class EntityLinkerConfig {
mappings.put(OntologicalClasses.DBPEDIA_PLACE.getUnicodeString(),
OntologicalClasses.DBPEDIA_PLACE);
mappings.put(NamespaceEnum.schema+"Place",
OntologicalClasses.DBPEDIA_PLACE);
+ mappings.put(NamespaceEnum.gml+"_Feature",
OntologicalClasses.DBPEDIA_PLACE);
mappings.put(OntologicalClasses.SKOS_CONCEPT.getUnicodeString(),
OntologicalClasses.SKOS_CONCEPT);
DEFAULT_ENTITY_TYPE_MAPPINGS = Collections.unmodifiableMap(mappings);
Modified:
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/Suggestion.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/Suggestion.java?rev=1180199&r1=1180198&r2=1180199&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/Suggestion.java
(original)
+++
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/Suggestion.java
Fri Oct 7 20:18:41 2011
@@ -23,6 +23,7 @@ import org.apache.stanbol.entityhub.serv
*/
public class Suggestion implements Comparable<Suggestion>{
private MATCH match = MATCH.NONE;
+ private int start = 0;
private int span = 0;
private int matchCount = 0;
private Text label;
@@ -69,6 +70,7 @@ public class Suggestion implements Compa
/**
* Updates this suggestion
* @param match the match type
+ * @param start the start position of this suggestion
* @param span the number of token this suggestion spans
* @param count the number of token that match with the suggestion within
the span
* @param matchScore the score of the match. MUST BE in the range between
@@ -78,7 +80,7 @@ public class Suggestion implements Compa
* @param label the label that matches the tokens
* @param labelTokenCount the number of tokens of the label
*/
- protected void updateMatch(MATCH match,int span,int count,float
matchScore,Text label,int labelTokenCount){
+ protected void updateMatch(MATCH match,int start, int span,int count,float
matchScore,Text label,int labelTokenCount){
this.match = match;
//check the validity of the parameters to avoid later errors that are
//then hard to debug
@@ -101,6 +103,7 @@ public class Suggestion implements Compa
}
}
}
+ this.start = start;
this.span = span;
this.label = label;
if(match == MATCH.EXACT){ //for exact matches the matchScore needs to
be
@@ -154,6 +157,13 @@ public class Suggestion implements Compa
return matchScore;
}
/**
+ * Getter for the start index of this Suggestion
+ * @return the start token index for this suggestion
+ */
+ public int getStart() {
+ return start;
+ }
+ /**
* Getter for the number of the token matched by this suggestion
* @return The number of the token matched by this suggestion
*/
@@ -186,7 +196,7 @@ public class Suggestion implements Compa
* @return the best match or {@link Suggestion#getMatchedLabel()} if non
is found
*/
public Text getBestLabel(String nameField, String language){
- Representation rep = getRepresentation();
+ Representation rep = getRepresentation();
// 1. check if the returned Entity has a label -> if not return
null
// add labels (set only a single label; use "en" if available!)
Text label = null;
Modified:
incubator/stanbol/trunk/entityhub/generic/servicesapi/src/main/java/org/apache/stanbol/entityhub/servicesapi/defaults/NamespaceEnum.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/entityhub/generic/servicesapi/src/main/java/org/apache/stanbol/entityhub/servicesapi/defaults/NamespaceEnum.java?rev=1180199&r1=1180198&r2=1180199&view=diff
==============================================================================
---
incubator/stanbol/trunk/entityhub/generic/servicesapi/src/main/java/org/apache/stanbol/entityhub/servicesapi/defaults/NamespaceEnum.java
(original)
+++
incubator/stanbol/trunk/entityhub/generic/servicesapi/src/main/java/org/apache/stanbol/entityhub/servicesapi/defaults/NamespaceEnum.java
Fri Oct 7 20:18:41 2011
@@ -55,6 +55,7 @@ public enum NamespaceEnum {
//Some well known Namespaces of Ontologies
geo("http://www.w3.org/2003/01/geo/wgs84_pos#"),
georss("http://www.georss.org/georss/"),
+ gml("http://www.opengis.net/gml/"),
dcElements("dc-elements","http://purl.org/dc/elements/1.1/"),
dcTerms("dc","http://purl.org/dc/terms/"), // Entityhub prefers DC-Terms,
therefore use the "dc" prefix for the terms name space
foaf("http://xmlns.com/foaf/0.1/"),