Author: rwesten
Date: Fri Aug 30 12:09:18 2013
New Revision: 1518948

URL: http://svn.apache.org/r1518948
Log:
STANBOL-1151: The Sentiment Word Classifier now supports Lexical Categories. To 
ease implementation of classifiers a new utility class 
'WordSentimentDictionary' was added.

Added:
    
stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/util/
    
stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/util/WordSentimentDictionary.java
Modified:
    stanbol/trunk/enhancement-engines/sentiment-wordclassifier/pom.xml
    
stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/api/LexicalCategoryClassifier.java
    
stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/api/SentimentClassifier.java
    
stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/classifiers/SentiWSComponent.java
    
stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/classifiers/SentiWordNet.java
    
stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/services/SentimentEngine.java

Modified: stanbol/trunk/enhancement-engines/sentiment-wordclassifier/pom.xml
URL: 
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/sentiment-wordclassifier/pom.xml?rev=1518948&r1=1518947&r2=1518948&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/sentiment-wordclassifier/pom.xml 
(original)
+++ stanbol/trunk/enhancement-engines/sentiment-wordclassifier/pom.xml Fri Aug 
30 12:09:18 2013
@@ -67,10 +67,12 @@
               org.apache.stanbol.enhancer.servicesapi; provide:=true; 
version="[0.10,0.12)",
               org.apache.stanbol.enhancer.servicesapi.impl; provide:=true; 
version="[0.10,0.12)",
               org.apache.stanbol.enhancer.engines.sentiment.api; provide:=true,
+              org.apache.stanbol.enhancer.engines.sentiment.util; 
provide:=true,
               *
             </Import-Package>
             <Export-Package>
-              
org.apache.stanbol.enhancer.engines.sentiment.api;version=${project.version}
+              
org.apache.stanbol.enhancer.engines.sentiment.api;version=${project.version},
+              org.apache.stanbol.enhancer.engines.sentiment.util; 
version=${project.version}
             </Export-Package>
             <Private-Package>
               
org.apache.stanbol.enhancer.engines.sentiment.classifiers;version=${project.version},

Modified: 
stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/api/LexicalCategoryClassifier.java
URL: 
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/api/LexicalCategoryClassifier.java?rev=1518948&r1=1518947&r2=1518948&view=diff
==============================================================================
--- 
stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/api/LexicalCategoryClassifier.java
 (original)
+++ 
stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/api/LexicalCategoryClassifier.java
 Fri Aug 30 12:09:18 2013
@@ -16,6 +16,8 @@
 
 package org.apache.stanbol.enhancer.engines.sentiment.api;
 
+import java.util.Set;
+
 import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
 import org.apache.stanbol.enhancer.nlp.pos.PosTag;
 
@@ -31,16 +33,11 @@ import org.apache.stanbol.enhancer.nlp.p
  */
 public abstract class LexicalCategoryClassifier implements SentimentClassifier 
{
 
-    public abstract double classifyWord(String word);
-
-    @Override
-    public boolean isAdjective(PosTag posTag) {
-        return posTag.hasCategory(LexicalCategory.Adjective);
-    }
+    public abstract double classifyWord(LexicalCategory cat, String word);
 
     @Override
-    public boolean isNoun(PosTag posTag) {
-        return posTag.hasCategory(LexicalCategory.Noun);
+    public Set<LexicalCategory> getCategories(PosTag posTag) {
+        return posTag.getCategories();
     }
 
 }

Modified: 
stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/api/SentimentClassifier.java
URL: 
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/api/SentimentClassifier.java?rev=1518948&r1=1518947&r2=1518948&view=diff
==============================================================================
--- 
stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/api/SentimentClassifier.java
 (original)
+++ 
stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/api/SentimentClassifier.java
 Fri Aug 30 12:09:18 2013
@@ -16,6 +16,9 @@
 
 package org.apache.stanbol.enhancer.engines.sentiment.api;
 
+import java.util.Set;
+
+import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
 import org.apache.stanbol.enhancer.nlp.pos.PosTag;
 import org.osgi.framework.BundleContext;
 
@@ -27,7 +30,10 @@ import org.osgi.framework.BundleContext;
  * can be used with this engine. Implementations need to be 
  * {@link BundleContext#registerService(String, Object, java.util.Dictionary)
  * registered as OSGI service}.
+ * @see LexicalCategoryClassifier
+ * 
  * @author Sebastian Schaffert
+ * @author Rupert Westenthaler
  */
 public interface SentimentClassifier {
 
@@ -35,29 +41,26 @@ public interface SentimentClassifier {
      * Given the word passed as argument, return a value between -1 and 1 
indicating its sentiment value from
      * very negative to very positive. Unknown words should return the value 0.
      *
-     * @param word
+     * @param cat the lexical category of the word (see 
+     * <a href="https://issues.apache.org/jira/browse/STANBOL-1151">STANBOL-1151</a>)
+     * @param word the word
      * @return
      */
-    public double classifyWord(String word);
-
+    public double classifyWord(LexicalCategory cat, String word);
 
-    /**
-     * Helper method. Return true if the given POS tag indicates an adjective 
in the language implemented by
-     * this classifier.
-     *
-     * @param posTag
-     * @return
-     */
-    public boolean isAdjective(PosTag posTag);
 
     /**
-     * Helper method. Return true if the given POS tag indicates a noun in the 
language implemented by this
-     * classifier.
-     *
-     * @param posTag
-     * @return
+     * Getter for the LexicalCategories for the parsed {@link PosTag}. Used
+     * to lookup the lexical categories for the 
+     * {@link #classifyWord(LexicalCategory, String)} lookups.<p>
+     * Simple implementations might return {@link PosTag#getCategories()}. But
+     * as some {@link PosTag} instances might only define the literal
+     * {@link PosTag#getTag()} value this method might also implement its own
+     * mappings.
+     * @param posTag the posTag
+     * @return the categories 
      */
-    public boolean isNoun(PosTag posTag);
+    public Set<LexicalCategory> getCategories(PosTag posTag);
     
     /**
      * The language of this WordClassifier

Modified: 
stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/classifiers/SentiWSComponent.java
URL: 
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/classifiers/SentiWSComponent.java?rev=1518948&r1=1518947&r2=1518948&view=diff
==============================================================================
--- 
stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/classifiers/SentiWSComponent.java
 (original)
+++ 
stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/classifiers/SentiWSComponent.java
 Fri Aug 30 12:09:18 2013
@@ -20,7 +20,9 @@ import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
+import java.util.Collections;
 import java.util.Dictionary;
+import java.util.EnumMap;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Hashtable;
@@ -39,17 +41,19 @@ import org.apache.stanbol.commons.stanbo
 import 
org.apache.stanbol.commons.stanboltools.datafileprovider.DataFileTracker;
 import 
org.apache.stanbol.enhancer.engines.sentiment.api.LexicalCategoryClassifier;
 import org.apache.stanbol.enhancer.engines.sentiment.api.SentimentClassifier;
+import 
org.apache.stanbol.enhancer.engines.sentiment.util.WordSentimentDictionary;
+import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
 import org.osgi.framework.BundleContext;
 import org.osgi.framework.ServiceRegistration;
 import org.osgi.service.component.ComponentContext;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-
 /**
  * A German word classifier based on SentiWS. Reads the SentiWS positive and 
negative word lists and parses them
  * into an appropriate hash table, so lookups should be extremely fast.
  * <p/>
  * @author Sebastian Schaffert
+ * @author Rupert Westenthaler
  */
 @Component(immediate=true)
 public class SentiWSComponent {
@@ -177,15 +181,13 @@ public class SentiWSComponent {
      */
     public static class SentiWsClassifierDE extends LexicalCategoryClassifier 
implements SentimentClassifier {
     
-        private ReadWriteLock lock = new ReentrantReadWriteLock();
-        private Map<String,Double> wordMap = new TreeMap<String,Double>();
+        private WordSentimentDictionary dict = new 
WordSentimentDictionary(Locale.GERMAN);
 
         protected SentiWsClassifierDE(){}
         
         protected void parseSentiWS(InputStream is) throws IOException {
             log.debug("parsing SentiWS word lists ...");
             BufferedReader in = new BufferedReader(new InputStreamReader(is));
-            lock.writeLock().lock();
             try {
                 for(String line = in.readLine(); line != null; line = 
in.readLine()) {
                     // input file will have a space- or tab-separated list per 
line:
@@ -195,37 +197,37 @@ public class SentiWSComponent {
                     String[] components = line.split("\\s");
 
                     // parse the weight
-                    Double weight = Double.parseDouble(components[1]);
+                    Double weight = Double.valueOf(components[1]);
 
                     // get the main word
-                    String[] mainWord = components[0].split("\\|");
-                    wordMap.put(mainWord[0],weight);
+                    String[] wordPart = components[0].split("\\|");
+                    String mainWord = wordPart[0];
+                    LexicalCategory cat = getLexicalCategory(wordPart[1]);
+                    dict.updateSentiment(cat, mainWord, weight);
 
                     // get the remaining words (deflections)
                     if(components.length > 2) {
                         for(String word : components[2].split(",")) {
-                            String lcWord = word.toLowerCase(Locale.GERMAN);
-                            Double current = wordMap.put(lcWord,weight);
-                            if(current != null){
-                                log.warn("Multiple sentiments [{},{}] for word 
{}",
-                                    new Object[]{current,weight,lcWord});
-                            }
+                            dict.updateSentiment(cat, word, weight);
                         }
                     }
                 }
             } finally {
-                lock.writeLock().unlock();
                 IOUtils.closeQuietly(in);
             }
         }
     
-    
-        public int getWordCount() {
-            lock.readLock().lock();
-            try {
-                return wordMap.size();
-            } finally {
-                lock.readLock().unlock();
+        private LexicalCategory getLexicalCategory(String posTag){
+            char c = posTag.charAt(0);
+            switch (c) {
+                case 'N':
+                    return LexicalCategory.Noun;
+                case 'V':
+                    return LexicalCategory.Verb;
+                case 'A':
+                    return LexicalCategory.Adjective;
+                default: //TODO: change this to a warning and return NULL
+                    throw new IllegalStateException("Unsupported posTag 
'"+posTag+"'!");
             }
         }
         
@@ -242,26 +244,16 @@ public class SentiWSComponent {
          * @return
          */
         @Override
-        public double classifyWord(String word) {
-            lock.readLock().lock();
-            try {
-                Double sentiment = 
wordMap.get(word.toLowerCase(Locale.GERMAN));
-                return sentiment != null ? sentiment.doubleValue() : 0.0;
-            } finally {
-                lock.readLock().unlock();  
-            }
+        public double classifyWord(LexicalCategory cat, String word) {
+            Double sentiment = dict.getSentiment(cat, word);
+            return sentiment != null ? sentiment.doubleValue() : 0.0;
         }
         /**
          * Internally used to free up resources when the service is
          * unregistered
          */
         protected void close(){
-            lock.writeLock().lock();
-            try {
-                wordMap.clear();
-            } finally {
-                lock.writeLock().unlock();
-            }
+            dict.clear();
         }
     }
 

Modified: 
stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/classifiers/SentiWordNet.java
URL: 
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/classifiers/SentiWordNet.java?rev=1518948&r1=1518947&r2=1518948&view=diff
==============================================================================
--- 
stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/classifiers/SentiWordNet.java
 (original)
+++ 
stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/classifiers/SentiWordNet.java
 Fri Aug 30 12:09:18 2013
@@ -39,6 +39,8 @@ import org.apache.stanbol.commons.stanbo
 import 
org.apache.stanbol.commons.stanboltools.datafileprovider.DataFileTracker;
 import 
org.apache.stanbol.enhancer.engines.sentiment.api.LexicalCategoryClassifier;
 import org.apache.stanbol.enhancer.engines.sentiment.api.SentimentClassifier;
+import 
org.apache.stanbol.enhancer.engines.sentiment.util.WordSentimentDictionary;
+import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
 import org.osgi.framework.BundleContext;
 import org.osgi.framework.ServiceRegistration;
 import org.osgi.service.component.ComponentContext;
@@ -55,6 +57,7 @@ import org.slf4j.LoggerFactory;
  * settings.
  * <p/>
  * @author Sebastian Schaffert
+ * @author Rupert Westenthaler
  */
 @Component(immediate = true)
 public class SentiWordNet {
@@ -164,16 +167,14 @@ public class SentiWordNet {
      */
     public static class SentiWordNetClassifierEN extends 
LexicalCategoryClassifier implements SentimentClassifier {
 
-        private ReadWriteLock lock = new ReentrantReadWriteLock();
-        private Map<String,Double> wordMap = new TreeMap<String,Double>();
-
+        WordSentimentDictionary dict = new 
WordSentimentDictionary(Locale.ENGLISH);
+        
         private org.apache.lucene.analysis.en.EnglishMinimalStemmer stemmer = 
new EnglishMinimalStemmer();
 
         protected SentiWordNetClassifierEN() {}
 
         protected void parseSentiWordNet(InputStream is) throws IOException {
             BufferedReader in = new BufferedReader(new InputStreamReader(is));
-            lock.writeLock().lock();
             try {
                 // read line by line:
                 // - lines starting with # are ignored
@@ -184,6 +185,7 @@ public class SentiWordNet {
                         String[] components = line.split("\t");
     
                         try {
+                            LexicalCategory cat = parseLexCat(components[0]);
                             double posScore = 
Double.parseDouble(components[2]);
                             double negScore = 
Double.parseDouble(components[3]);
                             String synonyms = components[4];
@@ -196,34 +198,36 @@ public class SentiWordNet {
                                     // part
                                     String[] synonym = synonymToken.split("#");
                                     String stemmed = getStemmed(synonym[0]);
-                                    Double existing = 
wordMap.put(stemmed.toLowerCase(Locale.ENGLISH), score);
-                                    if(existing != null){
-                                        log.warn("Multiple Sentiment Scores 
[{},{}] for word {}",
-                                            new Object[]{existing, score, 
stemmed.toLowerCase(Locale.ENGLISH)});
-                                    }
+                                    dict.updateSentiment(cat, stemmed, score);
                                 }
                             }
     
-                        } catch (Exception ex) {
+                        } catch (RuntimeException ex) {
                             log.warn("could not parse SentiWordNet line '{}': 
{}", line, ex.getMessage());
                         }
                     }
                 }
             } finally {
-                lock.writeLock().unlock();
                 IOUtils.closeQuietly(in);
             }
         }
 
-        public int getWordCount() {
-            lock.readLock().lock();
-            try {
-                return wordMap.size();
-            } finally {
-                lock.readLock().unlock();
+        private LexicalCategory parseLexCat(String val) {
+            switch (val.charAt(0)) {
+                case 'a':
+                    return LexicalCategory.Adjective;
+                case 'v':
+                    return LexicalCategory.Verb;
+                case 'n':
+                    return LexicalCategory.Noun;
+                case 'r':
+                    return LexicalCategory.Adverb;
+                default:
+                    throw new IllegalStateException("Uncown POS tag 
'"+val+"'!");
             }
         }
 
+
         /**
          * Given the word passed as argument, return a value between -1 and 1 
indicating its sentiment value
          * from very negative to very positive. Unknown words should return 
the value 0.
@@ -232,15 +236,9 @@ public class SentiWordNet {
          * @return
          */
         @Override
-        public double classifyWord(String word) {
-            String stemmed = getStemmed(word);
-            lock.readLock().lock();
-            try {
-                Double sentiment = 
wordMap.get(stemmed.toLowerCase(Locale.ENGLISH));
-                return sentiment != null ? sentiment.doubleValue() : 0.0;
-            } finally {
-                lock.readLock().unlock();
-            }
+        public double classifyWord(LexicalCategory cat, String word) {
+            Double sentiment = dict.getSentiment(cat, getStemmed(word));
+            return sentiment != null ? sentiment.doubleValue() : 0.0;
         }
 
         private String getStemmed(String word) {
@@ -253,12 +251,7 @@ public class SentiWordNet {
         }
         
         protected void close(){
-            lock.writeLock().lock();
-            try {
-                wordMap.clear();
-            } finally {
-                lock.writeLock().unlock();
-            }
+            dict.clear();
         }
     }
 }

Modified: 
stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/services/SentimentEngine.java
URL: 
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/services/SentimentEngine.java?rev=1518948&r1=1518947&r2=1518948&view=diff
==============================================================================
--- 
stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/services/SentimentEngine.java
 (original)
+++ 
stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/services/SentimentEngine.java
 Fri Aug 30 12:09:18 2013
@@ -25,6 +25,7 @@ import java.util.Dictionary;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.Map;
+import java.util.Set;
 
 import org.apache.felix.scr.annotations.Activate;
 import org.apache.felix.scr.annotations.Component;
@@ -264,14 +265,26 @@ public class SentimentEngine  extends Ab
         Iterator<Token> tokens = analysedText.getTokens();
         while(tokens.hasNext()){
             Token token = tokens.next();
-            boolean process = !adjectivesOnly;
-            if(!process){ //check POS types
+            Set<LexicalCategory> cats = null;
+            boolean process = false;
+            if(!adjectivesOnly){
+                process = true;
+                Value<PosTag> posTag = 
token.getAnnotation(NlpAnnotations.POS_ANNOTATION);
+                if(posTag != null && posTag.probability() == 
Value.UNKNOWN_PROBABILITY
+                        || posTag.probability() >= (minPOSConfidence/2.0)){
+                    cats = classifier.getCategories(posTag.value());
+                } else { //no POS tags or probability to low
+                    cats = Collections.emptySet();
+                }
+            } else { //check PosTags if we need to lookup this word
                 Iterator<Value<PosTag>> posTags = 
token.getAnnotations(NlpAnnotations.POS_ANNOTATION).iterator();
                 boolean ignore = false;
                 while(!ignore && !process && posTags.hasNext()) {
                     Value<PosTag> value = posTags.next();
                     PosTag tag = value.value();
-                    boolean state = classifier.isAdjective(tag) || 
classifier.isNoun(tag);
+                    cats = classifier.getCategories(tag);
+                    boolean state = cats.contains(LexicalCategory.Adjective) 
+                            || cats.contains(LexicalCategory.Noun);
                     ignore = !state && (value.probability() == 
Value.UNKNOWN_PROBABILITY ||
                             value.probability() >= minPOSConfidence);
                     process = state && (value.probability() == 
Value.UNKNOWN_PROBABILITY ||
@@ -279,11 +292,28 @@ public class SentimentEngine  extends Ab
                 }
             } //else process all tokens ... no POS tag checking needed
             if(process){
-                double sentiment = classifier.classifyWord(token.getSpan());
+                String word = token.getSpan();
+                double sentiment = 0.0;
+                if(cats.isEmpty()){
+                    sentiment = classifier.classifyWord(null, word);
+                } else { //in case of multiple Lexical Cats
+                    //we build the average over NOT NULL sentiments for the 
word
+                    int catSentNum = 0;
+                    for(LexicalCategory cat : cats){
+                        double catSent = classifier.classifyWord(cat, word);
+                        if(catSent != 0.0){
+                            catSentNum++;
+                            sentiment = sentiment + catSent;
+                        }
+                    }
+                    if(catSentNum > 0){
+                        sentiment = sentiment / (double) catSentNum;
+                    }
+                }
                 if(sentiment != 0.0){
                     token.addAnnotation(SENTIMENT_ANNOTATION, new 
Value<Double>(sentiment));
                 } //else do not set sentiments with 0.0
-            }
+            } // else do not process
         }
 //        } finally {
 //            ci.getLock().writeLock().unlock();

Added: 
stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/util/WordSentimentDictionary.java
URL: 
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/util/WordSentimentDictionary.java?rev=1518948&view=auto
==============================================================================
--- 
stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/util/WordSentimentDictionary.java
 (added)
+++ 
stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/util/WordSentimentDictionary.java
 Fri Aug 30 12:09:18 2013
@@ -0,0 +1,214 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License.  You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+package org.apache.stanbol.enhancer.engines.sentiment.util;
+
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.concurrent.locks.ReadWriteLock;
+import java.util.concurrent.locks.ReentrantReadWriteLock;
+
+import org.apache.stanbol.enhancer.engines.sentiment.api.SentimentClassifier;
+import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
+
+/**
+ * <code>{Word,Category} -&gt; {Sentiment}</code> dictionary intended to be
+ * used by {@link SentimentClassifier} implementations to hold their word
+ * lists.<p>
+ * Words are converted to lower case (using the locale passed to the
+ * constructor) before they are stored or looked up. A <code>null</code>
+ * {@link LexicalCategory} is used for words with an unknown category.<p>
+ * This implementation is thread-safe: the word map as well as the sentiment
+ * counter are only accessed while holding the {@link ReadWriteLock}.
+ * 
+ * @author Rupert Westenthaler
+ *
+ */
+public class WordSentimentDictionary {
+
+    private final ReadWriteLock lock;
+    /**
+     * Maps the lower case word to its sentiments by lexical category.
+     * Invariant: entries with exactly one sentiment are immutable singleton
+     * maps (to save memory); entries with more sentiments are HashMaps.
+     */
+    private final Map<String,Map<LexicalCategory,Double>> wordMap;
+    private final Locale locale;
+    /** the number of word sentiments; guarded by {@link #lock} */
+    private int sentCount;
+
+    /**
+     * Create a word sentiment dictionary for the given locale.
+     * @param locale the locale used to convert words to lower case. If
+     * <code>null</code> {@link Locale#ROOT} will be used.
+     */
+    public WordSentimentDictionary(Locale locale){
+        this.wordMap = new HashMap<String,Map<LexicalCategory,Double>>();
+        this.lock = new ReentrantReadWriteLock();
+        this.locale = locale == null ? Locale.ROOT : locale;
+    }
+
+    /**
+     * Puts (adds/updates) a word (with unknown {@link LexicalCategory})
+     * to the dictionary
+     * @param word the word.
+     * @param sentiment the sentiment value or <code>null</code> to remove
+     *     the mapping.
+     * @return the old sentiment value or <code>null</code> if none.
+     */
+    public Double updateSentiment(String word, Double sentiment){
+        return updateSentiment(null, word, sentiment);
+    }
+
+    /**
+     * Puts (adds/updates) a word with {@link LexicalCategory} to the
+     * dictionary.
+     * @param cat the {@link LexicalCategory} of the word or
+     *     <code>null</code> if not known
+     * @param word the word (MUST NOT be <code>null</code>)
+     * @param sentiment the sentiment value or <code>null</code> to remove this
+     *     mapping.
+     * @return the old sentiment value or <code>null</code> if none.
+     */
+    public Double updateSentiment(LexicalCategory cat, String word, Double sentiment){
+        word = word.toLowerCase(locale);
+        lock.writeLock().lock();
+        try {
+            Map<LexicalCategory,Double> entry = wordMap.get(word);
+            //values are never null, so old != null <=> cat is present
+            Double old = entry == null ? null : entry.get(cat);
+            if(sentiment == null){ //remove the mapping (if present)
+                if(old != null){
+                    removeSentiment(word, entry, cat);
+                    sentCount--;
+                } //else not found -> nothing to do
+            } else { //add or update the mapping
+                putSentiment(word, entry, cat, sentiment, old != null);
+                if(old == null){
+                    sentCount++; //we added a new sentiment
+                }
+            }
+            return old;
+        } finally {
+            lock.writeLock().unlock();
+        }
+    }
+
+    /**
+     * Removes an existing mapping. Needs to be called with the write lock.
+     * Keeps the invariant that single valued entries are singleton maps.
+     */
+    private void removeSentiment(String word, Map<LexicalCategory,Double> entry,
+            LexicalCategory cat){
+        if(entry.size() == 1){ //last sentiment of this word
+            wordMap.remove(word);
+            return;
+        }
+        entry.remove(cat); //size > 1 ... this is a mutable HashMap
+        if(entry.size() == 1){ //only one entry left
+            //switch back to a singletonMap
+            Entry<LexicalCategory,Double> last = entry.entrySet().iterator().next();
+            wordMap.put(word, Collections.singletonMap(last.getKey(), last.getValue()));
+        }
+    }
+
+    /**
+     * Adds or updates a mapping. Needs to be called with the write lock.
+     * Most words (99%) will only have a single sentiment, so a singleton
+     * map is used for those and a HashMap is only created when a 2nd
+     * category is added (to save memory).
+     */
+    private void putSentiment(String word, Map<LexicalCategory,Double> entry,
+            LexicalCategory cat, Double sentiment, boolean catPresent){
+        if(entry == null || (catPresent && entry.size() == 1)){
+            //new word or update of the only sentiment: (re)create the
+            //singleton map as such maps are immutable
+            wordMap.put(word, Collections.singletonMap(cat, sentiment));
+        } else if(entry.size() == 1){ //about to add the 2nd element
+            //create a normal HashMap holding the existing and the new value
+            Map<LexicalCategory,Double> multi = new HashMap<LexicalCategory,Double>(entry);
+            multi.put(cat, sentiment);
+            wordMap.put(word, multi);
+        } else { //already a HashMap
+            entry.put(cat, sentiment);
+        }
+    }
+
+    /**
+     * Getter for the sentiment value for the word. If multiple sentiments
+     * for different {@link LexicalCategory lexical categories} are registered
+     * for the word this will return the average of those.
+     * @param word the word
+     * @return the sentiment or <code>null</code> if not in the dictionary.
+     */
+    public Double getSentiment(String word){
+        return getSentiment(null, word);
+    }
+
+    /**
+     * Getter for the sentiment for the passed word and
+     * {@link LexicalCategory}. In case the category is <code>null</code>
+     * and no sentiment is registered for the unknown category, this method
+     * returns the average over the sentiments registered for the different
+     * lexical categories of the word.
+     * @param cat the category or <code>null</code> if not known
+     * @param word the word (MUST NOT be <code>null</code>)
+     * @return the sentiment or <code>null</code> if not in the dictionary.
+     */
+    public Double getSentiment(LexicalCategory cat, String word){
+        String key = word.toLowerCase(locale); //no need to hold the lock
+        lock.readLock().lock();
+        try {
+            Map<LexicalCategory,Double> sentiments = wordMap.get(key);
+            if(sentiments == null){
+                return null;
+            }
+            Double sentiment = sentiments.get(cat);
+            if(sentiment != null || cat != null || sentiments.isEmpty()){
+                return sentiment;
+            }
+            //unknown category: fall back to the registered sentiment(s)
+            if(sentiments.size() == 1) {
+                return sentiments.values().iterator().next();
+            }
+            double sum = 0;
+            for(Double sent : sentiments.values()){
+                sum = sum + sent.doubleValue();
+            }
+            return Double.valueOf(sum/(double)sentiments.size());
+        } finally {
+            lock.readLock().unlock();
+        }
+    }
+
+    /** 
+     * The number of words in the dictionary. NOTE that a single word
+     * might have multiple sentiments for different {@link LexicalCategory}.
+     * So this value might be lower than {@link #size()} 
+     **/
+    public int getWordCount() {
+        lock.readLock().lock();
+        try {
+            return wordMap.size();
+        } finally {
+            lock.readLock().unlock();
+        }
+    }
+
+    /**
+     * The number of word sentiments in the dictionary
+     * @return the number of {word, category} -&gt; sentiment mappings
+     */
+    public int size(){
+        lock.readLock().lock();
+        try {
+            return sentCount;
+        } finally {
+            lock.readLock().unlock();
+        }
+    }
+
+    /**
+     * Removes all entries of this dictionary and resets {@link #size()}.
+     */
+    public void clear() {
+        lock.writeLock().lock();
+        try {
+            wordMap.clear();
+            sentCount = 0;
+        } finally {
+            lock.writeLock().unlock();
+        }
+    }
+
+}


Reply via email to