Author: rwesten
Date: Fri Aug 30 12:09:18 2013
New Revision: 1518948
URL: http://svn.apache.org/r1518948
Log:
STANBOL-1151: The Sentiment Word Classifier now supports Lexical Categories. To
ease implementation of classifiers a new utility class
'WordSentimentDictionary' was added.
Added:
stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/util/
stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/util/WordSentimentDictionary.java
Modified:
stanbol/trunk/enhancement-engines/sentiment-wordclassifier/pom.xml
stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/api/LexicalCategoryClassifier.java
stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/api/SentimentClassifier.java
stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/classifiers/SentiWSComponent.java
stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/classifiers/SentiWordNet.java
stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/services/SentimentEngine.java
Modified: stanbol/trunk/enhancement-engines/sentiment-wordclassifier/pom.xml
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/sentiment-wordclassifier/pom.xml?rev=1518948&r1=1518947&r2=1518948&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/sentiment-wordclassifier/pom.xml
(original)
+++ stanbol/trunk/enhancement-engines/sentiment-wordclassifier/pom.xml Fri Aug
30 12:09:18 2013
@@ -67,10 +67,12 @@
org.apache.stanbol.enhancer.servicesapi; provide:=true;
version="[0.10,0.12)",
org.apache.stanbol.enhancer.servicesapi.impl; provide:=true;
version="[0.10,0.12)",
org.apache.stanbol.enhancer.engines.sentiment.api; provide:=true,
+ org.apache.stanbol.enhancer.engines.sentiment.util;
provide:=true,
*
</Import-Package>
<Export-Package>
-
org.apache.stanbol.enhancer.engines.sentiment.api;version=${project.version}
+
org.apache.stanbol.enhancer.engines.sentiment.api;version=${project.version},
+ org.apache.stanbol.enhancer.engines.sentiment.util;
version=${project.version}
</Export-Package>
<Private-Package>
org.apache.stanbol.enhancer.engines.sentiment.classifiers;version=${project.version},
Modified:
stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/api/LexicalCategoryClassifier.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/api/LexicalCategoryClassifier.java?rev=1518948&r1=1518947&r2=1518948&view=diff
==============================================================================
---
stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/api/LexicalCategoryClassifier.java
(original)
+++
stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/api/LexicalCategoryClassifier.java
Fri Aug 30 12:09:18 2013
@@ -16,6 +16,8 @@
package org.apache.stanbol.enhancer.engines.sentiment.api;
+import java.util.Set;
+
import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
import org.apache.stanbol.enhancer.nlp.pos.PosTag;
@@ -31,16 +33,11 @@ import org.apache.stanbol.enhancer.nlp.p
*/
public abstract class LexicalCategoryClassifier implements SentimentClassifier
{
- public abstract double classifyWord(String word);
-
- @Override
- public boolean isAdjective(PosTag posTag) {
- return posTag.hasCategory(LexicalCategory.Adjective);
- }
+ public abstract double classifyWord(LexicalCategory cat, String word);
@Override
- public boolean isNoun(PosTag posTag) {
- return posTag.hasCategory(LexicalCategory.Noun);
+ public Set<LexicalCategory> getCategories(PosTag posTag) {
+ return posTag.getCategories();
}
}
Modified:
stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/api/SentimentClassifier.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/api/SentimentClassifier.java?rev=1518948&r1=1518947&r2=1518948&view=diff
==============================================================================
---
stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/api/SentimentClassifier.java
(original)
+++
stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/api/SentimentClassifier.java
Fri Aug 30 12:09:18 2013
@@ -16,6 +16,9 @@
package org.apache.stanbol.enhancer.engines.sentiment.api;
+import java.util.Set;
+
+import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
import org.apache.stanbol.enhancer.nlp.pos.PosTag;
import org.osgi.framework.BundleContext;
@@ -27,7 +30,10 @@ import org.osgi.framework.BundleContext;
* can be used with this engine. Implementations need to be
* {@link BundleContext#registerService(String, Object, java.util.Dictionary)
* registered as OSGI service}.
+ * @see LexicalCategoryClassifier
+ *
* @author Sebastian Schaffert
+ * @author Rupert Westenthaler
*/
public interface SentimentClassifier {
@@ -35,29 +41,26 @@ public interface SentimentClassifier {
* Given the word passed as argument, return a value between -1 and 1
indicating its sentiment value from
* very negative to very positive. Unknown words should return the value 0.
*
- * @param word
+ * @param cat the lexical category of the word (see
+ * <a
href="https://issues.apache.org/jira/browse/STANBOL-1151">STANBOL-1151</a>)
+ * @param word the word
* @return
*/
- public double classifyWord(String word);
-
+ public double classifyWord(LexicalCategory cat, String word);
- /**
- * Helper method. Return true if the given POS tag indicates an adjective
in the language implemented by
- * this classifier.
- *
- * @param posTag
- * @return
- */
- public boolean isAdjective(PosTag posTag);
/**
- * Helper method. Return true if the given POS tag indicates a noun in the
language implemented by this
- * classifier.
- *
- * @param posTag
- * @return
+ * Getter for the LexicalCategories for the parsed {@link PosTag}. Used
+ * to lookup the lexical categories for the
+ * {@link #classifyWord(LexicalCategory, String)} lookups.<p>
+ * Simple implementations might return {@link PosTag#getCategories()}. But
+ * as some {@link PosTag} instances might only define the literal
+ * {@link PosTag#getTag()} value this method might also implement its own
+ * mappings.
+ * @param posTag the posTag
+ * @return the categories
*/
- public boolean isNoun(PosTag posTag);
+ public Set<LexicalCategory> getCategories(PosTag posTag);
/**
* The language of this WordClassifier
Modified:
stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/classifiers/SentiWSComponent.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/classifiers/SentiWSComponent.java?rev=1518948&r1=1518947&r2=1518948&view=diff
==============================================================================
---
stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/classifiers/SentiWSComponent.java
(original)
+++
stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/classifiers/SentiWSComponent.java
Fri Aug 30 12:09:18 2013
@@ -20,7 +20,9 @@ import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
+import java.util.Collections;
import java.util.Dictionary;
+import java.util.EnumMap;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Hashtable;
@@ -39,17 +41,19 @@ import org.apache.stanbol.commons.stanbo
import
org.apache.stanbol.commons.stanboltools.datafileprovider.DataFileTracker;
import
org.apache.stanbol.enhancer.engines.sentiment.api.LexicalCategoryClassifier;
import org.apache.stanbol.enhancer.engines.sentiment.api.SentimentClassifier;
+import
org.apache.stanbol.enhancer.engines.sentiment.util.WordSentimentDictionary;
+import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
import org.osgi.framework.BundleContext;
import org.osgi.framework.ServiceRegistration;
import org.osgi.service.component.ComponentContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-
/**
* A German word classifier based on SentiWS. Reads the SentiWS positive and
negative word lists and parses them
* into an appropriate hash table, so lookups should be extremely fast.
* <p/>
* @author Sebastian Schaffert
+ * @author Rupert Westenthaler
*/
@Component(immediate=true)
public class SentiWSComponent {
@@ -177,15 +181,13 @@ public class SentiWSComponent {
*/
public static class SentiWsClassifierDE extends LexicalCategoryClassifier
implements SentimentClassifier {
- private ReadWriteLock lock = new ReentrantReadWriteLock();
- private Map<String,Double> wordMap = new TreeMap<String,Double>();
+ private WordSentimentDictionary dict = new
WordSentimentDictionary(Locale.GERMAN);
protected SentiWsClassifierDE(){}
protected void parseSentiWS(InputStream is) throws IOException {
log.debug("parsing SentiWS word lists ...");
BufferedReader in = new BufferedReader(new InputStreamReader(is));
- lock.writeLock().lock();
try {
for(String line = in.readLine(); line != null; line =
in.readLine()) {
// input file will have a space- or tab-separated list per
line:
@@ -195,37 +197,37 @@ public class SentiWSComponent {
String[] components = line.split("\\s");
// parse the weight
- Double weight = Double.parseDouble(components[1]);
+ Double weight = Double.valueOf(components[1]);
// get the main word
- String[] mainWord = components[0].split("\\|");
- wordMap.put(mainWord[0],weight);
+ String[] wordPart = components[0].split("\\|");
+ String mainWord = wordPart[0];
+ LexicalCategory cat = getLexicalCategory(wordPart[1]);
+ dict.updateSentiment(cat, mainWord, weight);
// get the remaining words (deflections)
if(components.length > 2) {
for(String word : components[2].split(",")) {
- String lcWord = word.toLowerCase(Locale.GERMAN);
- Double current = wordMap.put(lcWord,weight);
- if(current != null){
- log.warn("Multiple sentiments [{},{}] for word
{}",
- new Object[]{current,weight,lcWord});
- }
+ dict.updateSentiment(cat, word, weight);
}
}
}
} finally {
- lock.writeLock().unlock();
IOUtils.closeQuietly(in);
}
}
-
- public int getWordCount() {
- lock.readLock().lock();
- try {
- return wordMap.size();
- } finally {
- lock.readLock().unlock();
+ private LexicalCategory getLexicalCategory(String posTag){
+ char c = posTag.charAt(0);
+ switch (c) {
+ case 'N':
+ return LexicalCategory.Noun;
+ case 'V':
+ return LexicalCategory.Verb;
+ case 'A':
+ return LexicalCategory.Adjective;
+ default: //TODO: change this to a warning and return NULL
+ throw new IllegalStateException("Unsupported posTag
'"+posTag+"'!");
}
}
@@ -242,26 +244,16 @@ public class SentiWSComponent {
* @return
*/
@Override
- public double classifyWord(String word) {
- lock.readLock().lock();
- try {
- Double sentiment =
wordMap.get(word.toLowerCase(Locale.GERMAN));
- return sentiment != null ? sentiment.doubleValue() : 0.0;
- } finally {
- lock.readLock().unlock();
- }
+ public double classifyWord(LexicalCategory cat, String word) {
+ Double sentiment = dict.getSentiment(cat, word);
+ return sentiment != null ? sentiment.doubleValue() : 0.0;
}
/**
* Internally used to free up resources when the service is
* unregistered
*/
protected void close(){
- lock.writeLock().lock();
- try {
- wordMap.clear();
- } finally {
- lock.writeLock().unlock();
- }
+ dict.clear();
}
}
Modified:
stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/classifiers/SentiWordNet.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/classifiers/SentiWordNet.java?rev=1518948&r1=1518947&r2=1518948&view=diff
==============================================================================
---
stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/classifiers/SentiWordNet.java
(original)
+++
stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/classifiers/SentiWordNet.java
Fri Aug 30 12:09:18 2013
@@ -39,6 +39,8 @@ import org.apache.stanbol.commons.stanbo
import
org.apache.stanbol.commons.stanboltools.datafileprovider.DataFileTracker;
import
org.apache.stanbol.enhancer.engines.sentiment.api.LexicalCategoryClassifier;
import org.apache.stanbol.enhancer.engines.sentiment.api.SentimentClassifier;
+import
org.apache.stanbol.enhancer.engines.sentiment.util.WordSentimentDictionary;
+import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
import org.osgi.framework.BundleContext;
import org.osgi.framework.ServiceRegistration;
import org.osgi.service.component.ComponentContext;
@@ -55,6 +57,7 @@ import org.slf4j.LoggerFactory;
* settings.
* <p/>
* @author Sebastian Schaffert
+ * @author Rupert Westenthaler
*/
@Component(immediate = true)
public class SentiWordNet {
@@ -164,16 +167,14 @@ public class SentiWordNet {
*/
public static class SentiWordNetClassifierEN extends
LexicalCategoryClassifier implements SentimentClassifier {
- private ReadWriteLock lock = new ReentrantReadWriteLock();
- private Map<String,Double> wordMap = new TreeMap<String,Double>();
-
+ WordSentimentDictionary dict = new
WordSentimentDictionary(Locale.ENGLISH);
+
private org.apache.lucene.analysis.en.EnglishMinimalStemmer stemmer =
new EnglishMinimalStemmer();
protected SentiWordNetClassifierEN() {}
protected void parseSentiWordNet(InputStream is) throws IOException {
BufferedReader in = new BufferedReader(new InputStreamReader(is));
- lock.writeLock().lock();
try {
// read line by line:
// - lines starting with # are ignored
@@ -184,6 +185,7 @@ public class SentiWordNet {
String[] components = line.split("\t");
try {
+ LexicalCategory cat = parseLexCat(components[0]);
double posScore =
Double.parseDouble(components[2]);
double negScore =
Double.parseDouble(components[3]);
String synonyms = components[4];
@@ -196,34 +198,36 @@ public class SentiWordNet {
// part
String[] synonym = synonymToken.split("#");
String stemmed = getStemmed(synonym[0]);
- Double existing =
wordMap.put(stemmed.toLowerCase(Locale.ENGLISH), score);
- if(existing != null){
- log.warn("Multiple Sentiment Scores
[{},{}] for word {}",
- new Object[]{existing, score,
stemmed.toLowerCase(Locale.ENGLISH)});
- }
+ dict.updateSentiment(cat, stemmed, score);
}
}
- } catch (Exception ex) {
+ } catch (RuntimeException ex) {
log.warn("could not parse SentiWordNet line '{}':
{}", line, ex.getMessage());
}
}
}
} finally {
- lock.writeLock().unlock();
IOUtils.closeQuietly(in);
}
}
- public int getWordCount() {
- lock.readLock().lock();
- try {
- return wordMap.size();
- } finally {
- lock.readLock().unlock();
+ private LexicalCategory parseLexCat(String val) {
+ switch (val.charAt(0)) {
+ case 'a':
+ return LexicalCategory.Adjective;
+ case 'v':
+ return LexicalCategory.Verb;
+ case 'n':
+ return LexicalCategory.Noun;
+ case 'r':
+ return LexicalCategory.Adverb;
+ default:
+ throw new IllegalStateException("Uncown POS tag
'"+val+"'!");
}
}
+
/**
* Given the word passed as argument, return a value between -1 and 1
indicating its sentiment value
* from very negative to very positive. Unknown words should return
the value 0.
@@ -232,15 +236,9 @@ public class SentiWordNet {
* @return
*/
@Override
- public double classifyWord(String word) {
- String stemmed = getStemmed(word);
- lock.readLock().lock();
- try {
- Double sentiment =
wordMap.get(stemmed.toLowerCase(Locale.ENGLISH));
- return sentiment != null ? sentiment.doubleValue() : 0.0;
- } finally {
- lock.readLock().unlock();
- }
+ public double classifyWord(LexicalCategory cat, String word) {
+ Double sentiment = dict.getSentiment(cat, getStemmed(word));
+ return sentiment != null ? sentiment.doubleValue() : 0.0;
}
private String getStemmed(String word) {
@@ -253,12 +251,7 @@ public class SentiWordNet {
}
protected void close(){
- lock.writeLock().lock();
- try {
- wordMap.clear();
- } finally {
- lock.writeLock().unlock();
- }
+ dict.clear();
}
}
}
Modified:
stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/services/SentimentEngine.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/services/SentimentEngine.java?rev=1518948&r1=1518947&r2=1518948&view=diff
==============================================================================
---
stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/services/SentimentEngine.java
(original)
+++
stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/services/SentimentEngine.java
Fri Aug 30 12:09:18 2013
@@ -25,6 +25,7 @@ import java.util.Dictionary;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
+import java.util.Set;
import org.apache.felix.scr.annotations.Activate;
import org.apache.felix.scr.annotations.Component;
@@ -264,14 +265,26 @@ public class SentimentEngine extends Ab
Iterator<Token> tokens = analysedText.getTokens();
while(tokens.hasNext()){
Token token = tokens.next();
- boolean process = !adjectivesOnly;
- if(!process){ //check POS types
+ Set<LexicalCategory> cats = null;
+ boolean process = false;
+ if(!adjectivesOnly){
+ process = true;
+ Value<PosTag> posTag =
token.getAnnotation(NlpAnnotations.POS_ANNOTATION);
+ if(posTag != null && posTag.probability() ==
Value.UNKNOWN_PROBABILITY
+ || posTag.probability() >= (minPOSConfidence/2.0)){
+ cats = classifier.getCategories(posTag.value());
+ } else { //no POS tags or probability to low
+ cats = Collections.emptySet();
+ }
+ } else { //check PosTags if we need to lookup this word
Iterator<Value<PosTag>> posTags =
token.getAnnotations(NlpAnnotations.POS_ANNOTATION).iterator();
boolean ignore = false;
while(!ignore && !process && posTags.hasNext()) {
Value<PosTag> value = posTags.next();
PosTag tag = value.value();
- boolean state = classifier.isAdjective(tag) ||
classifier.isNoun(tag);
+ cats = classifier.getCategories(tag);
+ boolean state = cats.contains(LexicalCategory.Adjective)
+ || cats.contains(LexicalCategory.Noun);
ignore = !state && (value.probability() ==
Value.UNKNOWN_PROBABILITY ||
value.probability() >= minPOSConfidence);
process = state && (value.probability() ==
Value.UNKNOWN_PROBABILITY ||
@@ -279,11 +292,28 @@ public class SentimentEngine extends Ab
}
} //else process all tokens ... no POS tag checking needed
if(process){
- double sentiment = classifier.classifyWord(token.getSpan());
+ String word = token.getSpan();
+ double sentiment = 0.0;
+ if(cats.isEmpty()){
+ sentiment = classifier.classifyWord(null, word);
+ } else { //in case of multiple Lexical Cats
+ //we build the average over NOT NULL sentiments for the
word
+ int catSentNum = 0;
+ for(LexicalCategory cat : cats){
+ double catSent = classifier.classifyWord(cat, word);
+ if(catSent != 0.0){
+ catSentNum++;
+ sentiment = sentiment + catSent;
+ }
+ }
+ if(catSentNum > 0){
+ sentiment = sentiment / (double) catSentNum;
+ }
+ }
if(sentiment != 0.0){
token.addAnnotation(SENTIMENT_ANNOTATION, new
Value<Double>(sentiment));
} //else do not set sentiments with 0.0
- }
+ } // else do not process
}
// } finally {
// ci.getLock().writeLock().unlock();
Added:
stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/util/WordSentimentDictionary.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/util/WordSentimentDictionary.java?rev=1518948&view=auto
==============================================================================
---
stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/util/WordSentimentDictionary.java
(added)
+++
stanbol/trunk/enhancement-engines/sentiment-wordclassifier/src/main/java/org/apache/stanbol/enhancer/engines/sentiment/util/WordSentimentDictionary.java
Fri Aug 30 12:09:18 2013
@@ -0,0 +1,214 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+package org.apache.stanbol.enhancer.engines.sentiment.util;
+
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.concurrent.locks.ReadWriteLock;
+import java.util.concurrent.locks.ReentrantReadWriteLock;
+
+import org.apache.stanbol.enhancer.engines.sentiment.api.SentimentClassifier;
+import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
+
+/**
+ * <code>{Word,Category} -> {Sentiment}</code> dictionary intended to be
+ * used by {@link SentimentClassifier} implementations to hold their word
+ * lists.<p>
+ * Words are converted to lower case (using the locale passed to the
+ * constructor) before they are stored or looked up.<p>
+ * This implementation is thread-safe: all reads and writes of the internal
+ * state, including the sentiment counter, happen while holding a
+ * {@link ReadWriteLock}.
+ *
+ * @author Rupert Westenthaler
+ *
+ */
+public class WordSentimentDictionary {
+
+    private final ReadWriteLock lock;
+    /**
+     * word -> (category -> sentiment). Words with a single mapping are
+     * stored as immutable singleton maps (to save memory); words with more
+     * than one mapping use a {@link HashMap}.
+     */
+    private final Map<String,Map<LexicalCategory,Double>> wordMap;
+    private final Locale locale;
+    /** the number of word sentiments; guarded by {@link #lock} */
+    private int sentCount;
+
+    /**
+     * Create a word sentiment dictionary for the given locale.
+     * @param locale the locale used to convert words to lower case. If
+     * <code>null</code> {@link Locale#ROOT} will be used.
+     */
+    public WordSentimentDictionary(Locale locale){
+        this.wordMap = new HashMap<String,Map<LexicalCategory,Double>>();
+        this.lock = new ReentrantReadWriteLock();
+        this.locale = locale == null ? Locale.ROOT : locale;
+    }
+
+    /**
+     * Puts (adds/updates) a word (with unknown {@link LexicalCategory})
+     * to the dictionary.
+     * @param word the word. MUST NOT be <code>null</code>.
+     * @param sentiment the sentiment value or <code>null</code> to remove
+     * the mapping.
+     * @return the old sentiment value or <code>null</code> if none.
+     */
+    public Double updateSentiment(String word, Double sentiment){
+        return updateSentiment(null, word, sentiment);
+    }
+
+    /**
+     * Puts (adds/updates) a word with {@link LexicalCategory} to the
+     * dictionary.
+     * @param cat the {@link LexicalCategory} of the word or
+     * <code>null</code> if not known
+     * @param word the word. MUST NOT be <code>null</code>.
+     * @param sentiment the sentiment value or <code>null</code> to remove
+     * this mapping.
+     * @return the old sentiment value or <code>null</code> if none.
+     */
+    public Double updateSentiment(LexicalCategory cat, String word, Double sentiment){
+        word = word.toLowerCase(locale);
+        lock.writeLock().lock();
+        try {
+            Map<LexicalCategory,Double> entry = wordMap.get(word);
+            //stored values are never null, so old != null <=> mapping exists
+            Double old = entry == null ? null : entry.get(cat);
+            if(sentiment == null){ //remove the mapping (if present)
+                if(old != null){
+                    if(entry.size() == 1){ //last mapping of this word
+                        wordMap.remove(word);
+                    } else { //size > 1 -> entry is a mutable HashMap
+                        entry.remove(cat);
+                        if(entry.size() == 1){
+                            //only one mapping left: switch back to a singletonMap
+                            Entry<LexicalCategory,Double> last =
+                                    entry.entrySet().iterator().next();
+                            wordMap.put(word, Collections.singletonMap(
+                                last.getKey(), last.getValue()));
+                        }
+                    }
+                    sentCount--; //counter is updated while holding the write lock
+                } //else not found -> nothing to do
+            } else { //add or update the mapping
+                //most words (99%) only have a single value, so a singleton
+                //map is used as long as there is only one category
+                if(entry == null || (entry.size() == 1 && old != null)){
+                    wordMap.put(word, Collections.singletonMap(cat, sentiment));
+                } else {
+                    if(entry.size() == 1){
+                        //about to add the 2nd category: the single-element
+                        //map may be immutable, so copy it to a HashMap
+                        entry = new HashMap<LexicalCategory,Double>(entry);
+                        wordMap.put(word, entry);
+                    }
+                    entry.put(cat, sentiment);
+                }
+                if(old == null){
+                    sentCount++; //we added a new sentiment
+                }
+            }
+            return old;
+        } finally {
+            lock.writeLock().unlock();
+        }
+    }
+
+    /**
+     * Getter for the sentiment value for the word. If multiple sentiments
+     * for different {@link LexicalCategory lexical categories} are registered
+     * for the word this will return the average of those.
+     * @param word the word. MUST NOT be <code>null</code>.
+     * @return the sentiment or <code>null</code> if not in the dictionary.
+     */
+    public Double getSentiment(String word){
+        return getSentiment(null, word);
+    }
+
+    /**
+     * Getter for the sentiment for the passed word and
+     * {@link LexicalCategory}. In case the category is <code>null</code>
+     * and no sentiment is registered for the unknown category this method
+     * returns the average over the sentiments registered for the different
+     * lexical categories of the word.
+     * @param cat the category or <code>null</code> if not known
+     * @param word the word. MUST NOT be <code>null</code>.
+     * @return the sentiment or <code>null</code> if not in the dictionary.
+     */
+    public Double getSentiment(LexicalCategory cat, String word){
+        String key = word.toLowerCase(locale);
+        lock.readLock().lock();
+        try {
+            Map<LexicalCategory,Double> sentiments = wordMap.get(key);
+            if(sentiments == null){
+                return null;
+            }
+            Double sentiment = sentiments.get(cat);
+            if(sentiment == null && cat == null && !sentiments.isEmpty()){
+                if(sentiments.size() == 1) {
+                    sentiment = sentiments.values().iterator().next();
+                } else { //average over all categories
+                    double sum = 0;
+                    for(Double sent : sentiments.values()){
+                        sum = sum + sent;
+                    }
+                    sentiment = Double.valueOf(sum/(double)sentiments.size());
+                }
+            }
+            return sentiment;
+        } finally {
+            lock.readLock().unlock();
+        }
+    }
+
+    /**
+     * The number of words in the dictionary. NOTE that a single word
+     * might have multiple sentiments for different {@link LexicalCategory}.
+     * So this value might be lower than {@link #size()}.
+     * @return the number of words
+     **/
+    public int getWordCount() {
+        lock.readLock().lock();
+        try {
+            return wordMap.size();
+        } finally {
+            lock.readLock().unlock();
+        }
+    }
+
+    /**
+     * The number of word sentiments (<code>{word,category}</code> mappings)
+     * in the dictionary.
+     * @return the number of word sentiments
+     */
+    public int size(){
+        lock.readLock().lock();
+        try {
+            return sentCount;
+        } finally {
+            lock.readLock().unlock();
+        }
+    }
+
+    /**
+     * Removes all entries of this dictionary and resets {@link #size()}
+     * to <code>0</code>.
+     */
+    public void clear() {
+        lock.writeLock().lock();
+        try {
+            wordMap.clear();
+            sentCount = 0;
+        } finally {
+            lock.writeLock().unlock();
+        }
+    }
+
+}