Author: vincenzo
Date: Mon Dec 12 06:26:29 2005
New Revision: 356257
URL: http://svn.apache.org/viewcvs?rev=356257&view=rev
Log:
1) Fixed JAMES-387 (java.lang.ClassCastException: java.lang.Integer).
2) Some enhancements to reduce memory footprint.
Modified:
james/server/trunk/src/java/org/apache/james/transport/mailets/BayesianAnalysis.java
james/server/trunk/src/java/org/apache/james/util/BayesianAnalyzer.java
james/server/trunk/src/java/org/apache/james/util/JDBCBayesianAnalyzer.java
Modified:
james/server/trunk/src/java/org/apache/james/transport/mailets/BayesianAnalysis.java
URL:
http://svn.apache.org/viewcvs/james/server/trunk/src/java/org/apache/james/transport/mailets/BayesianAnalysis.java?rev=356257&r1=356256&r2=356257&view=diff
==
--- james/server/trunk/src/java/org/apache/james/transport/mailets/BayesianAnalysis.java (original)
+++ james/server/trunk/src/java/org/apache/james/transport/mailets/BayesianAnalysis.java Mon Dec 12 06:26:29 2005
@@ -340,8 +340,10 @@
try {
// this is synchronized to avoid concurrent update of the corpus
synchronized(JDBCBayesianAnalyzer.DATABASE_LOCK) {
+analyzer.tokenCountsClear();
analyzer.loadHamNSpam(conn);
analyzer.buildCorpus();
+analyzer.tokenCountsClear();
}
log("BayesianAnalysis Corpus loaded");
Modified:
james/server/trunk/src/java/org/apache/james/util/BayesianAnalyzer.java
URL:
http://svn.apache.org/viewcvs/james/server/trunk/src/java/org/apache/james/util/BayesianAnalyzer.java?rev=356257&r1=356256&r2=356257&view=diff
==
--- james/server/trunk/src/java/org/apache/james/util/BayesianAnalyzer.java (original)
+++ james/server/trunk/src/java/org/apache/james/util/BayesianAnalyzer.java Mon Dec 12 06:26:29 2005
@@ -261,14 +261,21 @@
public void clear() {
corpus.clear();
-hamTokenCounts.clear();
-spamTokenCounts.clear();
+tokenCountsClear();
hamMessageCount = 0;
spamMessageCount = 0;
}
/**
+ * Clears token counters.
+ */
+public void tokenCountsClear() {
+hamTokenCounts.clear();
+spamTokenCounts.clear();
+}
+
+/**
* Public setter for corpus.
*
* @param corpus The new corpus.
@@ -289,17 +296,19 @@
*/
public void buildCorpus() {
//Combine the known ham & spam tokens.
-corpus.putAll(hamTokenCounts);
-corpus.putAll(spamTokenCounts);
+Set set = new HashSet(hamTokenCounts.size() + spamTokenCounts.size());
+set.addAll(hamTokenCounts.keySet());
+set.addAll(spamTokenCounts.keySet());
+Map tempCorpus = new HashMap(set.size());
//Iterate through all the tokens and compute their new
//individual probabilities.
-Iterator i = corpus.keySet().iterator();
+Iterator i = set.iterator();
while (i.hasNext()) {
String token = (String) i.next();
-
-corpus.put(token, new Double(computeProbability(token)));
+tempCorpus.put(token, new Double(computeProbability(token)));
}
+setCorpus(tempCorpus);
}
/**
@@ -335,13 +344,17 @@
//Build a set of the tokens in the Stream.
Set tokens = parse(stream);
+// Get the corpus to use in this run
+// A new corpus may be being built in the meantime
+Map workCorpus = getCorpus();
+
//Assign their probabilities from the Corpus (using an additional
//calculation to determine spamminess).
-SortedSet tokenProbabilityStrengths = getTokenProbabilityStrengths(tokens);
+SortedSet tokenProbabilityStrengths = getTokenProbabilityStrengths(tokens, workCorpus);
//Compute and return the overall probability that the
//stream is SPAM.
-return computeOverallProbability(tokenProbabilityStrengths);
+return computeOverallProbability(tokenProbabilityStrengths, workCorpus);
}
/**
@@ -575,9 +588,10 @@
* The ordering is from the highest strength to the lowest strength.
*
* @param tokens
+ * @param workCorpus
* @return SortedSet of TokenProbabilityStrength objects.
*/
-private SortedSet getTokenProbabilityStrengths(Set tokens) {
+private SortedSet getTokenProbabilityStrengths(Set tokens, Map workCorpus) {
//Convert to a SortedSet of token probability strengths.
SortedSet tokenProbabilityStrengths = new TreeSet();
@@ -587,14 +601,15 @@
tps.token = (String) i.next();
-if (corpus.containsKey(tps.token)) {
-tps.strength = Math.abs(0.5