Author: ogrisel
Date: Mon May 7 12:51:37 2012
New Revision: 1334986
URL: http://svn.apache.org/viewvc?rev=1334986&view=rev
Log:
STANBOL-197: various small improvements to TopicClassificationEngine
Modified:
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
Modified: incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java?rev=1334986&r1=1334985&r2=1334986&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java Mon May 7 12:51:37 2012
@@ -232,8 +232,12 @@ public class TopicClassificationEngine e
private int MAX_COLLECTED_EXAMPLES = 1000;
- public int MAX_EVALUATION_SAMPLES = 200;
+ // Limit the evaluation time by computing the performance estimates on a bounded random selection of
+ // labeled examples in the training set.
+ public int MAX_EVALUATION_SAMPLES = 500;
+ // Do not try to compute performance estimates if there is not at least a minimum number of example
+ // documents for this concept.
public int MIN_EVALUATION_SAMPLES = 10;
public int MAX_CHARS_PER_TOPIC = 100000;
@@ -404,6 +408,9 @@ public class TopicClassificationEngine e
List<TopicSuggestion> topics;
try {
topics = suggestTopics(text);
+ if (topics.isEmpty()) {
+ return;
+ }
} catch (ClassifierException e) {
throw new EngineException(e);
}
@@ -568,7 +575,7 @@ public class TopicClassificationEngine e
// no need to apply the cutting heuristic
return suggestedTopics;
}
- // filter out suggestion that are less than some threshold based on the mean of the top scores
+ // filter out suggestions that are less than some threshold based on the mean of the top scores
float mean = 0.0f;
for (TopicSuggestion suggestion : suggestedTopics) {
mean += suggestion.score / suggestedTopics.size();
@@ -1025,7 +1032,7 @@ public class TopicClassificationEngine e
tmpfolder.mkdir();
evaluationRunning = true;
int cvFoldCount = 3; // 3-folds CV is hardcoded for now
- int cvIterationCount = 1; // only one 3-folds CV iteration
+ int cvIterationCount = 3; // make it possible to limit the number of folds to use
// We will use the training set quite intensively, ensure that the index is packed and its
// statistics are up to date