Revision: 17462
http://sourceforge.net/p/gate/code/17462
Author: adamfunk
Date: 2014-02-26 22:12:56 +0000 (Wed, 26 Feb 2014)
Log Message:
-----------
This plugin needs some theology & geometry, some taste & decency.
Modified Paths:
--------------
gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/bank/AbstractTermbank.java
gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/bank/DocumentFrequencyBank.java
gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/bank/HyponymyTermbank.java
gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/bank/TfIdfTermbank.java
gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/modes/Normalization.java
gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/util/Utilities.java
Modified:
gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/bank/AbstractTermbank.java
===================================================================
--- gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/bank/AbstractTermbank.java 2014-02-26 21:54:45 UTC (rev 17461)
+++ gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/bank/AbstractTermbank.java 2014-02-26 22:12:56 UTC (rev 17462)
@@ -44,10 +44,14 @@
protected Map<Term, Set<String>> termDocuments;
public static final String RAW_SUFFIX = ".raw";
+ // TODO get rid of this in favour of the default key of scores
+ @Deprecated
protected Map<Term, Double> termScores;
+
protected List<Term> termsByDescendingScore;
protected Map<Term, Integer> termFrequencies, docFrequencies;
protected boolean descendingScoresDone = false;
+
// TODO delete when FrequencyTableModel is superseded
public static final String freqProperty = "frequency";
@@ -235,15 +239,8 @@
protected abstract void calculateScores();
-
+ // TODO: change to use getMainScores()
@Deprecated
- protected int incrementTermFreq(Term term, int increment) {
- return Utilities.incrementMap(termFrequencies, term, increment);
- }
-
-
-
- // TODO: change to use getMainScores()
public Double getScore(Term term) {
if (termScores.containsKey(term)) {
return termScores.get(term).doubleValue();
Modified:
gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/bank/DocumentFrequencyBank.java
===================================================================
--- gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/bank/DocumentFrequencyBank.java 2014-02-26 21:54:45 UTC (rev 17461)
+++ gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/bank/DocumentFrequencyBank.java 2014-02-26 22:12:56 UTC (rev 17462)
@@ -79,7 +79,6 @@
protected void resetScores() {
documentCount = 0;
documentFrequencies = new HashMap<Term, Integer>();
- termFrequencies = new HashMap<Term, Integer>();
languages = new HashSet<String>();
types = new HashSet<String>();
stringLookupTable = new HashMap<String, Set<Term>>();
Modified:
gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/bank/HyponymyTermbank.java
===================================================================
--- gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/bank/HyponymyTermbank.java 2014-02-26 21:54:45 UTC (rev 17461)
+++ gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/bank/HyponymyTermbank.java 2014-02-26 22:12:56 UTC (rev 17462)
@@ -16,6 +16,7 @@
import gate.*;
import gate.termraider.modes.*;
import gate.termraider.util.*;
+
import java.util.*;
@@ -32,6 +33,7 @@
/* EXTRA CREOLE PARAMETERS */
protected List<String> inputHeadFeatures;
+ private Normalization normalization;
/* EXTRA DATA FOR ANALYSIS */
@@ -52,8 +54,8 @@
private double calculateOneRawScore(Term term) {
- double docFreq = (double) getSetFromMap(termDocuments, term).size();
- double hyponyms = (double) getSetFromMap(termHyponyms, term).size();
+ double docFreq = (double) Utilities.getStringSetFromMap(termDocuments, term).size();
+ double hyponyms = (double) Utilities.getStringSetFromMap(termHyponyms, term).size();
return docFreq * (1.0F + hyponyms);
}
@@ -83,19 +85,6 @@
- private Set<String> getSetFromMap(Map<Term, Set<String>> map, Term key) {
- if (map.containsKey(key)) {
- return map.get(key);
- }
-
- //implied else
- Set<String> valueSet = new HashSet<String>();
- map.put(key, valueSet);
- return valueSet;
- }
-
-
-
public void calculateScores() {
Set<Term> terms = termHeads.keySet();
Set<String> headsI, headsJ;
@@ -123,8 +112,9 @@
for (Term term : terms) {
double rawScore = calculateOneRawScore(term);
- double score = Normalization.normalizeScore(rawScore);
+ double normalized = Normalization.calculate(normalization, rawScore);
Utilities.setScoreTermValue(scores, rawScoreST, term, rawScore);
+ Utilities.setScoreTermValue(scores, getDefaultScoreType(), term, normalized);
Utilities.setScoreTermValue(scores, localDocFrequencyST, term, this.termDocuments.size());
}
@@ -138,7 +128,6 @@
termHeads = new HashMap<Term, Set<String>>();
termHyponyms = new HashMap<Term, Set<String>>();
termDocuments = new HashMap<Term, Set<String>>();
- termScores = new HashMap<Term, Double>();
termsByDescendingScore = new ArrayList<Term>();
termFrequencies = new HashMap<Term, Integer>();
docFrequencies = new HashMap<Term, Integer>();
@@ -175,5 +164,18 @@
super.setScoreProperty(name);
}
+
+ @CreoleParameter(comment = "score normalization",
+ defaultValue = "Sigmoid")
+ public void setNormalization(Normalization mode) {
+ this.normalization = mode;
+ }
+
+ public Normalization getNormalization() {
+ return this.normalization;
+ }
+
+
+
}
Modified:
gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/bank/TfIdfTermbank.java
===================================================================
--- gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/bank/TfIdfTermbank.java 2014-02-26 21:54:45 UTC (rev 17461)
+++ gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/bank/TfIdfTermbank.java 2014-02-26 22:12:56 UTC (rev 17462)
@@ -33,6 +33,7 @@
/* EXTRA CREOLE PARAMETERS */
private TfCalculation tfCalculation;
private IdfCalculation idfCalculation;
+ private Normalization normalization;
private DocumentFrequencyBank docFreqSource;
/* EXTRA DATA */
@@ -77,9 +78,9 @@
int tf = termFrequencies.get(term);
int df = getRefDocFrequency(term);
int n = docFreqSource.getDocumentCount();
- double score = TfCalculation.calculate(tfCalculation, tf) * IdfCalculation.calculate(idfCalculation, df, n);
- Utilities.setScoreTermValue(scores, rawScoreST, term, score);
- double normalized = Normalization.normalizeScore(score);
+ double rawScore = TfCalculation.calculate(tfCalculation, tf) * IdfCalculation.calculate(idfCalculation, df, n);
+ Utilities.setScoreTermValue(scores, rawScoreST, term, rawScore);
+ double normalized = Normalization.calculate(normalization, rawScore);
Utilities.setScoreTermValue(scores, getDefaultScoreType(), term, normalized);
}
@@ -89,11 +90,11 @@
}
+ // TODO termFrequency incrementation may have been lost in the refactoring
+
protected void resetScores() {
termDocuments = new HashMap<Term, Set<String>>();
- termScores = new HashMap<Term, Double>();
termFrequencies = new HashMap<Term, Integer>();
- docFrequencies = new HashMap<Term, Integer>();
documentCount = 0;
}
@@ -113,6 +114,16 @@
return this.docFreqSource;
}
+ @CreoleParameter(comment = "score normalization",
+ defaultValue = "Sigmoid")
+ public void setNormalization(Normalization mode) {
+ this.normalization = mode;
+ }
+
+ public Normalization getNormalization() {
+ return this.normalization;
+ }
+
@CreoleParameter(comment = "term frequency calculation",
defaultValue = "Logarithmic")
Modified:
gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/modes/Normalization.java
===================================================================
--- gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/modes/Normalization.java 2014-02-26 21:54:45 UTC (rev 17461)
+++ gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/modes/Normalization.java 2014-02-26 22:12:56 UTC (rev 17462)
@@ -13,6 +13,7 @@
public enum Normalization {
None,
+ Hundred,
Sigmoid;
@@ -24,6 +25,10 @@
return raw.doubleValue();
}
+ if (mode == Hundred) {
+ return 100.0 * raw.doubleValue();
+ }
+
// must be sigmoid
return normalizeScore(raw.doubleValue());
}
@@ -44,6 +49,10 @@
double norm = 2.0 / (1.0 + Math.exp(-score / xScale)) - 1.0;
return (double) (100.0F * norm);
}
+
+ /* Note: Normalization mode does not apply to the AnnotationTermbank, since it
+ * is derived from (presumably) already normalized tf.idf values.
+ */
}
Modified:
gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/util/Utilities.java
===================================================================
--- gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/util/Utilities.java 2014-02-26 21:54:45 UTC (rev 17461)
+++ gate/branches/termraider-refactoring/plugins/TermRaider/src/gate/termraider/util/Utilities.java 2014-02-26 22:12:56 UTC (rev 17462)
@@ -221,7 +221,17 @@
submap.put(term, count);
map.put(type, submap);
}
+
+ public static Set<String> getStringSetFromMap(Map<Term, Set<String>> map, Term key) {
+ if (map.containsKey(key)) {
+ return map.get(key);
+ }
+
+ //implied else
+ Set<String> valueSet = new HashSet<String>();
+ map.put(key, valueSet);
+ return valueSet;
+ }
-
}
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
Flow-based real-time traffic analytics software. Cisco certified tool.
Monitor traffic, SLAs, QoS, Medianet, WAAS etc. with NetFlow Analyzer
Customize your own dashboards, set traffic alerts and generate reports.
Network behavioral analysis & security monitoring. All-in-one tool.
http://pubads.g.doubleclick.net/gampad/clk?id=126839071&iu=/4140/ostg.clktrk
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs