Revision: 17417
          http://sourceforge.net/p/gate/code/17417
Author:   adamfunk
Date:     2014-02-25 08:36:20 +0000 (Tue, 25 Feb 2014)
Log Message:
-----------
Successfully integrated DFB into TF.IDF system.
Some spaghetti still needs to be untangled.

Modified Paths:
--------------
    gate/trunk/plugins/TermRaider/src/gate/termraider/PMIExample.java
    gate/trunk/plugins/TermRaider/src/gate/termraider/TermRaiderEnglish.java
    gate/trunk/plugins/TermRaider/src/gate/termraider/bank/AbstractTermbank.java
    gate/trunk/plugins/TermRaider/src/gate/termraider/bank/AnnotationTermbank.java
    gate/trunk/plugins/TermRaider/src/gate/termraider/bank/DocumentFrequencyBank.java
    gate/trunk/plugins/TermRaider/src/gate/termraider/bank/HyponymyTermbank.java
    gate/trunk/plugins/TermRaider/src/gate/termraider/bank/TfIdfTermbank.java
    gate/trunk/plugins/TermRaider/src/gate/termraider/bank/modes/IdfCalculation.java
    gate/trunk/plugins/TermRaider/src/gate/termraider/gui/TermbankViewer.java
    gate/trunk/plugins/TermRaider/src/gate/termraider/output/CsvGenerator.java

Modified: gate/trunk/plugins/TermRaider/src/gate/termraider/PMIExample.java
===================================================================
--- gate/trunk/plugins/TermRaider/src/gate/termraider/PMIExample.java   
2014-02-25 02:22:21 UTC (rev 17416)
+++ gate/trunk/plugins/TermRaider/src/gate/termraider/PMIExample.java   
2014-02-25 08:36:20 UTC (rev 17417)
@@ -1,10 +1,7 @@
 package gate.termraider;
 
 import gate.creole.PackagedController;
-import gate.creole.metadata.AutoInstance;
-import gate.creole.metadata.AutoInstanceParam;
-import gate.creole.metadata.CreoleParameter;
-import gate.creole.metadata.CreoleResource;
+import gate.creole.metadata.*;
 
 @CreoleResource(name = "PMI Example (English)",
     icon = "TermRaiderApp",
@@ -12,5 +9,5 @@
         @AutoInstanceParam(name="pipelineURL", 
value="applications/pmi-example.gapp"),
         @AutoInstanceParam(name="menu", value="TermRaider")}))
 public class PMIExample extends PackagedController {
-
+  private static final long serialVersionUID = -4725697168124226331L;
 }

Modified: 
gate/trunk/plugins/TermRaider/src/gate/termraider/TermRaiderEnglish.java
===================================================================
--- gate/trunk/plugins/TermRaider/src/gate/termraider/TermRaiderEnglish.java    
2014-02-25 02:22:21 UTC (rev 17416)
+++ gate/trunk/plugins/TermRaider/src/gate/termraider/TermRaiderEnglish.java    
2014-02-25 08:36:20 UTC (rev 17417)
@@ -1,10 +1,7 @@
 package gate.termraider;
 
 import gate.creole.PackagedController;
-import gate.creole.metadata.AutoInstance;
-import gate.creole.metadata.AutoInstanceParam;
-import gate.creole.metadata.CreoleParameter;
-import gate.creole.metadata.CreoleResource;
+import gate.creole.metadata.*;
 
 @CreoleResource(name = "TermRaider English Term Extraction",
     icon = "TermRaiderApp",
@@ -12,5 +9,5 @@
         @AutoInstanceParam(name="pipelineURL", 
value="applications/termraider-eng.gapp"),
         @AutoInstanceParam(name="menu", value="TermRaider")}))
 public class TermRaiderEnglish extends PackagedController {
-
+  private static final long serialVersionUID = -1599367292323903155L;
 }

Modified: 
gate/trunk/plugins/TermRaider/src/gate/termraider/bank/AbstractTermbank.java
===================================================================
--- 
gate/trunk/plugins/TermRaider/src/gate/termraider/bank/AbstractTermbank.java    
    2014-02-25 02:22:21 UTC (rev 17416)
+++ 
gate/trunk/plugins/TermRaider/src/gate/termraider/bank/AbstractTermbank.java    
    2014-02-25 08:36:20 UTC (rev 17417)
@@ -149,7 +149,7 @@
       boolean wasLoaded = corpus.isDocumentLoaded(i);
       Document document = (Document) corpus.get(i);
       
-      addData(document);
+      processDocument(document);
 
       // datastore safety
       if (! wasLoaded) {
@@ -160,7 +160,7 @@
   }
   
   
-  private void scanTypesLanguagesDocFreq() {
+  protected void scanTypesLanguagesDocFreq() {
     this.types = new TreeSet<String>();
     this.languages = new TreeSet<String>();
     for (Term term : this.termFrequencies.keySet()) {
@@ -171,12 +171,12 @@
   }
   
 
-  /* BEHOLD THE GUBBINS to distinguish the various types of Termbanks*/
+  /* BEHOLD THE GUBBINS to distinguish the various types of Termbanks */
 
   /**
    * This method needs to call incrementTermFreq(...)!
    */
-  protected abstract void addData(Document document);
+  protected abstract void processDocument(Document document);
   
   protected abstract void calculateScores(); 
   
@@ -288,5 +288,11 @@
   public Set<String> getInputAnnotationTypes() {
     return this.inputAnnotationTypes;
   }
+
+
+  public abstract String getCsvHeader();
+
+
+  public abstract String getCsvLine(Term term);
   
 }

Modified: 
gate/trunk/plugins/TermRaider/src/gate/termraider/bank/AnnotationTermbank.java
===================================================================
--- 
gate/trunk/plugins/TermRaider/src/gate/termraider/bank/AnnotationTermbank.java  
    2014-02-25 02:22:21 UTC (rev 17416)
+++ 
gate/trunk/plugins/TermRaider/src/gate/termraider/bank/AnnotationTermbank.java  
    2014-02-25 08:36:20 UTC (rev 17417)
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2008--2012, The University of Sheffield. See the file
+ *  Copyright (c) 2008--2014, The University of Sheffield. See the file
  *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
  *
  *  This file is part of GATE (see http://gate.ac.uk/), and is free
@@ -17,6 +17,7 @@
 import gate.termraider.util.*;
 import gate.termraider.bank.modes.*;
 import java.util.*;
+import org.apache.commons.lang.StringEscapeUtils;
 
 
 
@@ -36,7 +37,7 @@
 
   
   
-  protected void addData(Document document) {
+  protected void processDocument(Document document) {
     String documentSource = Utilities.sourceOrName(document);
     AnnotationSet candidates = 
document.getAnnotations(inputASName).get(inputAnnotationTypes);
 
@@ -105,8 +106,39 @@
     docFrequencies = new HashMap<Term, Integer>();
   }
 
+  
+  public String getCsvHeader() {
+    StringBuilder sb = new StringBuilder();
+    sb.append(StringEscapeUtils.escapeCsv("Term"));
+    sb.append(',').append(StringEscapeUtils.escapeCsv("Lang"));
+    sb.append(',').append(StringEscapeUtils.escapeCsv("Type"));
+    sb.append(',').append(StringEscapeUtils.escapeCsv("ScoreType"));
+    sb.append(',').append(StringEscapeUtils.escapeCsv("Score"));
+    sb.append(',').append(StringEscapeUtils.escapeCsv("Document_Count"));
+    sb.append(',').append(StringEscapeUtils.escapeCsv("Term_Frequency"));
+    return sb.toString();
+  }
 
   
+  public String getCsvLine(Term term) {
+      StringBuilder sb = new StringBuilder();
+      sb.append(StringEscapeUtils.escapeCsv(term.getTermString()));
+      sb.append(',');
+      sb.append(StringEscapeUtils.escapeCsv(term.getLanguageCode()));
+      sb.append(',');
+      sb.append(StringEscapeUtils.escapeCsv(term.getType()));
+      sb.append(',');
+      sb.append(StringEscapeUtils.escapeCsv(this.getScoreProperty()));
+      sb.append(',');
+      sb.append(StringEscapeUtils.escapeCsv(this.getScore(term).toString()));
+      sb.append(',');
+      
sb.append(StringEscapeUtils.escapeCsv(Integer.toString(this.getDocFrequency(term))));
+      sb.append(',');
+      
sb.append(StringEscapeUtils.escapeCsv(Integer.toString(this.getTermFrequency(term))));
+      return sb.toString();
+  }
+  
+  
   /***** CREOLE PARAMETERS *****/
 
   @CreoleParameter(comment = "annotation feature containing the score to 
index",

Modified: 
gate/trunk/plugins/TermRaider/src/gate/termraider/bank/DocumentFrequencyBank.java
===================================================================
--- 
gate/trunk/plugins/TermRaider/src/gate/termraider/bank/DocumentFrequencyBank.java
   2014-02-25 02:22:21 UTC (rev 17416)
+++ 
gate/trunk/plugins/TermRaider/src/gate/termraider/bank/DocumentFrequencyBank.java
   2014-02-25 08:36:20 UTC (rev 17417)
@@ -16,6 +16,8 @@
 
 import javax.swing.Action;
 
+import org.apache.commons.lang.StringEscapeUtils;
+
 import gate.Annotation;
 import gate.AnnotationSet;
 import gate.Corpus;
@@ -34,7 +36,7 @@
 @CreoleResource(name = "DocumentFrequencyBank",
 icon = "termbank-lr.png",
 comment = "Document frequency counter derived from corpora and other DFBs")
-public class DocumentFrequencyBank extends AbstractBank
+public class DocumentFrequencyBank extends AbstractTermbank
 implements ActionsPublisher{
   
   private static final long serialVersionUID = 5149075094060830331L;
@@ -56,9 +58,11 @@
 
   public Resource init() throws ResourceInstantiationException {
     prepare();
+    resetScores();
     processInputBanks();
     processCorpora();
-    churnData();
+    scanTypesLanguagesDocFreq();
+    calculateScores();
     return this;
   }
   
@@ -76,9 +80,12 @@
     if (inputBanks == null) {
       inputBanks = new HashSet<DocumentFrequencyBank>();
     }
-    
+  }
+  
+  protected void resetScores() {
     documentTotal = 0;
     documentFrequencies = new HashMap<Term, Integer>();
+    termFrequencies = new HashMap<Term, Integer>();
     languages = new HashSet<String>();
     types = new HashSet<String>();
     stringLookupTable = new HashMap<String, Set<Term>>();
@@ -140,7 +147,7 @@
   }
 
   
-  private void churnData() {
+  protected void calculateScores() {
     if (this.getTerms().size() > 0) {
       minFrequency = 
this.getFrequencyStrict(this.getTerms().iterator().next());
     }
@@ -192,7 +199,13 @@
   }
   
   
+  @Override
+  public int getDocFrequency(Term term) {
+    return getFrequencyLax(term);
+  }
   
+  
+  
   @CreoleParameter(comment = "Other DFBs to compile into the new one")
   public void setInputBanks(Set<DocumentFrequencyBank> inputBanks) {
     this.inputBanks = inputBanks;
@@ -287,6 +300,8 @@
   }
   
   
+  
+  
   private void increment(Term term, int i) {
     int count = i;
     if (documentFrequencies.containsKey(term)) {
@@ -317,4 +332,32 @@
   public int getTotalDocs() {
     return this.documentTotal;
   }
+
+
+  public String getCsvLine(Term term) {
+    StringBuilder sb = new StringBuilder();
+    sb.append(StringEscapeUtils.escapeCsv(term.getTermString()));
+    sb.append(',');
+    sb.append(StringEscapeUtils.escapeCsv(term.getLanguageCode()));
+    sb.append(',');
+    sb.append(StringEscapeUtils.escapeCsv(term.getType()));
+    sb.append(',');
+    
sb.append(StringEscapeUtils.escapeCsv(Integer.toString(this.getDocFrequency(term))));
+    return sb.toString();
+  }
+
+
+  public String getCsvHeader() {
+    StringBuilder sb = new StringBuilder();
+    sb.append(StringEscapeUtils.escapeCsv("Term"));
+    sb.append(',').append(StringEscapeUtils.escapeCsv("Lang"));
+    sb.append(',').append(StringEscapeUtils.escapeCsv("Type"));
+    sb.append(',').append(StringEscapeUtils.escapeCsv("DocFrequency"));
+    sb.append('\n');
+    sb.append(',').append(StringEscapeUtils.escapeCsv("_TOTAL_DOCS_"));
+    sb.append(',').append(StringEscapeUtils.escapeCsv(""));
+    sb.append(',').append(StringEscapeUtils.escapeCsv(""));
+    
sb.append(',').append(StringEscapeUtils.escapeCsv(Integer.toString(this.getTotalDocs())));
+    return sb.toString();
+  }
 }

Modified: 
gate/trunk/plugins/TermRaider/src/gate/termraider/bank/HyponymyTermbank.java
===================================================================
--- 
gate/trunk/plugins/TermRaider/src/gate/termraider/bank/HyponymyTermbank.java    
    2014-02-25 02:22:21 UTC (rev 17416)
+++ 
gate/trunk/plugins/TermRaider/src/gate/termraider/bank/HyponymyTermbank.java    
    2014-02-25 08:36:20 UTC (rev 17417)
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2008--2012, The University of Sheffield. See the file
+ *  Copyright (c) 2008--2014, The University of Sheffield. See the file
  *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
  *
  *  This file is part of GATE (see http://gate.ac.uk/), and is free
@@ -15,6 +15,7 @@
 import gate.gui.ActionsPublisher;
 import gate.*;
 import gate.termraider.util.*;
+import org.apache.commons.lang.StringEscapeUtils;
 import java.util.*;
 
 
@@ -56,7 +57,7 @@
   }
 
   
-  protected void addData(Document document) {
+  protected void processDocument(Document document) {
     String documentSource = Utilities.sourceOrName(document);
     AnnotationSet candidates = 
document.getAnnotations(inputASName).get(inputAnnotationTypes);
     
@@ -167,6 +168,36 @@
   }
 
   
+  public String getCsvHeader() {
+    StringBuilder sb = new StringBuilder();
+    sb.append(StringEscapeUtils.escapeCsv("Term"));
+    sb.append(',').append(StringEscapeUtils.escapeCsv("Lang"));
+    sb.append(',').append(StringEscapeUtils.escapeCsv("Type"));
+    sb.append(',').append(StringEscapeUtils.escapeCsv("ScoreType"));
+    sb.append(',').append(StringEscapeUtils.escapeCsv("Score"));
+    sb.append(',').append(StringEscapeUtils.escapeCsv("Document_Count"));
+    sb.append(',').append(StringEscapeUtils.escapeCsv("Term_Frequency"));
+    return sb.toString();
+  }
+
+  public String getCsvLine(Term term) {
+      StringBuilder sb = new StringBuilder();
+      sb.append(StringEscapeUtils.escapeCsv(term.getTermString()));
+      sb.append(',');
+      sb.append(StringEscapeUtils.escapeCsv(term.getLanguageCode()));
+      sb.append(',');
+      sb.append(StringEscapeUtils.escapeCsv(term.getType()));
+      sb.append(',');
+      sb.append(StringEscapeUtils.escapeCsv(this.getScoreProperty()));
+      sb.append(',');
+      sb.append(StringEscapeUtils.escapeCsv(this.getScore(term).toString()));
+      sb.append(',');
+      
sb.append(StringEscapeUtils.escapeCsv(Integer.toString(this.getDocFrequency(term))));
+      sb.append(',');
+      
sb.append(StringEscapeUtils.escapeCsv(Integer.toString(this.getTermFrequency(term))));
+      return sb.toString();
+  }
+
   /***** CREOLE PARAMETERS *****/
 
   @CreoleParameter(comment = "Annotation features (in order) to be scanned as 
terms' heads")

Modified: 
gate/trunk/plugins/TermRaider/src/gate/termraider/bank/TfIdfTermbank.java
===================================================================
--- gate/trunk/plugins/TermRaider/src/gate/termraider/bank/TfIdfTermbank.java   
2014-02-25 02:22:21 UTC (rev 17416)
+++ gate/trunk/plugins/TermRaider/src/gate/termraider/bank/TfIdfTermbank.java   
2014-02-25 08:36:20 UTC (rev 17417)
@@ -11,16 +11,20 @@
  */
 package gate.termraider.bank;
 
+import gate.creole.ResourceInstantiationException;
 import gate.creole.metadata.*;
 import gate.gui.ActionsPublisher;
 import gate.*;
 import gate.termraider.bank.modes.IdfCalculation;
 import gate.termraider.bank.modes.TfCalculation;
 import gate.termraider.util.*;
+
 import java.util.*;
 
+import org.apache.commons.lang.StringEscapeUtils;
 
 
+
 @CreoleResource(name = "TfIdfTermbank",
         icon = "termbank-lr.png",
         comment = "TermRaider Termbank derived from vectors in document 
features")
@@ -33,12 +37,13 @@
   /* EXTRA CREOLE PARAMETERS */
   private TfCalculation tfCalculation;
   private IdfCalculation idfCalculation;
+  private DocumentFrequencyBank docFreqSource;
   
   /* EXTRA DATA */
   private int documentCount;
   
   
-  protected void addData(Document document) {
+  protected void processDocument(Document document) {
     documentCount++;
     String documentSource = Utilities.sourceOrName(document);
     AnnotationSet candidates = 
document.getAnnotations(inputASName).get(inputAnnotationTypes);
@@ -62,8 +67,9 @@
   protected void calculateScores() {
     for (Term term : termFrequencies.keySet()) {
       int tf = termFrequencies.get(term);
-      int df = termDocuments.get(term).size();
-      double score = TfCalculation.calculate(tfCalculation, tf) * 
IdfCalculation.calculate(idfCalculation, df, documentCount);
+      int df = docFreqSource.getDocFrequency(term);
+      int n = docFreqSource.getTotalDocs();
+      double score = TfCalculation.calculate(tfCalculation, tf) * 
IdfCalculation.calculate(idfCalculation, df, n);
       rawTermScores.put(term, Double.valueOf(score));
       termScores.put(term, Utilities.normalizeScore(score));
     }
@@ -90,8 +96,21 @@
   }
 
 
+  public int getDocCount() {
+    return this.documentCount;
+  }
   
   /***** CREOLE PARAMETERS *****/
+  
+  @CreoleParameter(comment = "document frequency bank (unset = create from 
these corpora)")
+  public void setDocFreqSource(DocumentFrequencyBank dfb) {
+    this.docFreqSource = dfb;
+  }
+  
+  public DocumentFrequencyBank getDocFreqSource() {
+    return this.docFreqSource;
+  }
+  
 
   @CreoleParameter(comment = "term frequency calculation",
           defaultValue = "Logarithmic")
@@ -122,4 +141,59 @@
     super.setScoreProperty(name);
   }
 
+
+  public String getCsvHeader() {
+    StringBuilder sb = new StringBuilder();
+    sb.append(StringEscapeUtils.escapeCsv("Term"));
+    sb.append(',').append(StringEscapeUtils.escapeCsv("Lang"));
+    sb.append(',').append(StringEscapeUtils.escapeCsv("Type"));
+    sb.append(',').append(StringEscapeUtils.escapeCsv("ScoreType"));
+    sb.append(',').append(StringEscapeUtils.escapeCsv("Score"));
+    sb.append(',').append(StringEscapeUtils.escapeCsv("Document_Count"));
+    sb.append(',').append(StringEscapeUtils.escapeCsv("Ref_Doc_Frequency"));
+    sb.append(',').append(StringEscapeUtils.escapeCsv("Term_Frequency"));
+    return sb.toString();
+  }
+
+
+  public String getCsvLine(Term term) {
+      StringBuilder sb = new StringBuilder();
+      sb.append(StringEscapeUtils.escapeCsv(term.getTermString()));
+      sb.append(',');
+      sb.append(StringEscapeUtils.escapeCsv(term.getLanguageCode()));
+      sb.append(',');
+      sb.append(StringEscapeUtils.escapeCsv(term.getType()));
+      sb.append(',');
+      sb.append(StringEscapeUtils.escapeCsv(this.getScoreProperty()));
+      sb.append(',');
+      sb.append(StringEscapeUtils.escapeCsv(this.getScore(term).toString()));
+      sb.append(',');
+      
sb.append(StringEscapeUtils.escapeCsv(Integer.toString(this.getDocFrequency(term))));
+      sb.append(',');
+      
sb.append(StringEscapeUtils.escapeCsv(Integer.toString(this.docFreqSource.getDocFrequency(term))));
+      sb.append(',');
+      
sb.append(StringEscapeUtils.escapeCsv(Integer.toString(this.getTermFrequency(term))));
+      return sb.toString();
+  }
+
+
+  protected void prepare() throws ResourceInstantiationException {
+    if ( (corpora == null) || (corpora.size() == 0) ) {
+      throw new ResourceInstantiationException("No corpora given");
+    }
+    
+    // If no DFB is specified, we create one from the given corpora
+    if (this.docFreqSource == null) {
+      FeatureMap dfbParameters = Factory.newFeatureMap();
+      dfbParameters.put("inputASName", this.inputASName);
+      dfbParameters.put("languageFeature", this.languageFeature);
+      dfbParameters.put("inputAnnotationFeature", this.inputAnnotationFeature);
+      dfbParameters.put("corpora", this.corpora);
+      dfbParameters.put("debugMode", this.debugMode);
+
+      DocumentFrequencyBank dfb = (DocumentFrequencyBank) 
Factory.createResource(DocumentFrequencyBank.class.getName(), dfbParameters);
+      this.setDocFreqSource(dfb);
+    }
+  }
+
 }

Modified: 
gate/trunk/plugins/TermRaider/src/gate/termraider/bank/modes/IdfCalculation.java
===================================================================
--- 
gate/trunk/plugins/TermRaider/src/gate/termraider/bank/modes/IdfCalculation.java
    2014-02-25 02:22:21 UTC (rev 17416)
+++ 
gate/trunk/plugins/TermRaider/src/gate/termraider/bank/modes/IdfCalculation.java
    2014-02-25 08:36:20 UTC (rev 17417)
@@ -13,15 +13,10 @@
 
 public enum IdfCalculation {
   Natural,
-  Logarithmic,
-  LogarithmicPlus1;
+  Logarithmic;
   
   /* These calculations are from Manning & Schütze, Foundations of
    * Statistical NLP, section 15.2 (p.544).
-   * 
-   * TODO: Use (df + 1) normalization methods so we can handle
-   * terms not found in the IDF table (to allow for external 
-   * IDF sources in future use).
    */
   
   public static double calculate(IdfCalculation mode, int rawDF, int 
corpusSize) {
@@ -29,15 +24,14 @@
     double n = (double) corpusSize;
     
     if (mode == Logarithmic) {
-      return logarithm(n / df);
+      return 1.0 + logarithm(n / (df + 1.0));
     }
     
-    if (mode == LogarithmicPlus1) {
-      return 1.0 + logarithm(n / df);
-    }
+    // TODO: review the df calculation modes; they must always return 
+    // something > 0.
     
     // must be Natural
-    return 1.0 / df;
+    return 1.0 / (df + 1.0);
   }
 
   public static final double logBase = 2.0;

Modified: 
gate/trunk/plugins/TermRaider/src/gate/termraider/gui/TermbankViewer.java
===================================================================
--- gate/trunk/plugins/TermRaider/src/gate/termraider/gui/TermbankViewer.java   
2014-02-25 02:22:21 UTC (rev 17416)
+++ gate/trunk/plugins/TermRaider/src/gate/termraider/gui/TermbankViewer.java   
2014-02-25 08:36:20 UTC (rev 17417)
@@ -313,7 +313,8 @@
   }  
 
   public void setTarget(Object target) {
-    if(target == null || ! (target instanceof AbstractTermbank)) {
+    if(target == null || ! (target instanceof AbstractTermbank)
+            || (target instanceof DocumentFrequencyBank) ) {
       throw new IllegalArgumentException("This Viewer cannot show a "
               + (target == null ? "null" : target.getClass().toString()));
     }

Modified: 
gate/trunk/plugins/TermRaider/src/gate/termraider/output/CsvGenerator.java
===================================================================
--- gate/trunk/plugins/TermRaider/src/gate/termraider/output/CsvGenerator.java  
2014-02-25 02:22:21 UTC (rev 17416)
+++ gate/trunk/plugins/TermRaider/src/gate/termraider/output/CsvGenerator.java  
2014-02-25 08:36:20 UTC (rev 17417)
@@ -15,54 +15,27 @@
 
 import java.io.*;
 import java.util.*;
-
-import org.apache.commons.lang.*;
-
 import gate.termraider.bank.*;
 import gate.termraider.util.*;
 
+
 public class CsvGenerator {
   
-  public static void generateAndSaveCsv(AbstractBank bank, 
+  public static void generateAndSaveCsv(AbstractTermbank bank, 
           Number threshold, File outputFile) throws GateException {
     PrintWriter writer = initializeWriter(outputFile);
-    
-    if (bank instanceof AbstractTermbank) {
-      String scorePropertyName = bank.getScoreProperty();
-      generateTermbankCsv((AbstractTermbank) bank, writer, 
threshold.doubleValue(), scorePropertyName);
-    }
-    else if (bank instanceof DocumentFrequencyBank) {
-      generateDFCsv((DocumentFrequencyBank) bank, writer, 
threshold.intValue());
-    }
-    
-    writer.flush();
-    writer.close();
-    if (bank.getDebugMode()) {
-      System.out.println("Saved CSV to " + outputFile.getAbsolutePath() +
-              " from " + bank.getName() + " (" + bank.getClass().getName() + 
")");
-    }
-  }
-  
-  
-  private static void generateTermbankCsv(AbstractTermbank bank, PrintWriter 
writer, 
-          double threshold, String scorePropertyName) {
     Map<Term, Double> termScores = bank.getTermScores();
-    Map<Term, Set<String>> termDocuments = bank.getTermDocuments();
-    Map<Term, Integer> termFrequencies = null;
-    termFrequencies = bank.getTermFrequencies();
     addComment(bank, "threshold = " + threshold);
     List<Term> sortedTerms = bank.getTermsByDescendingScore();
     
     addComment(bank, "Unfiltered nbr of terms = " + sortedTerms.size());
     int written = 0;
-    writeTermbankHeader(writer);
+    writer.println(bank.getCsvHeader());
     
     for (Term term : sortedTerms) {
       Double score = termScores.get(term);
-      if (score >= threshold) {
-        Set<String> documents = termDocuments.get(term);
-        Integer frequency = termFrequencies.get(term);
-        writeTermBankContent(writer, term, score, documents, frequency, 
scorePropertyName);
+      if (score >= threshold.doubleValue()) {
+        writer.println(bank.getCsvLine(term));
         written++;
       }
       else {  // the rest must be lower
@@ -73,30 +46,6 @@
   }
 
   
-  private static void generateDFCsv(DocumentFrequencyBank bank, PrintWriter 
writer, int threshold) {
-    Map<Term, Integer> frequencies = bank.getDocFrequencies();
-    addComment(bank, "threshold = " + threshold);
-    List<Term> sortedTerms = bank.getTermsByDescendingFreq();
-    
-    addComment(bank, "Unfiltered nbr of terms = " + sortedTerms.size());
-    int written = 0;
-    writeDFHeader(writer);
-    writeDFContent(writer, "_TOTAL_DOCS_", bank.getTotalDocs());
-    
-    for (Term term : sortedTerms) {
-      Integer freq = frequencies.get(term);
-      if (freq >= threshold) {
-        writeDFContent(writer, term, freq);
-        written++;
-      }
-      else {  // the rest must be lower
-        break;
-      }
-    }
-    addComment(bank, "Filtered nbr of terms = " + written);
-  }
-
-  
   private static void addComment(AbstractBank termbank, String commentStr) {
     if (termbank.getDebugMode()) {
       System.out.println(commentStr);
@@ -113,74 +62,4 @@
     }
   }
   
-  
-  private static void writeTermBankContent(PrintWriter writer, Term term, 
Double score,
-          Set<String> documents, Integer frequency, String scorePropertyName) {
-    StringBuilder sb = new StringBuilder();
-    sb.append(StringEscapeUtils.escapeCsv(term.getTermString()));
-    sb.append(',');
-    sb.append(StringEscapeUtils.escapeCsv(term.getLanguageCode()));
-    sb.append(',');
-    sb.append(StringEscapeUtils.escapeCsv(term.getType()));
-    sb.append(',');
-    sb.append(StringEscapeUtils.escapeCsv(scorePropertyName));
-    sb.append(',');
-    sb.append(StringEscapeUtils.escapeCsv(score.toString()));
-    sb.append(',');
-    sb.append(StringEscapeUtils.escapeCsv(Integer.toString(documents.size())));
-    sb.append(',');
-    sb.append(StringEscapeUtils.escapeCsv(frequency.toString()));
-    writer.println(sb.toString());
-  }
-  
-  
-  private static void writeTermbankHeader(PrintWriter writer) {
-    StringBuilder sb = new StringBuilder();
-    sb.append(StringEscapeUtils.escapeCsv("Term"));
-    sb.append(',').append(StringEscapeUtils.escapeCsv("Lang"));
-    sb.append(',').append(StringEscapeUtils.escapeCsv("Type"));
-    sb.append(',').append(StringEscapeUtils.escapeCsv("ScoreType"));
-    sb.append(',').append(StringEscapeUtils.escapeCsv("Score"));
-    sb.append(',').append(StringEscapeUtils.escapeCsv("Document_Count"));
-    sb.append(',').append(StringEscapeUtils.escapeCsv("Term_Frequency"));
-    writer.println(sb.toString());
-  }
-
-
-  private static void writeDFContent(PrintWriter writer, Term term, Integer 
frequency) {
-    StringBuilder sb = new StringBuilder();
-    sb.append(StringEscapeUtils.escapeCsv(term.getTermString()));
-    sb.append(',');
-    sb.append(StringEscapeUtils.escapeCsv(term.getLanguageCode()));
-    sb.append(',');
-    sb.append(StringEscapeUtils.escapeCsv(term.getType()));
-    sb.append(',');
-    sb.append(StringEscapeUtils.escapeCsv(frequency.toString()));
-    writer.println(sb.toString());
-  }
-
-  
-
-  private static void writeDFContent(PrintWriter writer, String string, 
Integer frequency) {
-    StringBuilder sb = new StringBuilder();
-    sb.append(StringEscapeUtils.escapeCsv(string));
-    sb.append(',');
-    sb.append(StringEscapeUtils.escapeCsv(""));
-    sb.append(',');
-    sb.append(StringEscapeUtils.escapeCsv(""));
-    sb.append(',');
-    sb.append(StringEscapeUtils.escapeCsv(frequency.toString()));
-    writer.println(sb.toString());
-  }
-
-
-  private static void writeDFHeader(PrintWriter writer) {
-    StringBuilder sb = new StringBuilder();
-    sb.append(StringEscapeUtils.escapeCsv("Term"));
-    sb.append(',').append(StringEscapeUtils.escapeCsv("Lang"));
-    sb.append(',').append(StringEscapeUtils.escapeCsv("Type"));
-    sb.append(',').append(StringEscapeUtils.escapeCsv("DocFrequency"));
-    writer.println(sb.toString());
-  }
-  
 }

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
Flow-based real-time traffic analytics software. Cisco certified tool.
Monitor traffic, SLAs, QoS, Medianet, WAAS etc. with NetFlow Analyzer
Customize your own dashboards, set traffic alerts and generate reports.
Network behavioral analysis & security monitoring. All-in-one tool.
http://pubads.g.doubleclick.net/gampad/clk?id=126839071&iu=/4140/ostg.clktrk
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs

Reply via email to