Revision: 17555
          http://sourceforge.net/p/gate/code/17555
Author:   markagreenwood
Date:     2014-03-06 11:57:37 +0000 (Thu, 06 Mar 2014)
Log Message:
-----------
upgraded to support different column separator and quote characters

Modified Paths:
--------------
    gate/trunk/plugins/Format_CSV/src/gate/corpora/CSVImporter.java

Modified: gate/trunk/plugins/Format_CSV/src/gate/corpora/CSVImporter.java
===================================================================
--- gate/trunk/plugins/Format_CSV/src/gate/corpora/CSVImporter.java     2014-03-06 11:25:03 UTC (rev 17554)
+++ gate/trunk/plugins/Format_CSV/src/gate/corpora/CSVImporter.java     2014-03-06 11:57:37 UTC (rev 17555)
@@ -53,6 +53,8 @@
 import javax.swing.SpinnerNumberModel;
 
 import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang.StringEscapeUtils;
+import org.apache.commons.lang.StringUtils;
 
 import au.com.bytecode.opencsv.CSVReader;
 
@@ -63,18 +65,22 @@
   private static JComponent dialog = new JPanel();
 
   private static SpinnerNumberModel textColModel = new SpinnerNumberModel(0, 0,
-    Integer.MAX_VALUE, 1);
+      Integer.MAX_VALUE, 1);
 
   private static JCheckBox cboFeatures = new JCheckBox(
-    "1st Row Contains Column Labels", true);
+      "1st Row Contains Column Labels", true);
 
   private static JCheckBox cboDocuments = new JCheckBox(
-    "Create One Document Per Row", false);
+      "Create One Document Per Row", false);
 
   private static JTextField txtURL = new JTextField(30);
 
+  private static JTextField txtSeparator = new JTextField(",", 3);
+
+  private static JTextField txtQuoteChar = new JTextField("\"", 3);
+
   private static FileFilter CSV_FILE_FILTER = new ExtensionFileFilter(
-    "CSV Files (*.csv)", "csv");
+      "CSV Files (*.csv)", "csv");
 
   static {
     // we'll use the same dialog instance regardless of the corpus we are
@@ -94,7 +100,7 @@
     constraints = new GridBagConstraints();
     constraints.gridx = GridBagConstraints.RELATIVE;
     constraints.gridy = 0;
-    constraints.gridwidth = 3;
+    constraints.gridwidth = 5;
     constraints.fill = GridBagConstraints.HORIZONTAL;
     constraints.insets = new Insets(0, 0, 0, 10);
     dialog.add(txtURL, constraints);
@@ -110,6 +116,38 @@
     constraints = new GridBagConstraints();
     constraints.gridx = GridBagConstraints.RELATIVE;
     constraints.gridy = 1;
+    constraints.gridwidth = 2;
+    constraints.fill = GridBagConstraints.HORIZONTAL;
+    constraints.insets = new Insets(0, 0, 15, 5);
+    dialog.add(new JLabel("Column Separator:"), constraints);
+
+    constraints = new GridBagConstraints();
+    constraints.gridx = GridBagConstraints.RELATIVE;
+    constraints.gridy = 1;
+    constraints.gridwidth = 1;
+    constraints.fill = GridBagConstraints.HORIZONTAL;
+    constraints.insets = new Insets(0, 15, 15, 10);
+    dialog.add(txtSeparator, constraints);
+
+    constraints = new GridBagConstraints();
+    constraints.gridx = GridBagConstraints.RELATIVE;
+    constraints.gridy = 1;
+    constraints.gridwidth = 1;
+    constraints.fill = GridBagConstraints.HORIZONTAL;
+    constraints.insets = new Insets(0, 0, 15, 5);
+    dialog.add(new JLabel("Quote Character:"), constraints);
+
+    constraints = new GridBagConstraints();
+    constraints.gridx = GridBagConstraints.RELATIVE;
+    constraints.gridy = 1;
+    constraints.gridwidth = 1;
+    constraints.fill = GridBagConstraints.HORIZONTAL;
+    constraints.insets = new Insets(0, 0, 15, 10);
+    dialog.add(txtQuoteChar, constraints);
+
+    constraints = new GridBagConstraints();
+    constraints.gridx = GridBagConstraints.RELATIVE;
+    constraints.gridy = 2;
     constraints.gridwidth = 3;
     constraints.anchor = GridBagConstraints.NORTHWEST;
     constraints.insets = new Insets(0, 0, 15, 5);
@@ -117,21 +155,21 @@
 
     constraints = new GridBagConstraints();
     constraints.gridx = GridBagConstraints.RELATIVE;
-    constraints.gridy = 1;
+    constraints.gridy = 2;
     constraints.gridwidth = 3;
     constraints.anchor = GridBagConstraints.NORTHWEST;
     dialog.add(new JSpinner(textColModel), constraints);
 
     constraints = new GridBagConstraints();
     constraints.gridx = GridBagConstraints.RELATIVE;
-    constraints.gridy = 2;
+    constraints.gridy = 3;
     constraints.gridwidth = GridBagConstraints.RELATIVE;
     constraints.anchor = GridBagConstraints.NORTHWEST;
     dialog.add(cboFeatures, constraints);
 
     constraints = new GridBagConstraints();
     constraints.gridx = GridBagConstraints.RELATIVE;
-    constraints.gridy = 3;
+    constraints.gridy = 4;
     constraints.gridwidth = GridBagConstraints.RELATIVE;
     constraints.anchor = GridBagConstraints.NORTHWEST;
     dialog.add(cboDocuments, constraints);
@@ -146,14 +184,14 @@
         filer.resetChoosableFileFilters();
         filer.setAcceptAllFileFilterUsed(false);
         filer
-          .addChoosableFileFilter((javax.swing.filechooser.FileFilter)CSV_FILE_FILTER);
+            .addChoosableFileFilter((javax.swing.filechooser.FileFilter)CSV_FILE_FILTER);
         filer
-          .setFileFilter((javax.swing.filechooser.FileFilter)CSV_FILE_FILTER);
+            .setFileFilter((javax.swing.filechooser.FileFilter)CSV_FILE_FILTER);
 
         if(filer.showOpenDialog(dialog) != JFileChooser.APPROVE_OPTION) return;
         try {
           txtURL.setText(filer.getSelectedFile().toURI().toURL()
-            .toExternalForm());
+              .toExternalForm());
         } catch(IOException ioe) {
           // do nothing here
         }
@@ -173,84 +211,98 @@
 
         // display the populater dialog and return if it is cancelled
         if(JOptionPane.showConfirmDialog(null, dialog,
-          "Populate From CSV File", JOptionPane.OK_CANCEL_OPTION,
-          JOptionPane.PLAIN_MESSAGE) != JOptionPane.OK_OPTION) return;
+            "Populate From CSV File", JOptionPane.OK_CANCEL_OPTION,
+            JOptionPane.PLAIN_MESSAGE) != JOptionPane.OK_OPTION) return;
 
         // we want to run the population in a separate thread so we don't lock
         // up the GUI
         Thread thread =
-          new Thread(Thread.currentThread().getThreadGroup(),
-            "CSV Corpus Populater") {
+            new Thread(Thread.currentThread().getThreadGroup(),
+                "CSV Corpus Populater") {
 
-            public void run() {
-              try {
-                // see if we can convert the URL to a File instance
-                File file = null;
+              public void run() {
                 try {
-                  file = Files.fileFromURL(new URL(txtURL.getText()));
-                } catch(IllegalArgumentException iae) {
-                  // this will happen if someone enters an actual URL, but we
-                  // handle that later so we can just ignore the exception for
-                  // now and keep going
-                }
 
-                if(file != null && file.isDirectory()) {
-                  // if we have a File instance and that points at a directory
-                  // then....
+                  // unescape the strings that define the format of the file and
+                  // get the actual chars
+                  char separator =
+                      StringEscapeUtils.unescapeJava(txtSeparator.getText())
+                          .charAt(0);
+                  char quote =
+                      StringEscapeUtils.unescapeJava(txtQuoteChar.getText())
+                          .charAt(0);
 
-                  // get all the CSV files in the directory structure
-                  File[] files =
-                    Files.listFilesRecursively(file, CSV_FILE_FILTER);
+                  // see if we can convert the URL to a File instance
+                  File file = null;
+                  try {
+                    file = Files.fileFromURL(new URL(txtURL.getText()));
+                  } catch(IllegalArgumentException iae) {
+                    // this will happen if someone enters an actual URL, but we
+                    // handle that later so we can just ignore the exception for
+                    // now and keep going
+                  }
 
-                  for(File f : files) {
-                    // for each file...
+                  if(file != null && file.isDirectory()) {
+                    // if we have a File instance and that points at a directory
+                    // then....
 
-                    // skip directories as we don't want to handle those
-                    if(f.isDirectory()) continue;
+                    // get all the CSV files in the directory structure
+                    File[] files =
+                        Files.listFilesRecursively(file, CSV_FILE_FILTER);
 
+                    for(File f : files) {
+                      // for each file...
+
+                      // skip directories as we don't want to handle those
+                      if(f.isDirectory()) continue;
+
+                      if(cboDocuments.isSelected()) {
+                        // if we are creating lots of documents from a single
+                        // file
+                        // then call the populate method passing through all the
+                        // options from the GUI
+                        populate((Corpus)handle.getTarget(), f.toURI().toURL(),
+                            (Integer)textColModel.getValue(),
+                            cboFeatures.isSelected(), separator, quote);
+                      } else {
+                        // if we are creating a single document from a single
+                        // file
+                        // then call the createDoc method passing through all
+                        // the
+                        // options from the GUI
+                        createDoc((Corpus)handle.getTarget(),
+                            f.toURI().toURL(),
+                            (Integer)textColModel.getValue(),
+                            cboFeatures.isSelected(), separator, quote);
+                      }
+                    }
+                  } else {
+                    // we have a single URL to process so...
+
                     if(cboDocuments.isSelected()) {
                       // if we are creating lots of documents from a single file
                       // then call the populate method passing through all the
                       // options from the GUI
-                      populate((Corpus)handle.getTarget(), f.toURI().toURL(),
-                        (Integer)textColModel.getValue(),
-                        cboFeatures.isSelected());
+                      populate((Corpus)handle.getTarget(),
+                          new URL(txtURL.getText()),
+                          (Integer)textColModel.getValue(),
+                          cboFeatures.isSelected(), separator, quote);
                     } else {
                       // if we are creating a single document from a single file
                       // then call the createDoc method passing through all the
                       // options from the GUI
-                      createDoc((Corpus)handle.getTarget(), f.toURI().toURL(),
-                        (Integer)textColModel.getValue(),
-                        cboFeatures.isSelected());
+                      createDoc((Corpus)handle.getTarget(),
+                          new URL(txtURL.getText()),
+                          (Integer)textColModel.getValue(),
+                          cboFeatures.isSelected(), separator, quote);
                     }
                   }
-                } else {
-                  // we have a single URL to process so...
-
-                  if(cboDocuments.isSelected()) {
-                    // if we are creating lots of documents from a single file
-                    // then call the populate method passing through all the
-                    // options from the GUI
-                    populate((Corpus)handle.getTarget(),
-                      new URL(txtURL.getText()),
-                      (Integer)textColModel.getValue(),
-                      cboFeatures.isSelected());
-                  } else {
-                    // if we are creating a single document from a single file
-                    // then call the createDoc method passing through all the
-                    // options from the GUI
-                    createDoc((Corpus)handle.getTarget(),
-                      new URL(txtURL.getText()),
-                      (Integer)textColModel.getValue(),
-                      cboFeatures.isSelected());
-                  }
+                } catch(Exception e) {
+                  // TODO give a sensible error message
+                  e.printStackTrace();
                 }
-              } catch(Exception e) {
-                // TODO give a sensible error message
-                e.printStackTrace();
               }
-            }
-          };
+            };
 
         // let's leave the GUI nice and responsive
         thread.setPriority(Thread.MIN_PRIORITY);
@@ -264,6 +316,11 @@
     return actions;
   }
 
+  public static void populate(Corpus corpus, URL csv, int column,
+      boolean colLabels) {
+    populate(corpus, csv, column, colLabels, ',', '"');
+  }
+
   /**
    * Create a new document from each row and push it into the specified corpus
    * 
@@ -275,13 +332,20 @@
    *          the (zero index based) column which contains the text content
    * @param colLabels
    *          true if the first row contains column labels, true otherwise
+   * @param separator
+   *          the character that is used to separate columns (usually ,)
+   * @param quote
+   *          the character used to quote data that includes the column
+   *          separator (usually ")
    */
   public static void populate(Corpus corpus, URL csv, int column,
-                              boolean colLabels) {
+      boolean colLabels, char separator, char quote) {
     CSVReader reader = null;
     try {
       // open a CSVReader over the URL
-      reader = new CSVReader(new InputStreamReader(csv.openStream()));
+      reader =
+          new CSVReader(new InputStreamReader(csv.openStream()), separator,
+              quote);
 
       // if we are adding features read the first line
       String[] features = (colLabels ? reader.readNext() : null);
@@ -311,12 +375,12 @@
         // setup the initialization params for the document
         FeatureMap params = Factory.newFeatureMap();
         params.put(Document.DOCUMENT_STRING_CONTENT_PARAMETER_NAME,
-          nextLine[column]);
+            nextLine[column]);
 
         // create the document
         Document doc =
-          (Document)Factory.createResource(
-            gate.corpora.DocumentImpl.class.getName(), params, fmap);
+            (Document)Factory.createResource(
+                gate.corpora.DocumentImpl.class.getName(), params, fmap);
 
         // add the document to the corpus
         corpus.add(doc);
@@ -343,6 +407,11 @@
     }
   }
 
+  public static void createDoc(Corpus corpus, URL csv, int column,
+      boolean colLabels) {
+    createDoc(corpus, csv, column, colLabels, ',', '"');
+  }
+
   /**
    * Creates a single document from the CSV file
    * 
@@ -354,14 +423,21 @@
    *          the (zero index based) column which contains the text content
    * @param colLabels
    *          true if the first row contains column labels, true otherwise
+   * @param separator
+   *          the character that is used to separate columns (usually ,)
+   * @param quote
+   *          the character used to quote data that includes the column
+   *          separator (usually ")
    */
   public static void createDoc(Corpus corpus, URL csv, int column,
-                               boolean colLabels) {
+      boolean colLabels, char separator, char quote) {
     CSVReader reader = null;
     Document doc = null;
     try {
       // open a CSVReader over the URL
-      reader = new CSVReader(new InputStreamReader(csv.openStream()));
+      reader =
+          new CSVReader(new InputStreamReader(csv.openStream()), separator,
+              quote);
 
       // if we are adding features read the first line
       String[] features = (colLabels ? reader.readNext() : null);
@@ -396,14 +472,14 @@
         long length = doc.getContent().size();
 
         // add the new text to the document
-        doc.edit(length, length, new DocumentContentImpl(nextLine[column] +
-          "\n\n"));
+        doc.edit(length, length, new DocumentContentImpl(nextLine[column]
+            + "\n\n"));
 
         // add the spanning annotation to the Original markups set, we use the
         // type "Text" if the columns don't have labels
         doc.getAnnotations("Original markups").add(length,
-          length + nextLine[column].length(),
-          (colLabels ? features[column] : "Text"), fmap);
+            length + nextLine[column].length(),
+            (colLabels ? features[column] : "Text"), fmap);
       }
 
       // store the original csv file URL as a document feature
@@ -413,7 +489,7 @@
       // created into the init param that will be used if the document is
       // recreated
       doc.setParameterValue(Document.DOCUMENT_STRING_CONTENT_PARAMETER_NAME,
-        doc.toXml());
+          doc.toXml());
 
       // add the document to the corpus
       corpus.add(doc);

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
Subversion Kills Productivity. Get off Subversion & Make the Move to Perforce.
With Perforce, you get hassle-free workflows. Merge that actually works. 
Faster operations. Version large binaries.  Built-in WAN optimization and the
freedom to use Git, Perforce or both. Make the move to Perforce.
http://pubads.g.doubleclick.net/gampad/clk?id=122218951&iu=/4140/ostg.clktrk
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs

Reply via email to