This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/main by this push:
     new 7c45137  ORC-1023: Support writing bloom filters in ConvertTool (#933)
7c45137 is described below

commit 7c451371add581472c05164c0c0951bb64963116
Author: Quanlong Huang <[email protected]>
AuthorDate: Mon Oct 11 23:25:23 2021 +0800

    ORC-1023: Support writing bloom filters in ConvertTool (#933)
    
    ### What changes were proposed in this pull request?
    
    This PR adds an option to the Java tool ConvertTool to specify the columns
    for which it should generate bloom filters.
    
    ### Why are the changes needed?
    
    While debugging an issue, I needed to generate an ORC file with bloom filters
    using the Java APIs. The ConvertTool is easy to use, but it doesn't generate
    bloom filters. It'd be helpful to add an option for it.
    
    ### How was this patch tested?
    
    I didn't find any existing tests for ConvertTool, so I manually tested it and
    verified that the bloom filters are generated.
---
 .../java/org/apache/orc/tools/convert/ConvertTool.java    | 15 +++++++++++++--
 site/_docs/java-tools.md                                  | 10 +++++++---
 2 files changed, 20 insertions(+), 5 deletions(-)

diff --git a/java/tools/src/java/org/apache/orc/tools/convert/ConvertTool.java b/java/tools/src/java/org/apache/orc/tools/convert/ConvertTool.java
index fbb2337..87a7015 100644
--- a/java/tools/src/java/org/apache/orc/tools/convert/ConvertTool.java
+++ b/java/tools/src/java/org/apache/orc/tools/convert/ConvertTool.java
@@ -58,6 +58,7 @@ public class ConvertTool {
   private final int csvHeaderLines;
   private final String csvNullString;
   private final String timestampFormat;
+  private final String bloomFilterColumns;
   private final Writer writer;
   private final VectorizedRowBatch batch;
 
@@ -194,11 +195,17 @@ public class ConvertTool {
     this.csvHeaderLines = getIntOption(opts, 'H', 0);
     this.csvNullString = opts.getOptionValue('n', "");
     this.timestampFormat = opts.getOptionValue("t", DEFAULT_TIMESTAMP_FORMAT);
+    this.bloomFilterColumns = opts.getOptionValue('b', null);
     String outFilename = opts.hasOption('o')
         ? opts.getOptionValue('o') : "output.orc";
     boolean overwrite = opts.hasOption('O');
-    writer = OrcFile.createWriter(new Path(outFilename),
-        OrcFile.writerOptions(conf).setSchema(schema).overwrite(overwrite));
+    OrcFile.WriterOptions writerOpts = OrcFile.writerOptions(conf)
+        .setSchema(schema)
+        .overwrite(overwrite);
+    if (this.bloomFilterColumns != null) {
+      writerOpts.bloomFilterColumns(this.bloomFilterColumns);
+    }
+    writer = OrcFile.createWriter(new Path(outFilename), writerOpts);
     batch = schema.createRowBatch();
   }
 
@@ -239,6 +246,10 @@ public class ConvertTool {
         Option.builder("s").longOpt("schema").hasArg()
             .desc("The schema to write in to the file").build());
     options.addOption(
+        Option.builder("b").longOpt("bloomFilterColumns").hasArg()
+            .desc("Comma separated values of column names for which bloom filter is " +
+                "to be created").build());
+    options.addOption(
         Option.builder("o").longOpt("output").desc("Output filename")
             .hasArg().build());
     options.addOption(
diff --git a/site/_docs/java-tools.md b/site/_docs/java-tools.md
index 8358911..8630b33 100644
--- a/site/_docs/java-tools.md
+++ b/site/_docs/java-tools.md
@@ -11,7 +11,7 @@ supports both the local file system and HDFS.
 
 The subcommands for the tools are:
 
-  * convert (since ORC 1.4) - convert JSON files to ORC
+  * convert (since ORC 1.4) - convert JSON/CSV files to ORC
   * count (since ORC 1.6) - recursively find *.orc and print the number of rows
   * data - print the data of an ORC file
   * json-schema (since ORC 1.4) - determine the schema of JSON documents
@@ -28,9 +28,13 @@ The command line looks like:
 
 ## Java Convert
 
-The convert command reads several JSON files and converts them into a
+The convert command reads several JSON/CSV files and converts them into a
 single ORC file.
 
+`-b,--bloomFilterColumns <columns>`
+  : Comma separated values of column names for which bloom filter is to be created.
+  By default, no bloom filters will be created.
+
 `-e,--escape <escape>`
   : Sets CSV escape character
 
@@ -311,4 +315,4 @@ cost of printing the data out.
 
 ## Java Version
 
-The version command prints the version of this ORC tool.
\ No newline at end of file
+The version command prints the version of this ORC tool.

Reply via email to