ORC-386 Add spark benchmarks.

I also refactored all of the old benchmarks to factor out the duplicated
code, and split the project into three modules to
separate the shared code, the code that depends on hive, and the code that
depends on spark. Avoiding building an uber
jar that contains both hive and spark made life much easier.

Fixes #290

Signed-off-by: Owen O'Malley <omal...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/orc/repo
Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/a6211816
Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/a6211816
Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/a6211816

Branch: refs/heads/master
Commit: a621181684b3db0311086fd777f99f88813e28a0
Parents: edbb967
Author: Owen O'Malley <omal...@apache.org>
Authored: Sat Jun 9 23:37:31 2018 +0200
Committer: Owen O'Malley <omal...@apache.org>
Committed: Fri Jul 13 14:33:53 2018 -0700

----------------------------------------------------------------------
 java/bench/README.md                            |  22 +-
 java/bench/core/pom.xml                         | 141 ++++
 java/bench/core/src/assembly/uber.xml           |  33 +
 java/bench/core/src/findbugs/exclude.xml        |  25 +
 .../hadoop/fs/TrackingLocalFileSystem.java      |  65 ++
 .../apache/orc/bench/core/BenchmarkOptions.java |  63 ++
 .../apache/orc/bench/core/CompressionKind.java  |  96 +++
 .../java/org/apache/orc/bench/core/Driver.java  |  65 ++
 .../apache/orc/bench/core/NullFileSystem.java   | 120 ++++
 .../org/apache/orc/bench/core/OrcBenchmark.java |  44 ++
 .../apache/orc/bench/core/RandomGenerator.java  | 524 ++++++++++++++
 .../org/apache/orc/bench/core/ReadCounters.java |  86 +++
 .../apache/orc/bench/core/RecordCounters.java   |  52 ++
 .../apache/orc/bench/core/SalesGenerator.java   | 206 ++++++
 .../org/apache/orc/bench/core/Utilities.java    | 110 +++
 .../orc/bench/core/convert/BatchReader.java     |  34 +
 .../orc/bench/core/convert/BatchWriter.java     |  35 +
 .../bench/core/convert/GenerateVariants.java    | 234 +++++++
 .../orc/bench/core/convert/ScanVariants.java    | 100 +++
 .../orc/bench/core/convert/avro/AvroReader.java | 304 ++++++++
 .../core/convert/avro/AvroSchemaUtils.java      | 192 +++++
 .../orc/bench/core/convert/avro/AvroWriter.java | 363 ++++++++++
 .../orc/bench/core/convert/csv/CsvReader.java   | 172 +++++
 .../orc/bench/core/convert/json/JsonReader.java | 277 ++++++++
 .../orc/bench/core/convert/json/JsonWriter.java | 217 ++++++
 .../orc/bench/core/convert/orc/OrcReader.java   |  50 ++
 .../orc/bench/core/convert/orc/OrcWriter.java   |  54 ++
 .../core/convert/parquet/ParquetReader.java     |  66 ++
 .../core/convert/parquet/ParquetWriter.java     |  83 +++
 java/bench/core/src/resources/github.schema     | 702 +++++++++++++++++++
 java/bench/core/src/resources/log4j.properties  |  18 +
 java/bench/core/src/resources/sales.schema      |  56 ++
 java/bench/core/src/resources/taxi.schema       |  21 +
 java/bench/hive/pom.xml                         | 138 ++++
 java/bench/hive/src/assembly/uber.xml           |  33 +
 java/bench/hive/src/findbugs/exclude.xml        |  25 +
 .../hive/ql/io/orc/OrcBenchmarkUtilities.java   |  54 ++
 .../bench/hive/ColumnProjectionBenchmark.java   | 149 ++++
 .../org/apache/orc/bench/hive/DecimalBench.java | 253 +++++++
 .../orc/bench/hive/FullReadBenchmark.java       | 181 +++++
 java/bench/pom.xml                              | 643 ++++++++++++-----
 java/bench/spark/pom.xml                        | 203 ++++++
 .../apache/orc/bench/spark/SparkBenchmark.java  | 292 ++++++++
 .../org/apache/orc/bench/spark/SparkSchema.java |  95 +++
 java/bench/src/assembly/uber.xml                |  33 -
 java/bench/src/findbugs/exclude.xml             |  25 -
 .../hadoop/fs/TrackingLocalFileSystem.java      |  57 --
 .../hive/ql/io/orc/OrcBenchmarkUtilities.java   |  54 --
 .../orc/bench/ColumnProjectionBenchmark.java    | 188 -----
 .../org/apache/orc/bench/CompressionKind.java   |  87 ---
 .../java/org/apache/orc/bench/DecimalBench.java | 272 -------
 .../src/java/org/apache/orc/bench/Driver.java   |  82 ---
 .../org/apache/orc/bench/FullReadBenchmark.java | 223 ------
 .../org/apache/orc/bench/NullFileSystem.java    | 121 ----
 .../org/apache/orc/bench/RandomGenerator.java   | 524 --------------
 .../org/apache/orc/bench/SalesGenerator.java    | 206 ------
 .../java/org/apache/orc/bench/Utilities.java    | 127 ----
 .../apache/orc/bench/convert/BatchReader.java   |  34 -
 .../apache/orc/bench/convert/BatchWriter.java   |  34 -
 .../orc/bench/convert/GenerateVariants.java     | 220 ------
 .../apache/orc/bench/convert/ScanVariants.java  |  87 ---
 .../orc/bench/convert/avro/AvroReader.java      | 299 --------
 .../orc/bench/convert/avro/AvroSchemaUtils.java | 192 -----
 .../orc/bench/convert/avro/AvroWriter.java      | 363 ----------
 .../apache/orc/bench/convert/csv/CsvReader.java | 175 -----
 .../orc/bench/convert/json/JsonReader.java      | 279 --------
 .../orc/bench/convert/json/JsonWriter.java      | 217 ------
 .../apache/orc/bench/convert/orc/OrcReader.java |  50 --
 .../apache/orc/bench/convert/orc/OrcWriter.java |  54 --
 .../bench/convert/parquet/ParquetReader.java    | 297 --------
 .../bench/convert/parquet/ParquetWriter.java    |  86 ---
 java/bench/src/main/resources/github.schema     | 702 -------------------
 java/bench/src/main/resources/log4j.properties  |  18 -
 java/bench/src/main/resources/sales.schema      |  56 --
 java/bench/src/main/resources/taxi.schema       |  21 -
 java/pom.xml                                    |   4 +
 76 files changed, 6539 insertions(+), 5344 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/orc/blob/a6211816/java/bench/README.md
----------------------------------------------------------------------
diff --git a/java/bench/README.md b/java/bench/README.md
index 12cedea..f49404d 100644
--- a/java/bench/README.md
+++ b/java/bench/README.md
@@ -7,6 +7,12 @@ These big data file format benchmarks, compare:
 * ORC
 * Parquet
 
+There are three sub-modules to try to mitigate dependency hell:
+
+* core - the shared part of the benchmarks
+* hive - the Hive benchmarks
+* spark - the Spark benchmarks
+
 To build this library:
 
 ```% mvn clean package```
@@ -17,17 +23,25 @@ To fetch the source data:
 
 To generate the derived data:
 
-```% java -jar target/orc-benchmarks-*-uber.jar generate data```
+```% java -jar core/target/orc-benchmarks-core-*-uber.jar generate data```
 
 To run a scan of all of the data:
 
-```% java -jar target/orc-benchmarks-*-uber.jar scan data```
+```% java -jar core/target/orc-benchmarks-core-*-uber.jar scan data```
 
 To run full read benchmark:
 
-```% java -jar target/orc-benchmarks-*-uber.jar read-all data```
+```% java -jar hive/target/orc-benchmarks-hive-*-uber.jar read-all data```
 
 To run column projection benchmark:
 
-```% java -jar target/orc-benchmarks-*-uber.jar read-some data```
+```% java -jar hive/target/orc-benchmarks-hive-*-uber.jar read-some data```
+
+To run decimal/decimal64 benchmark:
+
+```% java -jar hive/target/orc-benchmarks-hive-*-uber.jar decimal data```
+
+To run spark benchmark:
+
+```% java -jar spark/target/orc-benchmarks-spark-*.jar spark data```
 

http://git-wip-us.apache.org/repos/asf/orc/blob/a6211816/java/bench/core/pom.xml
----------------------------------------------------------------------
diff --git a/java/bench/core/pom.xml b/java/bench/core/pom.xml
new file mode 100644
index 0000000..d0dcc69
--- /dev/null
+++ b/java/bench/core/pom.xml
@@ -0,0 +1,141 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 
http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <parent>
+    <groupId>org.apache.orc</groupId>
+    <artifactId>orc-benchmarks</artifactId>
+    <version>1.6.0-SNAPSHOT</version>
+    <relativePath>..</relativePath>
+  </parent>
+
+  <groupId>org.apache.orc</groupId>
+  <artifactId>orc-benchmarks-core</artifactId>
+  <version>1.6.0-SNAPSHOT</version>
+  <packaging>jar</packaging>
+  <name>ORC Benchmarks Core</name>
+  <description>
+    The core parts of the benchmarks for comparing performance across formats.
+  </description>
+
+  <dependencies>
+    <dependency>
+      <groupId>com.fasterxml.jackson.core</groupId>
+      <artifactId>jackson-core</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>com.google.auto.service</groupId>
+      <artifactId>auto-service</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>com.google.code.gson</groupId>
+      <artifactId>gson</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>commons-cli</groupId>
+      <artifactId>commons-cli</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>io.airlift</groupId>
+      <artifactId>aircompressor</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.avro</groupId>
+      <artifactId>avro</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.avro</groupId>
+      <artifactId>avro-mapred</artifactId>
+      <classifier>hadoop2</classifier>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.commons</groupId>
+      <artifactId>commons-csv</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-common</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.hive</groupId>
+      <artifactId>hive-storage-api</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.orc</groupId>
+      <artifactId>orc-core</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.parquet</groupId>
+      <artifactId>parquet-avro</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.parquet</groupId>
+      <artifactId>parquet-hadoop</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.openjdk.jmh</groupId>
+      <artifactId>jmh-core</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.slf4j</groupId>
+      <artifactId>slf4j-api</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.slf4j</groupId>
+      <artifactId>slf4j-log4j12</artifactId>
+    </dependency>
+  </dependencies>
+
+  <build>
+    <sourceDirectory>${basedir}/src/java</sourceDirectory>
+    <testSourceDirectory>${basedir}/src/test</testSourceDirectory>
+    <resources>
+      <resource>
+        <directory>src/resources</directory>
+      </resource>
+    </resources>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-compiler-plugin</artifactId>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-enforcer-plugin</artifactId>
+      </plugin>
+      <plugin>
+        <artifactId>maven-assembly-plugin</artifactId>
+        <configuration>
+          <archive>
+            <manifest>
+              <mainClass>org.apache.orc.bench.core.Driver</mainClass>
+            </manifest>
+          </archive>
+        </configuration>
+      </plugin>
+    </plugins>
+  </build>
+
+  <profiles>
+    <profile>
+      <id>cmake</id>
+      <build>
+        <directory>${build.dir}/bench/core</directory>
+      </build>
+    </profile>
+  </profiles>
+</project>

http://git-wip-us.apache.org/repos/asf/orc/blob/a6211816/java/bench/core/src/assembly/uber.xml
----------------------------------------------------------------------
diff --git a/java/bench/core/src/assembly/uber.xml 
b/java/bench/core/src/assembly/uber.xml
new file mode 100644
index 0000000..014eab9
--- /dev/null
+++ b/java/bench/core/src/assembly/uber.xml
@@ -0,0 +1,33 @@
+<!--
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<assembly>
+  <id>uber</id>
+  <formats>
+    <format>jar</format>
+  </formats>
+  <includeBaseDirectory>false</includeBaseDirectory>
+  <dependencySets>
+    <dependencySet>
+      <outputDirectory>/</outputDirectory>
+      <useProjectArtifact>true</useProjectArtifact>
+      <unpack>true</unpack>
+      <scope>runtime</scope>
+    </dependencySet>
+  </dependencySets>
+  <containerDescriptorHandlers>
+    <containerDescriptorHandler>
+      <handlerName>metaInf-services</handlerName>
+    </containerDescriptorHandler>
+  </containerDescriptorHandlers>
+</assembly>

http://git-wip-us.apache.org/repos/asf/orc/blob/a6211816/java/bench/core/src/findbugs/exclude.xml
----------------------------------------------------------------------
diff --git a/java/bench/core/src/findbugs/exclude.xml 
b/java/bench/core/src/findbugs/exclude.xml
new file mode 100644
index 0000000..dde1471
--- /dev/null
+++ b/java/bench/core/src/findbugs/exclude.xml
@@ -0,0 +1,25 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<FindBugsFilter>
+  <Match>
+    <Bug pattern="EI_EXPOSE_REP,EI_EXPOSE_REP2"/>
+  </Match>
+  <Match>
+    <Class name="~org\.openjdk\.jmh\.infra\.generated.*"/>
+  </Match>
+  <Match>
+    <Class name="~org\.apache\.orc\.bench\.generated.*"/>
+  </Match>
+</FindBugsFilter>

http://git-wip-us.apache.org/repos/asf/orc/blob/a6211816/java/bench/core/src/java/org/apache/hadoop/fs/TrackingLocalFileSystem.java
----------------------------------------------------------------------
diff --git 
a/java/bench/core/src/java/org/apache/hadoop/fs/TrackingLocalFileSystem.java 
b/java/bench/core/src/java/org/apache/hadoop/fs/TrackingLocalFileSystem.java
new file mode 100644
index 0000000..bd3b027
--- /dev/null
+++ b/java/bench/core/src/java/org/apache/hadoop/fs/TrackingLocalFileSystem.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.fs;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.net.URI;
+
+public class TrackingLocalFileSystem extends RawLocalFileSystem {
+  static final URI NAME = URI.create("track:///");
+
+  class TrackingFileInputStream extends 
RawLocalFileSystem.LocalFSFileInputStream {
+
+    public TrackingFileInputStream(Path f) throws IOException {
+      super(f);
+    }
+
+    public int read() throws IOException {
+      statistics.incrementReadOps(1);
+      return super.read();
+    }
+
+    public int read(byte[] b, int off, int len) throws IOException {
+      statistics.incrementReadOps(1);
+      return super.read(b, off, len);
+    }
+
+    public int read(long position, byte[] b, int off, int len) throws 
IOException {
+      statistics.incrementReadOps(1);
+      return super.read(position, b, off, len);
+    }
+  }
+
+  public FSDataInputStream open(Path f, int bufferSize) throws IOException {
+    if (!exists(f)) {
+      throw new FileNotFoundException(f.toString());
+    }
+    return new FSDataInputStream(new BufferedFSInputStream(
+        new TrackingFileInputStream(f), bufferSize));
+  }
+
+  @Override
+  public URI getUri() {
+    return NAME;
+  }
+
+  public FileSystem.Statistics getLocalStatistics() {
+    return statistics;
+  }
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/a6211816/java/bench/core/src/java/org/apache/orc/bench/core/BenchmarkOptions.java
----------------------------------------------------------------------
diff --git 
a/java/bench/core/src/java/org/apache/orc/bench/core/BenchmarkOptions.java 
b/java/bench/core/src/java/org/apache/orc/bench/core/BenchmarkOptions.java
new file mode 100644
index 0000000..a64c605
--- /dev/null
+++ b/java/bench/core/src/java/org/apache/orc/bench/core/BenchmarkOptions.java
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.bench.core;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.DefaultParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.ParseException;
+
+public class BenchmarkOptions {
+
+  public static final String HELP = "help";
+  public static final String ITERATIONS = "iterations";
+  public static final String WARMUP_ITERATIONS = "warmup-iterations";
+  public static final String FORK = "fork";
+  public static final String TIME = "time";
+  public static final String MIN_MEMORY = "min-memory";
+  public static final String MAX_MEMORY = "max-memory";
+  public static final String GC = "gc";
+
+  public static CommandLine parseCommandLine(String[] args) {
+    Options options = new Options()
+        .addOption("h", HELP, false, "Provide help")
+        .addOption("i", ITERATIONS, true, "Number of iterations")
+        .addOption("I", WARMUP_ITERATIONS, true, "Number of warmup iterations")
+        .addOption("f", FORK, true, "How many forks to use")
+        .addOption("t", TIME, true, "How long each iteration is in seconds")
+        .addOption("m", MIN_MEMORY, true, "The minimum size of each JVM")
+        .addOption("M", MAX_MEMORY, true, "The maximum size of each JVM")
+        .addOption("g", GC, false, "Should GC be profiled");
+    CommandLine result;
+    try {
+      result = new DefaultParser().parse(options, args, true);
+    } catch (ParseException pe) {
+      System.err.println("Argument exception - " + pe.getMessage());
+      result = null;
+    }
+    if (result == null || result.hasOption(HELP) || result.getArgs().length == 
0) {
+      new HelpFormatter().printHelp("java -jar <jar> <command> <options> 
<data>",
+          options);
+      System.err.println();
+      System.exit(1);
+    }
+    return result;
+  }
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/a6211816/java/bench/core/src/java/org/apache/orc/bench/core/CompressionKind.java
----------------------------------------------------------------------
diff --git 
a/java/bench/core/src/java/org/apache/orc/bench/core/CompressionKind.java 
b/java/bench/core/src/java/org/apache/orc/bench/core/CompressionKind.java
new file mode 100644
index 0000000..2cd783d
--- /dev/null
+++ b/java/bench/core/src/java/org/apache/orc/bench/core/CompressionKind.java
@@ -0,0 +1,96 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.bench.core;
+
+import io.airlift.compress.snappy.SnappyCodec;
+import org.apache.hadoop.fs.Path;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.zip.GZIPInputStream;
+import java.util.zip.GZIPOutputStream;
+
+/**
+ * Enum for handling the compression codecs for the benchmark
+ */
+public enum CompressionKind {
+  NONE("none"),
+  ZLIB("gz"),
+  SNAPPY("snappy");
+
+  CompressionKind(String extension) {
+    this.extension = extension;
+  }
+
+  private final String extension;
+
+  public String getExtension() {
+    return extension;
+  }
+
+  public OutputStream create(OutputStream out) throws IOException {
+    switch (this) {
+      case NONE:
+        return out;
+      case ZLIB:
+        return new GZIPOutputStream(out);
+      case SNAPPY:
+        return new SnappyCodec().createOutputStream(out);
+      default:
+        throw new IllegalArgumentException("Unhandled kind " + this);
+    }
+  }
+
+  public InputStream read(InputStream in) throws IOException {
+    switch (this) {
+      case NONE:
+        return in;
+      case ZLIB:
+        return new GZIPInputStream(in);
+      case SNAPPY:
+        return new SnappyCodec().createInputStream(in);
+      default:
+        throw new IllegalArgumentException("Unhandled kind " + this);
+    }
+  }
+
+  public static CompressionKind fromPath(Path path) {
+    String name = path.getName();
+    int lastDot = name.lastIndexOf('.');
+    if (lastDot >= 0) {
+      String ext = name.substring(lastDot);
+      for (CompressionKind value : values()) {
+        if (ext.equals("." + value.getExtension())) {
+          return value;
+        }
+      }
+    }
+    return NONE;
+  }
+
+  public static CompressionKind fromExtension(String extension) {
+    for (CompressionKind value: values()) {
+      if (value.extension.equals(extension)) {
+        return value;
+      }
+    }
+    throw new IllegalArgumentException("Unknown compression " + extension);
+  }
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/a6211816/java/bench/core/src/java/org/apache/orc/bench/core/Driver.java
----------------------------------------------------------------------
diff --git a/java/bench/core/src/java/org/apache/orc/bench/core/Driver.java 
b/java/bench/core/src/java/org/apache/orc/bench/core/Driver.java
new file mode 100644
index 0000000..08b1288
--- /dev/null
+++ b/java/bench/core/src/java/org/apache/orc/bench/core/Driver.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.bench.core;
+
+import java.util.Arrays;
+import java.util.Map;
+import java.util.ServiceLoader;
+import java.util.TreeMap;
+
+/**
+ * A driver tool to call the various benchmark classes.
+ */
+public class Driver {
+  private static final ServiceLoader<OrcBenchmark> loader =
+      ServiceLoader.load(OrcBenchmark.class);
+
+  private static Map<String, OrcBenchmark> getBenchmarks() {
+    Map<String, OrcBenchmark> result = new TreeMap<>();
+    for(OrcBenchmark bench: loader) {
+      result.put(bench.getName(), bench);
+    }
+    return result;
+  }
+
+  private static final String PATTERN = "  %10s - %s";
+
+  private static void printUsageAndExit(Map<String, OrcBenchmark> benchmarks) {
+    System.err.println("Commands:");
+    for(OrcBenchmark bench: benchmarks.values()) {
+      System.err.println(String.format(PATTERN, bench.getName(),
+          bench.getDescription()));
+    }
+    System.exit(1);
+  }
+
+  public static void main(String[] args) throws Exception {
+    Map<String, OrcBenchmark> benchmarks = getBenchmarks();
+    if (args.length == 0) {
+      printUsageAndExit(benchmarks);
+    }
+    String command = args[0];
+    args = Arrays.copyOfRange(args, 1, args.length);
+    OrcBenchmark bench = benchmarks.get(command);
+    if (bench == null) {
+      printUsageAndExit(benchmarks);
+    }
+    bench.run(args);
+  }
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/a6211816/java/bench/core/src/java/org/apache/orc/bench/core/NullFileSystem.java
----------------------------------------------------------------------
diff --git 
a/java/bench/core/src/java/org/apache/orc/bench/core/NullFileSystem.java 
b/java/bench/core/src/java/org/apache/orc/bench/core/NullFileSystem.java
new file mode 100644
index 0000000..0907d62
--- /dev/null
+++ b/java/bench/core/src/java/org/apache/orc/bench/core/NullFileSystem.java
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.bench.core;
+
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.permission.FsPermission;
+import org.apache.hadoop.util.Progressable;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.net.URI;
+import java.net.URISyntaxException;
+
+public class NullFileSystem extends FileSystem {
+  @Override
+  public URI getUri() {
+    try {
+      return new URI("null:///");
+    } catch (URISyntaxException e) {
+      throw new IllegalArgumentException("Bad URL", e);
+    }
+  }
+
+  @Override
+  public FSDataInputStream open(Path path, int i) {
+    return new FSDataInputStream(new InputStream() {
+      @Override
+      public int read() {
+        return -1;
+      }
+    });
+  }
+
+  static class NullOutput extends OutputStream {
+
+    @Override
+    public void write(int b) {
+      // pass
+    }
+
+    public void write(byte[] buffer, int offset, int length) {
+      // pass
+    }
+  }
+  private static final OutputStream NULL_OUTPUT = new NullOutput();
+
+  @Override
+  public FSDataOutputStream create(Path path,
+                                   FsPermission fsPermission,
+                                   boolean b,
+                                   int i,
+                                   short i1,
+                                   long l,
+                                   Progressable progressable) throws 
IOException {
+    return new FSDataOutputStream(NULL_OUTPUT, null);
+  }
+
+  @Override
+  public FSDataOutputStream append(Path path,
+                                   int i,
+                                   Progressable progressable) throws 
IOException {
+    return new FSDataOutputStream(NULL_OUTPUT, null);
+  }
+
+  @Override
+  public boolean rename(Path path, Path path1) {
+    return false;
+  }
+
+  @Override
+  public boolean delete(Path path, boolean b) {
+    return false;
+  }
+
+  @Override
+  public FileStatus[] listStatus(Path path)  {
+    return null;
+  }
+
+  @Override
+  public void setWorkingDirectory(Path path) {
+    // pass
+  }
+
+  @Override
+  public Path getWorkingDirectory() {
+    return null;
+  }
+
+  @Override
+  public boolean mkdirs(Path path, FsPermission fsPermission) {
+    return false;
+  }
+
+  @Override
+  public FileStatus getFileStatus(Path path) {
+    return null;
+  }
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/a6211816/java/bench/core/src/java/org/apache/orc/bench/core/OrcBenchmark.java
----------------------------------------------------------------------
diff --git 
a/java/bench/core/src/java/org/apache/orc/bench/core/OrcBenchmark.java 
b/java/bench/core/src/java/org/apache/orc/bench/core/OrcBenchmark.java
new file mode 100644
index 0000000..63290fa
--- /dev/null
+++ b/java/bench/core/src/java/org/apache/orc/bench/core/OrcBenchmark.java
@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.bench.core;
+
/**
 * API to support adding additional benchmarks to the Driver.
 * The Driver selects an implementation by matching the user's subcommand
 * against {@link #getName()} and then invokes {@link #run(String[])}.
 */
public interface OrcBenchmark {

  /**
   * Get the name of the subcommand to invoke this benchmark.
   * @return a simple string, hopefully lowercase
   */
  String getName();

  /**
   * The human readable description of this benchmark.
   * @return a short, one-line description for the usage message
   */
  String getDescription();

  /**
   * Run the benchmark.
   * @param args the arguments from the user
   * @throws Exception if the benchmark fails for any reason
   */
  void run(String[] args) throws Exception;
}

http://git-wip-us.apache.org/repos/asf/orc/blob/a6211816/java/bench/core/src/java/org/apache/orc/bench/core/RandomGenerator.java
----------------------------------------------------------------------
diff --git 
a/java/bench/core/src/java/org/apache/orc/bench/core/RandomGenerator.java 
b/java/bench/core/src/java/org/apache/orc/bench/core/RandomGenerator.java
new file mode 100644
index 0000000..9220775
--- /dev/null
+++ b/java/bench/core/src/java/org/apache/orc/bench/core/RandomGenerator.java
@@ -0,0 +1,524 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.bench.core;
+
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.orc.TypeDescription;
+
+import java.nio.charset.StandardCharsets;
+import java.sql.Timestamp;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+
+public class RandomGenerator {
+  private final TypeDescription schema = TypeDescription.createStruct();
+  private final List<Field> fields = new ArrayList<>();
+  private final Random random;
+
+  public RandomGenerator(int seed) {
+    random = new Random(seed);
+  }
+
+  private abstract class ValueGenerator {
+    double nullProbability = 0;
+    abstract void generate(ColumnVector vector, int valueCount);
+  }
+
+  private class RandomBoolean extends ValueGenerator {
+    public void generate(ColumnVector v, int valueCount) {
+      LongColumnVector vector = (LongColumnVector) v;
+      for(int r=0; r < valueCount; ++r) {
+        if (nullProbability != 0 && random.nextDouble() < nullProbability) {
+          v.noNulls = false;
+          v.isNull[r] = true;
+        } else {
+          vector.vector[r] = random.nextInt(2);
+        }
+      }
+    }
+  }
+
+  private class RandomList extends ValueGenerator {
+    private final int minSize;
+    private final int sizeRange;
+    private final Field child;
+
+    public RandomList(int minSize, int maxSize, Field child) {
+      this.minSize = minSize;
+      this.sizeRange = maxSize - minSize + 1;
+      this.child = child;
+    }
+
+    public void generate(ColumnVector v, int valueCount) {
+      ListColumnVector vector = (ListColumnVector) v;
+      for(int r=0; r < valueCount; ++r) {
+        if (nullProbability != 0 && random.nextDouble() < nullProbability) {
+          v.noNulls = false;
+          v.isNull[r] = true;
+        } else {
+          vector.offsets[r] = vector.childCount;
+          vector.lengths[r] = random.nextInt(sizeRange) + minSize;
+          vector.childCount += vector.lengths[r];
+        }
+      }
+      vector.child.ensureSize(vector.childCount, false);
+      child.generator.generate(vector.child, vector.childCount);
+    }
+  }
+
+  private class RandomStruct extends ValueGenerator {
+    private final Field[] children;
+
+    public RandomStruct(Field[] children) {
+      this.children = children;
+    }
+
+    public void generate(ColumnVector v, int valueCount) {
+      StructColumnVector vector = (StructColumnVector) v;
+      for(int r=0; r < valueCount; ++r) {
+        if (nullProbability != 0 && random.nextDouble() < nullProbability) {
+          v.noNulls = false;
+          v.isNull[r] = true;
+        }
+      }
+      for(int c=0; c < children.length; ++c) {
+        children[c].generator.generate(vector.fields[c], valueCount);
+      }
+    }
+  }
+
+  private abstract class IntegerGenerator extends ValueGenerator {
+    private final long sign;
+    private final long mask;
+
+    private IntegerGenerator(TypeDescription.Category kind) {
+      int bits = getIntegerLength(kind);
+      mask = bits == 64 ? 0 : -1L << bits;
+      sign = 1L << (bits - 1);
+    }
+
+    protected void normalize(LongColumnVector vector, int valueCount) {
+      // make sure the value stays in range by sign extending it
+      for(int r=0; r < valueCount; ++r) {
+        if ((vector.vector[r] & sign) == 0) {
+          vector.vector[r] &= ~mask;
+        } else {
+          vector.vector[r] |= mask;
+        }
+      }
+    }
+  }
+
+  private class AutoIncrement extends IntegerGenerator {
+    private long value;
+    private final long increment;
+
+    private AutoIncrement(TypeDescription.Category kind, long start,
+                          long increment) {
+      super(kind);
+      this.value = start;
+      this.increment = increment;
+    }
+
+    public void generate(ColumnVector v, int valueCount) {
+      LongColumnVector vector = (LongColumnVector) v;
+      for(int r=0; r < valueCount; ++r) {
+        if (nullProbability != 0 && random.nextDouble() >= nullProbability) {
+          v.noNulls = false;
+          v.isNull[r] = true;
+        } else {
+          vector.vector[r] = value;
+          value += increment;
+        }
+      }
+      normalize(vector, valueCount);
+    }
+  }
+
+  private class RandomInteger extends IntegerGenerator {
+
+    private RandomInteger(TypeDescription.Category kind) {
+      super(kind);
+    }
+
+    public void generate(ColumnVector v, int valueCount) {
+      LongColumnVector vector = (LongColumnVector) v;
+      for(int r=0; r < valueCount; ++r) {
+        if (nullProbability != 0 && random.nextDouble() < nullProbability) {
+          v.noNulls = false;
+          v.isNull[r] = true;
+        } else {
+          vector.vector[r] = random.nextLong();
+        }
+      }
+      normalize(vector, valueCount);
+    }
+  }
+
+  private class IntegerRange extends IntegerGenerator {
+    private final long minimum;
+    private final long range;
+    private final long limit;
+
+    private IntegerRange(TypeDescription.Category kind, long minimum,
+                         long maximum) {
+      super(kind);
+      this.minimum = minimum;
+      this.range = maximum - minimum + 1;
+      if (this.range < 0) {
+        throw new IllegalArgumentException("Can't support a negative range "
+            + range);
+      }
+      limit = (Long.MAX_VALUE / range) * range;
+    }
+
+    public void generate(ColumnVector v, int valueCount) {
+      LongColumnVector vector = (LongColumnVector) v;
+      for(int r=0; r < valueCount; ++r) {
+        if (nullProbability != 0 && random.nextDouble() < nullProbability) {
+          v.noNulls = false;
+          v.isNull[r] = true;
+        } else {
+          long rand;
+          do {
+            // clear the sign bit
+            rand = random.nextLong() & Long.MAX_VALUE;
+          } while (rand >= limit);
+          vector.vector[r] = (rand % range) + minimum;
+        }
+      }
+      normalize(vector, valueCount);
+    }
+  }
+
+  private class StringChooser extends ValueGenerator {
+    private final byte[][] choices;
+    private StringChooser(String[] values) {
+      choices = new byte[values.length][];
+      for(int e=0; e < values.length; ++e) {
+        choices[e] = values[e].getBytes(StandardCharsets.UTF_8);
+      }
+    }
+
+    public void generate(ColumnVector v, int valueCount) {
+      BytesColumnVector vector = (BytesColumnVector) v;
+      for(int r=0; r < valueCount; ++r) {
+        if (nullProbability != 0 && random.nextDouble() < nullProbability) {
+          v.noNulls = false;
+          v.isNull[r] = true;
+        } else {
+          int val = random.nextInt(choices.length);
+          vector.setRef(r, choices[val], 0, choices[val].length);
+        }
+      }
+    }
+  }
+
+  private static byte[] concat(byte[] left, byte[] right) {
+    byte[] result = new byte[left.length + right.length];
+    System.arraycopy(left, 0, result, 0, left.length);
+    System.arraycopy(right, 0, result, left.length, right.length);
+    return result;
+  }
+
+  private static byte pickOne(byte[] choices, Random random) {
+    return choices[random.nextInt(choices.length)];
+  }
+
+  private static final byte[] LOWER_CONSONANTS =
+      "bcdfghjklmnpqrstvwxyz".getBytes(StandardCharsets.UTF_8);
+  private static final byte[] UPPER_CONSONANTS =
+      "BCDFGHJKLMNPQRSTVWXYZ".getBytes(StandardCharsets.UTF_8);
+  private static final byte[] CONSONANTS =
+      concat(LOWER_CONSONANTS, UPPER_CONSONANTS);
+  private static final byte[] LOWER_VOWELS = 
"aeiou".getBytes(StandardCharsets.UTF_8);
+  private static final byte[] UPPER_VOWELS = 
"AEIOU".getBytes(StandardCharsets.UTF_8);
+  private static final byte[] VOWELS = concat(LOWER_VOWELS, UPPER_VOWELS);
+  private static final byte[] LOWER_LETTERS =
+      concat(LOWER_CONSONANTS, LOWER_VOWELS);
+  private static final byte[] UPPER_LETTERS =
+      concat(UPPER_CONSONANTS, UPPER_VOWELS);
+  private static final byte[] LETTERS = concat(LOWER_LETTERS, UPPER_LETTERS);
+  private static final byte[] NATURAL_DIGITS = 
"123456789".getBytes(StandardCharsets.UTF_8);
+  private static final byte[] DIGITS = 
"0123456789".getBytes(StandardCharsets.UTF_8);
+
+  private class StringPattern extends ValueGenerator {
+    private final byte[] buffer;
+    private final byte[][] choices;
+    private final int[] locations;
+
+    private StringPattern(String pattern) {
+      buffer = pattern.getBytes(StandardCharsets.UTF_8);
+      int locs = 0;
+      for(int i=0; i < buffer.length; ++i) {
+        switch (buffer[i]) {
+          case 'C':
+          case 'c':
+          case 'E':
+          case 'V':
+          case 'v':
+          case 'F':
+          case 'l':
+          case 'L':
+          case 'D':
+          case 'x':
+          case 'X':
+            locs += 1;
+            break;
+          default:
+            break;
+        }
+      }
+      locations = new int[locs];
+      choices = new byte[locs][];
+      locs = 0;
+      for(int i=0; i < buffer.length; ++i) {
+        switch (buffer[i]) {
+          case 'C':
+            locations[locs] = i;
+            choices[locs++] = UPPER_CONSONANTS;
+            break;
+          case 'c':
+            locations[locs] = i;
+            choices[locs++] = LOWER_CONSONANTS;
+            break;
+          case 'E':
+            locations[locs] = i;
+            choices[locs++] = CONSONANTS;
+            break;
+          case 'V':
+            locations[locs] = i;
+            choices[locs++] = UPPER_VOWELS;
+            break;
+          case 'v':
+            locations[locs] = i;
+            choices[locs++] = LOWER_VOWELS;
+            break;
+          case 'F':
+            locations[locs] = i;
+            choices[locs++] = VOWELS;
+            break;
+          case 'l':
+            locations[locs] = i;
+            choices[locs++] = LOWER_LETTERS;
+            break;
+          case 'L':
+            locations[locs] = i;
+            choices[locs++] = UPPER_LETTERS;
+            break;
+          case 'D':
+            locations[locs] = i;
+            choices[locs++] = LETTERS;
+            break;
+          case 'x':
+            locations[locs] = i;
+            choices[locs++] = NATURAL_DIGITS;
+            break;
+          case 'X':
+            locations[locs] = i;
+            choices[locs++] = DIGITS;
+            break;
+          default:
+            break;
+        }
+      }
+    }
+
+    public void generate(ColumnVector v, int valueCount) {
+      BytesColumnVector vector = (BytesColumnVector) v;
+      for(int r=0; r < valueCount; ++r) {
+        if (nullProbability != 0 && random.nextDouble() < nullProbability) {
+          v.noNulls = false;
+          v.isNull[r] = true;
+        } else {
+          for(int m=0; m < locations.length; ++m) {
+            buffer[locations[m]] = pickOne(choices[m], random);
+          }
+          vector.setVal(r, buffer, 0, buffer.length);
+        }
+      }
+    }
+  }
+
+  private class TimestampRange extends ValueGenerator {
+    private final long minimum;
+    private final long range;
+    private final long limit;
+
+    private TimestampRange(String min, String max) {
+      minimum = Timestamp.valueOf(min).getTime();
+      range = Timestamp.valueOf(max).getTime() - minimum + 1;
+      if (range < 0) {
+        throw new IllegalArgumentException("Negative range " + range);
+      }
+      limit = (Long.MAX_VALUE / range) * range;
+    }
+
+    public void generate(ColumnVector v, int valueCount) {
+      TimestampColumnVector vector = (TimestampColumnVector) v;
+      for(int r=0; r < valueCount; ++r) {
+        if (nullProbability != 0 && random.nextDouble() < nullProbability) {
+          v.noNulls = false;
+          v.isNull[r] = true;
+        } else {
+          long rand;
+          do {
+            // clear the sign bit
+            rand = random.nextLong() & Long.MAX_VALUE;
+          } while (rand >= limit);
+          vector.time[r] = (rand % range) + minimum;
+          vector.nanos[r] = random.nextInt(1000000);
+        }
+      }
+    }
+  }
+
+  private static int getIntegerLength(TypeDescription.Category kind) {
+    switch (kind) {
+      case BYTE:
+        return 8;
+      case SHORT:
+        return 16;
+      case INT:
+        return 32;
+      case LONG:
+        return 64;
+      default:
+        throw new IllegalArgumentException("Unhandled type " + kind);
+    }
+  }
+
+  public class Field {
+    private final TypeDescription type;
+    private Field[] children;
+    private ValueGenerator generator;
+
+    private Field(TypeDescription type) {
+      this.type = type;
+      if (!type.getCategory().isPrimitive()) {
+        List<TypeDescription> childrenTypes = type.getChildren();
+        children = new Field[childrenTypes.size()];
+        for(int c=0; c < children.length; ++c) {
+          children[c] = new Field(childrenTypes.get(c));
+        }
+      }
+    }
+
+    public Field addAutoIncrement(long start, long increment) {
+      generator = new AutoIncrement(type.getCategory(), start, increment);
+      return this;
+    }
+
+    public Field addIntegerRange(long min, long max) {
+      generator = new IntegerRange(type.getCategory(), min, max);
+      return this;
+    }
+
+    public Field addRandomInt() {
+      generator = new RandomInteger(type.getCategory());
+      return this;
+    }
+
+    public Field addStringChoice(String... choices) {
+      if (type.getCategory() != TypeDescription.Category.STRING) {
+        throw new IllegalArgumentException("Must be string - " + type);
+      }
+      generator = new StringChooser(choices);
+      return this;
+    }
+
+    public Field addStringPattern(String pattern) {
+      if (type.getCategory() != TypeDescription.Category.STRING) {
+        throw new IllegalArgumentException("Must be string - " + type);
+      }
+      generator = new StringPattern(pattern);
+      return this;
+    }
+
+    public Field addTimestampRange(String start, String end) {
+      if (type.getCategory() != TypeDescription.Category.TIMESTAMP) {
+        throw new IllegalArgumentException("Must be timestamp - " + type);
+      }
+      generator = new TimestampRange(start, end);
+      return this;
+    }
+
+    public Field addBoolean() {
+      if (type.getCategory() != TypeDescription.Category.BOOLEAN) {
+        throw new IllegalArgumentException("Must be boolean - " + type);
+      }
+      generator = new RandomBoolean();
+      return this;
+    }
+
+    public Field hasNulls(double probability) {
+      generator.nullProbability = probability;
+      return this;
+    }
+
+    public Field addStruct() {
+      generator = new RandomStruct(children);
+      return this;
+    }
+
+    public Field addList(int minSize, int maxSize) {
+      generator = new RandomList(minSize, maxSize, children[0]);
+      return this;
+    }
+
+    public Field getChildField(int child) {
+      return children[child];
+    }
+  }
+
+  public Field addField(String name, TypeDescription.Category kind) {
+    TypeDescription type = new TypeDescription(kind);
+    return addField(name, type);
+  }
+
+  public Field addField(String name, TypeDescription type) {
+    schema.addField(name, type);
+    Field result = new Field(type);
+    fields.add(result);
+    return result;
+  }
+
+  public void generate(VectorizedRowBatch batch, int rowCount) {
+    batch.reset();
+    for(int c=0; c < batch.cols.length; ++c) {
+      fields.get(c).generator.generate(batch.cols[c], rowCount);
+    }
+    batch.size = rowCount;
+  }
+
+  /**
+   * Get the schema for the table that is being generated.
+   * @return
+   */
+  public TypeDescription getSchema() {
+    return schema;
+  }
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/a6211816/java/bench/core/src/java/org/apache/orc/bench/core/ReadCounters.java
----------------------------------------------------------------------
diff --git 
a/java/bench/core/src/java/org/apache/orc/bench/core/ReadCounters.java 
b/java/bench/core/src/java/org/apache/orc/bench/core/ReadCounters.java
new file mode 100644
index 0000000..6c07458
--- /dev/null
+++ b/java/bench/core/src/java/org/apache/orc/bench/core/ReadCounters.java
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.bench.core;
+
+import org.openjdk.jmh.annotations.AuxCounters;
+import org.openjdk.jmh.annotations.Level;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.TearDown;
+
+/**
+ * A class to track the number of rows, bytes, and read operations that have
+ * been read.
+ */
+@AuxCounters(AuxCounters.Type.EVENTS)
+@State(Scope.Thread)
+public class ReadCounters {
+  long bytesRead;
+  long reads;
+  RecordCounters recordCounters;
+
+  @Setup(Level.Iteration)
+  public void setup(RecordCounters records) {
+    bytesRead = 0;
+    reads = 0;
+    recordCounters = records;
+  }
+
+  @TearDown(Level.Iteration)
+  public void print() {
+    if (recordCounters != null) {
+      recordCounters.print();
+    }
+    System.out.println("Reads: " + reads);
+    System.out.println("Bytes: " + bytesRead);
+  }
+
+  public double bytesPerRecord() {
+    return recordCounters == null || recordCounters.records == 0 ?
+        0 : ((double) bytesRead) / recordCounters.records;
+  }
+
+  public long records() {
+    return recordCounters == null || recordCounters.invocations == 0 ?
+        0 : recordCounters.records / recordCounters.invocations;
+  }
+
+  public long reads() {
+    return recordCounters == null || recordCounters.invocations == 0 ?
+        0 : reads / recordCounters.invocations;
+  }
+
+  public void addRecords(long value) {
+    if (recordCounters != null) {
+      recordCounters.records += value;
+    }
+  }
+
+  public void addInvocation() {
+    if (recordCounters != null) {
+      recordCounters.invocations += 1;
+    }
+  }
+
+  public void addBytes(long newReads, long newBytes) {
+    bytesRead += newBytes;
+    reads += newReads;
+  }
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/a6211816/java/bench/core/src/java/org/apache/orc/bench/core/RecordCounters.java
----------------------------------------------------------------------
diff --git 
a/java/bench/core/src/java/org/apache/orc/bench/core/RecordCounters.java 
b/java/bench/core/src/java/org/apache/orc/bench/core/RecordCounters.java
new file mode 100644
index 0000000..7cc079b
--- /dev/null
+++ b/java/bench/core/src/java/org/apache/orc/bench/core/RecordCounters.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.bench.core;
+
+import org.openjdk.jmh.annotations.AuxCounters;
+import org.openjdk.jmh.annotations.Level;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+
+/**
+ * A class to track the number of rows that have been read.
+ */
+@AuxCounters(AuxCounters.Type.OPERATIONS)
+@State(Scope.Thread)
+public class RecordCounters {
+  long records;
+  long invocations;
+
+  @Setup(Level.Iteration)
+  public void setup() {
+    records = 0;
+    invocations = 0;
+  }
+
+  public long perRecord() {
+    return records;
+  }
+
+  public void print() {
+    System.out.println();
+    System.out.println("Records: " + records);
+    System.out.println("Invocations: " + invocations);
+  }
+}
+

http://git-wip-us.apache.org/repos/asf/orc/blob/a6211816/java/bench/core/src/java/org/apache/orc/bench/core/SalesGenerator.java
----------------------------------------------------------------------
diff --git 
a/java/bench/core/src/java/org/apache/orc/bench/core/SalesGenerator.java 
b/java/bench/core/src/java/org/apache/orc/bench/core/SalesGenerator.java
new file mode 100644
index 0000000..9ac1264
--- /dev/null
+++ b/java/bench/core/src/java/org/apache/orc/bench/core/SalesGenerator.java
@@ -0,0 +1,206 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.bench.core;
+
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.orc.TypeDescription;
+import org.apache.orc.bench.core.convert.BatchReader;
+
/**
 * Generates a deterministic random "sales" fact table for benchmarking.
 * The column names, distributions, and null probabilities configure a
 * RandomGenerator; implementing BatchReader lets the generated rows be
 * consumed like any other data source.
 *
 * NOTE(review): the order of the addField calls below defines both the
 * schema column order and the consumption of the random stream — do not
 * reorder them.
 */
public class SalesGenerator implements BatchReader {
  private final RandomGenerator generator;
  // number of rows still to be produced by nextBatch
  private long rowsRemaining;
  // null probability for columns that are almost always null
  private final static double MOSTLY = 0.99999;

  public SalesGenerator(long rows) {
    // 42 is the default seed so that runs are reproducible
    this(rows, 42);
  }

  public SalesGenerator(long rows, int seed) {
    generator = new RandomGenerator(seed);
    // column 1
    generator.addField("sales_id", TypeDescription.Category.LONG)
        .addAutoIncrement(1000000000, 1);
    generator.addField("customer_id", TypeDescription.Category.LONG)
        .addIntegerRange(1000000000, 2000000000);
    generator.addField("col3", TypeDescription.Category.LONG)
        .addIntegerRange(1, 10000).hasNulls(0.9993100389335173);

    // column 4
    generator.addField("item_category", TypeDescription.Category.LONG)
        .addIntegerRange(1, 1000000).hasNulls(0.00014784879996054823);
    generator.addField("item_count", TypeDescription.Category.LONG)
        .addIntegerRange(1, 1000);
    generator.addField("change_ts", TypeDescription.Category.TIMESTAMP)
        .addTimestampRange("2003-01-01 00:00:00", "2017-03-14 23:59:59");

    // column 7
    generator.addField("store_location", TypeDescription.Category.STRING)
        .addStringChoice("Los Angeles", "New York", "Cupertino", "Sunnyvale",
            "Boston", "Chicago", "Seattle", "Jackson",
            "Palo Alto", "San Mateo", "San Jose", "Santa Clara",
            "Irvine", "Torrance", "Gardena", "Hermosa", "Manhattan")
        .hasNulls(0.0004928293332019384);
    generator.addField("associate_id", TypeDescription.Category.STRING)
        .addStringPattern("MR V").hasNulls(0.05026859198659506);
    generator.addField("col9", TypeDescription.Category.LONG)
        .addIntegerRange(1, 1000000000).hasNulls(MOSTLY);

    // column 10
    generator.addField("rebate_id", TypeDescription.Category.STRING)
        .addStringPattern("xxxxxx").hasNulls(MOSTLY);
    generator.addField("create_ts", TypeDescription.Category.TIMESTAMP)
        .addTimestampRange("2003-01-01 00:00:00", "2017-03-14 23:59:59");
    generator.addField("col13", TypeDescription.Category.LONG)
        .addIntegerRange(1, 100000).hasNulls(MOSTLY);

    // column 13
    generator.addField("size", TypeDescription.Category.STRING)
        .addStringChoice("Small", "Medium", "Large", "XL")
        .hasNulls(0.9503720861465674);
    generator.addField("col14", TypeDescription.Category.LONG)
        .addIntegerRange(1, 100000);
    generator.addField("fulfilled", TypeDescription.Category.BOOLEAN)
        .addBoolean();

    // column 16
    generator.addField("global_id", TypeDescription.Category.STRING)
        .addStringPattern("xxxxxxxxxxxxxxxx").hasNulls(0.021388793060962974);
    generator.addField("col17", TypeDescription.Category.STRING)
        .addStringPattern("L-xx").hasNulls(MOSTLY);
    generator.addField("col18", TypeDescription.Category.STRING)
        .addStringPattern("ll").hasNulls(MOSTLY);

    // column 19
    generator.addField("col19", TypeDescription.Category.LONG)
        .addIntegerRange(1, 100000);
    generator.addField("has_rebate", TypeDescription.Category.BOOLEAN)
        .addBoolean();
    // col21 is a mostly-null list of 0 to 3 structs
    RandomGenerator.Field list =
        generator.addField("col21",
        TypeDescription.fromString("array<struct<sub1:bigint,sub2:string," +
            "sub3:string,sub4:bigint,sub5:bigint,sub6:string>>"))
        .addList(0, 3)
        .hasNulls(MOSTLY);
    RandomGenerator.Field struct = list.getChildField(0).addStruct();
    struct.getChildField(0).addIntegerRange(0, 10000000);
    struct.getChildField(1).addStringPattern("VVVVV");
    struct.getChildField(2).addStringPattern("VVVVVVVV");
    struct.getChildField(3).addIntegerRange(0, 10000000);
    struct.getChildField(4).addIntegerRange(0, 10000000);
    struct.getChildField(5).addStringPattern("VVVVVVVV");

    // column 38
    generator.addField("vendor_id", TypeDescription.Category.STRING)
        .addStringPattern("Lxxxxxx").hasNulls(0.1870780148834459);
    generator.addField("country", TypeDescription.Category.STRING)
        .addStringChoice("USA", "Germany", "Ireland", "Canada", "Mexico",
            "Denmark").hasNulls(0.0004928293332019384);

    // column 40
    generator.addField("backend_version", TypeDescription.Category.STRING)
        .addStringPattern("X.xx").hasNulls(0.0005913951998423039);
    generator.addField("col41", TypeDescription.Category.LONG)
        .addIntegerRange(1000000000, 100000000000L);
    generator.addField("col42", TypeDescription.Category.LONG)
        .addIntegerRange(1, 1000000000);

    // column 43
    generator.addField("col43", TypeDescription.Category.LONG)
        .addIntegerRange(1000000000, 10000000000L).hasNulls(0.9763934749396284);
    generator.addField("col44", TypeDescription.Category.LONG)
        .addIntegerRange(1, 100000000);
    generator.addField("col45", TypeDescription.Category.LONG)
        .addIntegerRange(1, 100000000);

    // column 46
    generator.addField("col46", TypeDescription.Category.LONG)
        .addIntegerRange(1, 10000000);
    generator.addField("col47", TypeDescription.Category.LONG)
        .addIntegerRange(1, 1000);
    generator.addField("col48", TypeDescription.Category.LONG)
        .addIntegerRange(1, 1000000).hasNulls(MOSTLY);

    // column 49
    generator.addField("col49", TypeDescription.Category.STRING)
        .addStringPattern("xxxx").hasNulls(0.0004928293332019384);
    generator.addField("col50", TypeDescription.Category.STRING)
        .addStringPattern("ll").hasNulls(0.9496821250800848);
    generator.addField("col51", TypeDescription.Category.LONG)
        .addIntegerRange(1, 1000000).hasNulls(0.9999014341333596);

    // column 52
    generator.addField("col52", TypeDescription.Category.LONG)
        .addIntegerRange(1, 1000000).hasNulls(0.9980779656005125);
    generator.addField("col53", TypeDescription.Category.LONG)
        .addIntegerRange(1, 1000000000);
    generator.addField("col54", TypeDescription.Category.LONG)
        .addIntegerRange(1,  1000000000);

    // column 55
    generator.addField("col55", TypeDescription.Category.STRING)
        .addStringChoice("X");
    generator.addField("col56", TypeDescription.Category.TIMESTAMP)
        .addTimestampRange("2003-01-01 00:00:00", "2017-03-14 23:59:59");
    generator.addField("col57", TypeDescription.Category.TIMESTAMP)
        .addTimestampRange("2003-01-01 00:00:00", "2017-03-14 23:59:59");

    // column 58
    generator.addField("md5", TypeDescription.Category.LONG)
        .addRandomInt();
    generator.addField("col59", TypeDescription.Category.LONG)
        .addIntegerRange(1000000000, 10000000000L);
    generator.addField("col69", TypeDescription.Category.TIMESTAMP)
        .addTimestampRange("2003-01-01 00:00:00", "2017-03-14 23:59:59")
        .hasNulls(MOSTLY);

    // column 61
    generator.addField("col61", TypeDescription.Category.STRING)
        .addStringPattern("X.xx").hasNulls(0.11399142476960233);
    generator.addField("col62", TypeDescription.Category.STRING)
        .addStringPattern("X.xx").hasNulls(0.9986200778670347);
    generator.addField("col63", TypeDescription.Category.TIMESTAMP)
        .addTimestampRange("2003-01-01 00:00:00", "2017-03-14 23:59:59");

    // column 64
    generator.addField("col64", TypeDescription.Category.LONG)
        .addIntegerRange(1, 1000000).hasNulls(MOSTLY);
    rowsRemaining = rows;
  }

  /**
   * Fill the batch with up to batch.getMaxSize() generated rows.
   * @return true if any rows were produced
   */
  public boolean nextBatch(VectorizedRowBatch batch) {
    int rows = (int) Math.min(batch.getMaxSize(), rowsRemaining);
    generator.generate(batch, rows);
    rowsRemaining -= rows;
    return rows != 0;
  }

  @Override
  public void close() {
    // PASS
  }

  /**
   * Get the schema of the generated table.
   */
  public TypeDescription getSchema() {
    return generator.getSchema();
  }

  /** Quick manual check that prints the generated schema. */
  public static void main(String[] args) throws Exception {
    SalesGenerator sales = new SalesGenerator(10, 42);
    System.out.println("Schema " + sales.getSchema());
  }
}

http://git-wip-us.apache.org/repos/asf/orc/blob/a6211816/java/bench/core/src/java/org/apache/orc/bench/core/Utilities.java
----------------------------------------------------------------------
diff --git a/java/bench/core/src/java/org/apache/orc/bench/core/Utilities.java 
b/java/bench/core/src/java/org/apache/orc/bench/core/Utilities.java
new file mode 100644
index 0000000..dad605c
--- /dev/null
+++ b/java/bench/core/src/java/org/apache/orc/bench/core/Utilities.java
@@ -0,0 +1,110 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.bench.core;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.hadoop.fs.Path;
+import org.apache.orc.TypeDescription;
+import org.openjdk.jmh.runner.Runner;
+import org.openjdk.jmh.runner.options.ChainedOptionsBuilder;
+import org.openjdk.jmh.runner.options.Options;
+import org.openjdk.jmh.runner.options.OptionsBuilder;
+import org.openjdk.jmh.runner.options.TimeValue;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+
+public class Utilities {
+
+  public static TypeDescription loadSchema(String name) throws IOException {
+    InputStream in = 
Utilities.class.getClassLoader().getResourceAsStream(name);
+    byte[] buffer= new byte[1 * 1024];
+    int len = in.read(buffer);
+    StringBuilder string = new StringBuilder();
+    while (len > 0) {
+      for(int i=0; i < len; ++i) {
+        // strip out
+        if (buffer[i] != '\n' && buffer[i] != ' ') {
+          string.append((char) buffer[i]);
+        }
+      }
+      len = in.read(buffer);
+    }
+    return TypeDescription.fromString(string.toString());
+  }
+
+  public static org.apache.orc.CompressionKind getCodec(CompressionKind 
compression) {
+    switch (compression) {
+      case NONE:
+        return org.apache.orc.CompressionKind.NONE;
+      case ZLIB:
+        return org.apache.orc.CompressionKind.ZLIB;
+      case SNAPPY:
+        return org.apache.orc.CompressionKind.SNAPPY;
+      default:
+        throw new IllegalArgumentException("Unknown compression " + 
compression);
+    }
+  }
+
+  public static Path getVariant(Path root,
+                                String data,
+                                String format,
+                                String compress) {
+    return new Path(root, "generated/" + data + "/" + format + "." + compress);
+  }
+
+  private static final String ROOT_PROPERTY_NAME = "bench.root.dir";
+
+  /**
+   * Get the benchmark data root in the child jvm.
+   * @return the path to the benchmark data or null if it wasn't found
+   */
+  public static Path getBenchmarkRoot() {
+    String value = System.getProperty(ROOT_PROPERTY_NAME);
+    return value == null ? null : new Path(value);
+  }
+
+  public static Options parseOptions(String[] args,
+                                     Class cls) throws IOException {
+    CommandLine options = BenchmarkOptions.parseCommandLine(args);
+    String dataPath = new File(options.getArgs()[0]).getCanonicalPath();
+    OptionsBuilder builder = new OptionsBuilder();
+    builder.include(cls.getSimpleName());
+    if (options.hasOption(BenchmarkOptions.GC)) {
+      builder.addProfiler("hs_gc");
+    }
+    builder.measurementIterations(Integer.parseInt(options.getOptionValue(
+        BenchmarkOptions.ITERATIONS, "5")));
+    builder.warmupIterations(Integer.parseInt(options.getOptionValue(
+        BenchmarkOptions.WARMUP_ITERATIONS, "2")));
+    builder.forks(Integer.parseInt(options.getOptionValue(
+        BenchmarkOptions.FORK, "1")));
+    TimeValue iterationTime = TimeValue.seconds(Long.parseLong(
+        options.getOptionValue(BenchmarkOptions.TIME, "10")));
+    builder.measurementTime(iterationTime);
+    builder.warmupTime(iterationTime);
+    String minMemory = options.getOptionValue(BenchmarkOptions.MIN_MEMORY, 
"256m");
+    String maxMemory = options.getOptionValue(BenchmarkOptions.MAX_MEMORY, 
"2g");
+    builder.jvmArgs("-server",
+        "-Xms"+ minMemory, "-Xmx" + maxMemory,
+        "-D" + ROOT_PROPERTY_NAME + "=" + dataPath);
+    return builder.build();
+  }
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/a6211816/java/bench/core/src/java/org/apache/orc/bench/core/convert/BatchReader.java
----------------------------------------------------------------------
diff --git 
a/java/bench/core/src/java/org/apache/orc/bench/core/convert/BatchReader.java 
b/java/bench/core/src/java/org/apache/orc/bench/core/convert/BatchReader.java
new file mode 100644
index 0000000..9a127ff
--- /dev/null
+++ 
b/java/bench/core/src/java/org/apache/orc/bench/core/convert/BatchReader.java
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.bench.core.convert;
+
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+
+import java.io.IOException;
+
/**
 * Generic interface for reading data as batches of rows.
 * Implementations either read a particular file format or generate rows
 * directly.
 */
public interface BatchReader extends AutoCloseable {

  /**
   * Fill the given batch with the next set of rows.
   * @param batch the batch to fill in
   * @return true if any rows were read, false at the end of the input
   * @throws IOException if reading the underlying data fails
   */
  boolean nextBatch(VectorizedRowBatch batch) throws IOException;

  /**
   * Release any underlying resources.
   * @throws IOException if closing the underlying data fails
   */
  @Override
  void close() throws IOException;
}

http://git-wip-us.apache.org/repos/asf/orc/blob/a6211816/java/bench/core/src/java/org/apache/orc/bench/core/convert/BatchWriter.java
----------------------------------------------------------------------
diff --git 
a/java/bench/core/src/java/org/apache/orc/bench/core/convert/BatchWriter.java 
b/java/bench/core/src/java/org/apache/orc/bench/core/convert/BatchWriter.java
new file mode 100644
index 0000000..2d75ee1
--- /dev/null
+++ 
b/java/bench/core/src/java/org/apache/orc/bench/core/convert/BatchWriter.java
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.bench.core.convert;
+
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+
+import java.io.Closeable;
+import java.io.IOException;
+
/**
 * Generic interface for writing data as batches of rows.
 * Implementations write a particular file format with a given compression.
 */
public interface BatchWriter extends Closeable {

  /**
   * Write all of the rows in the batch to the output.
   * @param batch the rows to write
   * @throws IOException if writing to the underlying file fails
   */
  void writeBatch(VectorizedRowBatch batch) throws IOException;

  /**
   * Flush and release the underlying file.
   * @throws IOException if closing the underlying file fails
   */
  @Override
  void close() throws IOException;
}

http://git-wip-us.apache.org/repos/asf/orc/blob/a6211816/java/bench/core/src/java/org/apache/orc/bench/core/convert/GenerateVariants.java
----------------------------------------------------------------------
diff --git 
a/java/bench/core/src/java/org/apache/orc/bench/core/convert/GenerateVariants.java
 
b/java/bench/core/src/java/org/apache/orc/bench/core/convert/GenerateVariants.java
new file mode 100644
index 0000000..f4c9bc6
--- /dev/null
+++ 
b/java/bench/core/src/java/org/apache/orc/bench/core/convert/GenerateVariants.java
@@ -0,0 +1,234 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.bench.core.convert;
+
+import com.google.auto.service.AutoService;
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.DefaultParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.ParseException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.LocatedFileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.RemoteIterator;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.orc.TypeDescription;
+import org.apache.orc.bench.core.CompressionKind;
+import org.apache.orc.bench.core.OrcBenchmark;
+import org.apache.orc.bench.core.SalesGenerator;
+import org.apache.orc.bench.core.Utilities;
+import org.apache.orc.bench.core.convert.avro.AvroReader;
+import org.apache.orc.bench.core.convert.avro.AvroWriter;
+import org.apache.orc.bench.core.convert.csv.CsvReader;
+import org.apache.orc.bench.core.convert.json.JsonReader;
+import org.apache.orc.bench.core.convert.json.JsonWriter;
+import org.apache.orc.bench.core.convert.orc.OrcReader;
+import org.apache.orc.bench.core.convert.orc.OrcWriter;
+import org.apache.orc.bench.core.convert.parquet.ParquetReader;
+import org.apache.orc.bench.core.convert.parquet.ParquetWriter;
+
+import java.io.IOException;
+
+/**
+ * A tool to create the different variants that we need to benchmark against.
+ */
+@AutoService(OrcBenchmark.class)
+public class GenerateVariants implements OrcBenchmark {
+
+  public static BatchWriter createFileWriter(Path file,
+                                             String format,
+                                             TypeDescription schema,
+                                             Configuration conf,
+                                             CompressionKind compress
+                                             ) throws IOException {
+    FileSystem fs = file.getFileSystem(conf);
+    fs.delete(file, false);
+    fs.mkdirs(file.getParent());
+    switch (format) {
+      case "json":
+        return new JsonWriter(file, schema, conf, compress);
+      case "orc":
+        return new OrcWriter(file, schema, conf, compress);
+      case "avro":
+        return new AvroWriter(file, schema, conf, compress);
+      case "parquet":
+        return new ParquetWriter(file, schema, conf, compress);
+      default:
+        throw new IllegalArgumentException("Unknown format " + format);
+    }
+  }
+
+  public static BatchReader createFileReader(Path file,
+                                             String format,
+                                             TypeDescription schema,
+                                             Configuration conf,
+                                             CompressionKind compress
+                                             ) throws IOException {
+    switch (format) {
+      case "csv":
+        return new CsvReader(file, schema, conf, compress);
+      case "json":
+        return new JsonReader(file, schema, conf, compress);
+      case "orc":
+        return new OrcReader(file, schema, conf);
+      case "avro":
+        return new AvroReader(file, schema, conf);
+      case "parquet":
+        return new ParquetReader(file, schema, conf);
+      default:
+        throw new IllegalArgumentException("Unknown format " + format);
+    }
+  }
+
+  @Override
+  public String getName() {
+    return "generate";
+  }
+
+  @Override
+  public String getDescription() {
+    return "generate all of the data variants";
+  }
+
+  @Override
+  public void run(String[] args) throws Exception {
+    CommandLine cli = parseCommandLine(args);
+    String[] compressList =
+        cli.getOptionValue("compress", "none,snappy,zlib").split(",");
+    String[] dataList =
+        cli.getOptionValue("data", "taxi,sales,github").split(",");
+    String[] formatList =
+        cli.getOptionValue("format", "avro,json,orc,parquet").split(",");
+    long records = Long.parseLong(cli.getOptionValue("sales", "25000000"));
+    Configuration conf = new Configuration();
+    Path root = new Path(cli.getArgs()[0]);
+    for(String data: dataList) {
+      // Set up the reader
+      TypeDescription schema = Utilities.loadSchema(data + ".schema");
+      BatchReader reader = createReader(root, data, schema, conf, records);
+
+      // Set up the writers for each combination
+      BatchWriter[] writers = new BatchWriter[compressList.length * 
formatList.length];
+      for(int compress=0; compress < compressList.length; ++compress) {
+        CompressionKind compressionKind =
+            CompressionKind.valueOf(compressList[compress].toUpperCase());
+        for(int format=0; format < formatList.length; ++format) {
+          Path outPath = Utilities.getVariant(root, data, formatList[format],
+              compressionKind.getExtension());
+          writers[compress * formatList.length + format] =
+              createFileWriter(outPath, formatList[format], schema, conf,
+                  compressionKind);
+        }
+      }
+
+      // Copy the rows
+      VectorizedRowBatch batch = schema.createRowBatch();
+      while (reader.nextBatch(batch)) {
+        for(BatchWriter writer: writers) {
+          writer.writeBatch(batch);
+        }
+      }
+      reader.close();
+      for(BatchWriter writer: writers) {
+        writer.close();
+      }
+    }
+  }
+
+  public static class RecursiveReader implements BatchReader {
+    private final RemoteIterator<LocatedFileStatus> filenames;
+    private final String format;
+    private final TypeDescription schema;
+    private final Configuration conf;
+    private final CompressionKind compress;
+    private BatchReader current = null;
+
+    public RecursiveReader(Path root,
+                    String format,
+                    TypeDescription schema,
+                    Configuration conf,
+                    CompressionKind compress) throws IOException {
+      FileSystem fs = root.getFileSystem(conf);
+      filenames = fs.listFiles(root, true);
+      this.format = format;
+      this.schema = schema;
+      this.conf = conf;
+      this.compress = compress;
+    }
+
+    @Override
+    public boolean nextBatch(VectorizedRowBatch batch) throws IOException {
+      while (current == null || !current.nextBatch(batch)) {
+        if (filenames.hasNext()) {
+          LocatedFileStatus next = filenames.next();
+          if (next.isFile()) {
+            current = createFileReader(next.getPath(), format, schema, conf,
+                compress);
+          }
+        } else {
+          return false;
+        }
+      }
+      return true;
+    }
+
+    @Override
+    public void close() throws IOException {
+      if (current != null) {
+        current.close();
+      }
+    }
+  }
+
+  public static BatchReader createReader(Path root,
+                                         String dataName,
+                                         TypeDescription schema,
+                                         Configuration conf,
+                                         long salesRecords) throws IOException 
{
+    switch (dataName) {
+      case "taxi":
+        return new RecursiveReader(new Path(root, "sources/" + dataName), 
"csv",
+            schema, conf, CompressionKind.ZLIB);
+      case "sales":
+        return new SalesGenerator(salesRecords);
+      case "github":
+        return new RecursiveReader(new Path(root, "sources/" + dataName), 
"json",
+            schema, conf, CompressionKind.ZLIB);
+      default:
+        throw new IllegalArgumentException("Unknown data name " + dataName);
+    }
+  }
+
+  static CommandLine parseCommandLine(String[] args) throws ParseException {
+    Options options = new Options()
+        .addOption("h", "help", false, "Provide help")
+        .addOption("c", "compress", true, "List of compression")
+        .addOption("d", "data", true, "List of data sets")
+        .addOption("f", "format", true, "List of formats")
+        .addOption("s", "sales", true, "Number of records for sales");
+    CommandLine result = new DefaultParser().parse(options, args);
+    if (result.hasOption("help") || result.getArgs().length == 0) {
+      new HelpFormatter().printHelp("convert <root>", options);
+      System.exit(1);
+    }
+    return result;
+  }
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/a6211816/java/bench/core/src/java/org/apache/orc/bench/core/convert/ScanVariants.java
----------------------------------------------------------------------
diff --git 
a/java/bench/core/src/java/org/apache/orc/bench/core/convert/ScanVariants.java 
b/java/bench/core/src/java/org/apache/orc/bench/core/convert/ScanVariants.java
new file mode 100644
index 0000000..14c570d
--- /dev/null
+++ 
b/java/bench/core/src/java/org/apache/orc/bench/core/convert/ScanVariants.java
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.bench.core.convert;
+
+import com.google.auto.service.AutoService;
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.DefaultParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.ParseException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.orc.TypeDescription;
+import org.apache.orc.bench.core.CompressionKind;
+import org.apache.orc.bench.core.OrcBenchmark;
+import org.apache.orc.bench.core.Utilities;
+
+/**
+ * A tool to create the different variants that we need to benchmark against.
+ */
+@AutoService(OrcBenchmark.class)
+public class ScanVariants implements OrcBenchmark {
+
+
+  static CommandLine parseCommandLine(String[] args) throws ParseException {
+    Options options = new Options()
+        .addOption("h", "help", false, "Provide help")
+        .addOption("c", "compress", true, "List of compression")
+        .addOption("d", "data", true, "List of data sets")
+        .addOption("f", "format", true, "List of formats");
+    CommandLine result = new DefaultParser().parse(options, args);
+    if (result.hasOption("help") || result.getArgs().length == 0) {
+      new HelpFormatter().printHelp("scan <root>", options);
+      System.exit(1);
+    }
+    return result;
+  }
+
+  @Override
+  public String getName() {
+    return "scan";
+  }
+
+  @Override
+  public String getDescription() {
+    return "scan all of the data variants";
+  }
+
+  @Override
+  public void run(String[] args) throws Exception {
+    CommandLine cli = parseCommandLine(args);
+    String[] compressList =
+        cli.getOptionValue("compress", "none,snappy,gz").split(",");
+    String[] dataList =
+        cli.getOptionValue("data", "taxi,sales,github").split(",");
+    String[] formatList =
+        cli.getOptionValue("format", "avro,json,orc,parquet").split(",");
+    Configuration conf = new Configuration();
+    Path root = new Path(cli.getArgs()[0]);
+    for(String data: dataList) {
+      TypeDescription schema = Utilities.loadSchema(data + ".schema");
+      VectorizedRowBatch batch = schema.createRowBatch();
+      for (String compress : compressList) {
+        CompressionKind compressKind = CompressionKind.fromExtension(compress);
+        for (String format : formatList) {
+          Path filename = Utilities.getVariant(root, data, format,
+              compress);
+          BatchReader reader = GenerateVariants.createFileReader(filename,
+              format, schema, conf, compressKind);
+          long rows = 0;
+          long batches = 0;
+          while (reader.nextBatch(batch)) {
+            batches += 1;
+            rows += batch.size;
+          }
+          System.out.println(filename + " rows: " + rows + " batches: "
+              + batches);
+          reader.close();
+        }
+      }
+    }
+  }
+}

Reply via email to