Repository: spark
Updated Branches:
  refs/heads/master 8f6cf00c6 -> 7742d9f15


[SPARK-15198][SQL] Support for pushing down filters for boolean types in ORC 
data source

## What changes were proposed in this pull request?

It seems ORC supports all the types in 
[`PredicateLeaf.Type`](https://github.com/apache/hive/blob/e085b7e9bd059d91aaf013df0db4d71dca90ec6f/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/PredicateLeaf.java#L50-L56), 
which include boolean types, so this was verified first.

This PR adds the support for pushing filters down for `BooleanType` in ORC data 
source.

This PR also removes the `OrcTableScan` class and its companion object, which 
are no longer used.

## How was this patch tested?

Unit tests in `OrcFilterSuite` and `OrcQuerySuite`.

Author: hyukjinkwon <gurwls...@gmail.com>

Closes #12972 from HyukjinKwon/SPARK-15198.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7742d9f1
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7742d9f1
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7742d9f1

Branch: refs/heads/master
Commit: 7742d9f1584150befeb2f3d76cdbd4ea1f37c914
Parents: 8f6cf00
Author: hyukjinkwon <gurwls...@gmail.com>
Authored: Tue Jul 5 13:59:13 2016 +0800
Committer: Cheng Lian <l...@databricks.com>
Committed: Tue Jul 5 13:59:13 2016 +0800

----------------------------------------------------------------------
 .../spark/sql/hive/orc/OrcFileFormat.scala      | 10 ++++----
 .../apache/spark/sql/hive/orc/OrcFilters.scala  |  2 +-
 .../spark/sql/hive/orc/OrcFilterSuite.scala     | 25 ++++++++++++++++----
 .../spark/sql/hive/orc/OrcQuerySuite.scala      | 13 ++++++++++
 4 files changed, 39 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/7742d9f1/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala
----------------------------------------------------------------------
diff --git 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala
index 5de3507..1d3c466 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala
@@ -111,7 +111,7 @@ private[sql] class OrcFileFormat
     if (sparkSession.sessionState.conf.orcFilterPushDown) {
       // Sets pushed predicates
       OrcFilters.createFilter(requiredSchema, filters.toArray).foreach { f =>
-        hadoopConf.set(OrcTableScan.SARG_PUSHDOWN, f.toKryo)
+        hadoopConf.set(OrcRelation.SARG_PUSHDOWN, f.toKryo)
         hadoopConf.setBoolean(ConfVars.HIVEOPTINDEXFILTER.varname, true)
       }
     }
@@ -258,15 +258,13 @@ private[orc] class OrcOutputWriter(
   }
 }
 
-private[orc] object OrcTableScan {
-  // This constant duplicates `OrcInputFormat.SARG_PUSHDOWN`, which is 
unfortunately not public.
-  private[orc] val SARG_PUSHDOWN = "sarg.pushdown"
-}
-
 private[orc] object OrcRelation extends HiveInspectors {
   // The references of Hive's classes will be minimized.
   val ORC_COMPRESSION = "orc.compress"
 
+  // This constant duplicates `OrcInputFormat.SARG_PUSHDOWN`, which is 
unfortunately not public.
+  private[orc] val SARG_PUSHDOWN = "sarg.pushdown"
+
   // The extensions for ORC compression codecs
   val extensionsForCompressionCodecNames = Map(
     "NONE" -> "",

http://git-wip-us.apache.org/repos/asf/spark/blob/7742d9f1/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala
----------------------------------------------------------------------
diff --git 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala
index c463bc8..6ab8244 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala
@@ -83,7 +83,7 @@ private[orc] object OrcFilters extends Logging {
       // Only the values in the Spark types below can be recognized by
       // the `SearchArgumentImpl.BuilderImpl.boxLiteral()` method.
       case ByteType | ShortType | FloatType | DoubleType => true
-      case IntegerType | LongType | StringType => true
+      case IntegerType | LongType | StringType | BooleanType => true
       case _ => false
     }
 

http://git-wip-us.apache.org/repos/asf/spark/blob/7742d9f1/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcFilterSuite.scala
----------------------------------------------------------------------
diff --git 
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcFilterSuite.scala 
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcFilterSuite.scala
index 8c027f9..7a30e54 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcFilterSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcFilterSuite.scala
@@ -208,6 +208,27 @@ class OrcFilterSuite extends QueryTest with OrcTest {
     }
   }
 
+  test("filter pushdown - boolean") {
+    withOrcDataFrame((true :: false :: Nil).map(b => Tuple1.apply(Option(b)))) 
{ implicit df =>
+      checkFilterPredicate('_1.isNull, PredicateLeaf.Operator.IS_NULL)
+
+      checkFilterPredicate('_1 === true, PredicateLeaf.Operator.EQUALS)
+      checkFilterPredicate('_1 <=> true, 
PredicateLeaf.Operator.NULL_SAFE_EQUALS)
+
+      checkFilterPredicate('_1 < true, PredicateLeaf.Operator.LESS_THAN)
+      checkFilterPredicate('_1 > false, 
PredicateLeaf.Operator.LESS_THAN_EQUALS)
+      checkFilterPredicate('_1 <= false, 
PredicateLeaf.Operator.LESS_THAN_EQUALS)
+      checkFilterPredicate('_1 >= false, PredicateLeaf.Operator.LESS_THAN)
+
+      checkFilterPredicate(Literal(false) === '_1, 
PredicateLeaf.Operator.EQUALS)
+      checkFilterPredicate(Literal(false) <=> '_1, 
PredicateLeaf.Operator.NULL_SAFE_EQUALS)
+      checkFilterPredicate(Literal(false) > '_1, 
PredicateLeaf.Operator.LESS_THAN)
+      checkFilterPredicate(Literal(true) < '_1, 
PredicateLeaf.Operator.LESS_THAN_EQUALS)
+      checkFilterPredicate(Literal(true) >= '_1, 
PredicateLeaf.Operator.LESS_THAN_EQUALS)
+      checkFilterPredicate(Literal(true) <= '_1, 
PredicateLeaf.Operator.LESS_THAN)
+    }
+  }
+
   test("filter pushdown - combinations with logical operators") {
     withOrcDataFrame((1 to 4).map(i => Tuple1(Option(i)))) { implicit df =>
       // Because `ExpressionTree` is not accessible at Hive 1.2.x, this should 
be checked
@@ -264,10 +285,6 @@ class OrcFilterSuite extends QueryTest with OrcTest {
     withOrcDataFrame((1 to 4).map(i => Tuple1(i.b))) { implicit df =>
       checkNoFilterPredicate('_1 <=> 1.b)
     }
-    // BooleanType
-    withOrcDataFrame((true :: false :: Nil).map(b => Tuple1.apply(Option(b)))) 
{ implicit df =>
-      checkNoFilterPredicate('_1 === true)
-    }
     // TimestampType
     val stringTimestamp = "2015-08-20 15:57:00"
     withOrcDataFrame(Seq(Tuple1(Timestamp.valueOf(stringTimestamp)))) { 
implicit df =>

http://git-wip-us.apache.org/repos/asf/spark/blob/7742d9f1/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala
----------------------------------------------------------------------
diff --git 
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala 
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala
index 4a86987..af8115c 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala
@@ -464,6 +464,19 @@ class OrcQuerySuite extends QueryTest with 
BeforeAndAfterAll with OrcTest {
     }
   }
 
+  test("SPARK-15198 Support for pushing down filters for boolean types") {
+    withSQLConf(SQLConf.ORC_FILTER_PUSHDOWN_ENABLED.key -> "true") {
+      val data = (0 until 10).map(_ => (true, false))
+      withOrcFile(data) { file =>
+        val df = spark.read.orc(file).where("_2 == true")
+        val actual = stripSparkFilter(df).count()
+
+        // ORC filter should be applied and the total count should be 0.
+        assert(actual === 0)
+      }
+    }
+  }
+
   test("column nullability and comment - write and then read") {
     val schema = (new StructType)
       .add("cl1", IntegerType, nullable = false, comment = "test")


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to