Github user wangyum commented on a diff in the pull request: https://github.com/apache/spark/pull/21603#discussion_r197338867 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala --- @@ -270,6 +270,11 @@ private[parquet] class ParquetFilters(pushDownDate: Boolean) { case sources.Not(pred) => createFilter(schema, pred).map(FilterApi.not) + case sources.In(name, values) if canMakeFilterOn(name) && values.length < 20 => --- End diff -- It seems that the push-down performance is better when the threshold is less than `300`: <img width="553" alt="spark-17091-perf" src="https://user-images.githubusercontent.com/5399861/41757743-7e411532-7616-11e8-8844-45132c50c535.png"> The code: ```scala withSQLConf(SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED.key -> "true") { import testImplicits._ withTempPath { path => val total = 10000000 (0 to total).toDF().coalesce(1) .write.option("parquet.block.size", 512) .parquet(path.getAbsolutePath) val df = spark.read.parquet(path.getAbsolutePath) // scalastyle:off println var lastSize = -1 var i = 16000 while (i < total) { val filter = Range(0, total).filter(_ % i == 0) i += 100 if (lastSize != filter.size) { if (lastSize == -1) println(s"start size: ${filter.size}") lastSize = filter.size sql("set spark.sql.parquet.pushdown.inFilterThreshold=1000000") val begin1 = System.currentTimeMillis() df.where(s"id in(${filter.mkString(",")})").count() val end1 = System.currentTimeMillis() val time1 = end1 - begin1 sql("set spark.sql.parquet.pushdown.inFilterThreshold=10") val begin2 = System.currentTimeMillis() df.where(s"id in(${filter.mkString(",")})").count() val end2 = System.currentTimeMillis() val time2 = end2 - begin2 if (time1 <= time2) println(s"Max threshold: $lastSize") } } } } ```
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org