Github user dbtsai commented on a diff in the pull request:

    https://github.com/apache/spark/pull/22418#discussion_r218158845

--- Diff: sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala ---
@@ -50,6 +55,66 @@ abstract class OrcSuite extends OrcTest with BeforeAndAfterAll {
       .createOrReplaceTempView("orc_temp_table")
   }
 
+  protected def testBloomFilterCreation(bloomFilterKind: Kind) {
+    val tableName = "bloomFilter"
+
+    withTempDir { dir =>
+      withTable(tableName) {
+        val sqlStatement = orcImp match {
+          case "native" =>
+            s"""
+               |CREATE TABLE $tableName (a INT, b STRING)
+               |USING ORC
+               |OPTIONS (
+               |  path '${dir.toURI}',
+               |  orc.bloom.filter.columns '*',
+               |  orc.bloom.filter.fpp 0.1
+               |)
+             """.stripMargin
+          case "hive" =>
+            s"""
+               |CREATE TABLE $tableName (a INT, b STRING)
+               |STORED AS ORC
+               |LOCATION '${dir.toURI}'
+               |TBLPROPERTIES (
+               |  orc.bloom.filter.columns='*',
+               |  orc.bloom.filter.fpp=0.1
+               |)
+             """.stripMargin
+          case impl =>
+            throw new UnsupportedOperationException(s"Unknown ORC implementation: $impl")
+        }
+
+        sql(sqlStatement)
+        sql(s"INSERT INTO $tableName VALUES (1, 'str')")
+
+        val partFiles = dir.listFiles()
+          .filter(f => f.isFile && !f.getName.startsWith(".") && !f.getName.startsWith("_"))
+        assert(partFiles.length === 1)
+
+        val orcFilePath = new Path(partFiles.head.getAbsolutePath)
+        val readerOptions = OrcFile.readerOptions(new Configuration())
+        val reader = OrcFile.createReader(orcFilePath, readerOptions)
+        var recordReader: RecordReaderImpl = null
+        try {
+          recordReader = reader.rows.asInstanceOf[RecordReaderImpl]
+
+          // BloomFilter array is created for all types; `struct`, int (`a`), string (`b`)
+          val sargColumns = Array(true, true, true)
+          val orcIndex = recordReader.readRowIndex(0, null, sargColumns)
+
+          // Check the types and counts of bloom filters
+          assert(orcIndex.getBloomFilterKinds.forall(_ === bloomFilterKind))
--- End diff --

It seems the test here creates an ORC file with bloom filters using Spark options, then reads it back through the native ORC reader. Is there a plan to add an optimizer rule in Spark that surfaces this functionality in the physical plan, like predicate pushdown in Parquet?
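For reference, a minimal sketch of what is visible in the physical plan today, assuming an active SparkSession named `spark` and the `bloomFilter` table created by the test above (`spark.sql.orc.filterPushdown` and `explain()` are existing Spark APIs; the sample output line is illustrative only):

    // Enable ORC predicate pushdown (the default varies by Spark version).
    spark.conf.set("spark.sql.orc.filterPushdown", "true")

    // Run a selective query against a bloom-filtered column.
    val df = spark.sql("SELECT * FROM bloomFilter WHERE a = 1")

    // The scan node of the physical plan lists the predicates handed to the
    // ORC reader, e.g. "PushedFilters: [IsNotNull(a), EqualTo(a,1)]". Whether
    // the reader then consults the bloom filter is not shown in the plan,
    // which is what the question above is asking about.
    df.explain()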