Github user dbtsai commented on a diff in the pull request:

    https://github.com/apache/spark/pull/22418#discussion_r218158845

--- Diff: sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala ---
@@ -50,6 +55,66 @@ abstract class OrcSuite extends OrcTest with BeforeAndAfterAll {
       .createOrReplaceTempView("orc_temp_table")
   }
 
+  protected def testBloomFilterCreation(bloomFilterKind: Kind) {
+    val tableName = "bloomFilter"
+
+    withTempDir { dir =>
+      withTable(tableName) {
+        val sqlStatement = orcImp match {
+          case "native" =>
+            s"""
+               |CREATE TABLE $tableName (a INT, b STRING)
+               |USING ORC
+               |OPTIONS (
+               |  path '${dir.toURI}',
+               |  orc.bloom.filter.columns '*',
+               |  orc.bloom.filter.fpp 0.1
+               |)
+             """.stripMargin
+          case "hive" =>
+            s"""
+               |CREATE TABLE $tableName (a INT, b STRING)
+               |STORED AS ORC
+               |LOCATION '${dir.toURI}'
+               |TBLPROPERTIES (
+               |  orc.bloom.filter.columns='*',
+               |  orc.bloom.filter.fpp=0.1
+               |)
+             """.stripMargin
+          case impl =>
+            throw new UnsupportedOperationException(s"Unknown ORC implementation: $impl")
+        }
+
+        sql(sqlStatement)
+        sql(s"INSERT INTO $tableName VALUES (1, 'str')")
+
+        val partFiles = dir.listFiles()
+          .filter(f => f.isFile && !f.getName.startsWith(".") && !f.getName.startsWith("_"))
+        assert(partFiles.length === 1)
+
+        val orcFilePath = new Path(partFiles.head.getAbsolutePath)
+        val readerOptions = OrcFile.readerOptions(new Configuration())
+        val reader = OrcFile.createReader(orcFilePath, readerOptions)
+        var recordReader: RecordReaderImpl = null
+        try {
+          recordReader = reader.rows.asInstanceOf[RecordReaderImpl]
+
+          // BloomFilter array is created for all types; `struct`, int (`a`), string (`b`)
+          val sargColumns = Array(true, true, true)
+          val orcIndex = recordReader.readRowIndex(0, null, sargColumns)
+
+          // Check the types and counts of bloom filters
+          assert(orcIndex.getBloomFilterKinds.forall(_ === bloomFilterKind))
--- End diff --

It seems the test here creates an ORC file with bloom filters using Spark options, then reads it back through the native ORC reader. Is there a plan to add an optimizer rule in Spark that surfaces this functionality in the physical plan, like predicate pushdown in Parquet?
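For reference, a minimal sketch of what is visible in the physical plan today, assuming an active SparkSession named `spark` and the `bloomFilter` table created by the test above (`spark.sql.orc.filterPushdown` and `explain()` are existing Spark APIs; the sample output line is illustrative only):

    // Enable ORC predicate pushdown (the default varies by Spark version).
    spark.conf.set("spark.sql.orc.filterPushdown", "true")

    // Run a selective query against a bloom-filtered column.
    val df = spark.sql("SELECT * FROM bloomFilter WHERE a = 1")

    // The scan node of the physical plan lists the predicates handed to the
    // ORC reader, e.g. "PushedFilters: [IsNotNull(a), EqualTo(a,1)]". Whether
    // the reader then consults the bloom filter is not shown in the plan,
    // which is what the question above is asking about.
    df.explain()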