Github user fjh100456 commented on a diff in the pull request:

    https://github.com/apache/spark/pull/20087#discussion_r162520864

--- Diff: sql/hive/src/test/scala/org/apache/spark/sql/hive/CompressionCodecSuite.scala ---
@@ -0,0 +1,321 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.hive
+
+import java.io.File
+
+import scala.collection.JavaConverters._
+
+import org.apache.hadoop.fs.Path
+import org.apache.orc.OrcConf.COMPRESS
+import org.apache.parquet.hadoop.ParquetOutputFormat
+import org.scalatest.BeforeAndAfterAll
+
+import org.apache.spark.sql.execution.datasources.orc.OrcOptions
+import org.apache.spark.sql.execution.datasources.parquet.{ParquetOptions, ParquetTest}
+import org.apache.spark.sql.hive.orc.OrcFileOperator
+import org.apache.spark.sql.hive.test.TestHiveSingleton
+import org.apache.spark.sql.internal.SQLConf
+
+class CompressionCodecSuite extends TestHiveSingleton with ParquetTest with BeforeAndAfterAll {
+  import spark.implicits._
+
+  override def beforeAll(): Unit = {
+    super.beforeAll()
+    (0 until maxRecordNum).toDF("a").createOrReplaceTempView("table_source")
+  }
+
+  override def afterAll(): Unit = {
+    try {
+      spark.catalog.dropTempView("table_source")
+    } finally {
+      super.afterAll()
+    }
+  }
+
+  private val maxRecordNum = 500
+
+  private def getConvertMetastoreConfName(format: String): String = format.toLowerCase match {
+    case "parquet" => HiveUtils.CONVERT_METASTORE_PARQUET.key
+    case "orc" => HiveUtils.CONVERT_METASTORE_ORC.key
+  }
+
+  private def getSparkCompressionConfName(format: String): String = format.toLowerCase match {
+    case "parquet" => SQLConf.PARQUET_COMPRESSION.key
+    case "orc" => SQLConf.ORC_COMPRESSION.key
+  }
+
+  private def getHiveCompressPropName(format: String): String = format.toLowerCase match {
+    case "parquet" => ParquetOutputFormat.COMPRESSION
+    case "orc" => COMPRESS.getAttribute
+  }
+
+  private def normalizeCodecName(format: String, name: String): String = {
+    format.toLowerCase match {
+      case "parquet" => ParquetOptions.shortParquetCompressionCodecNames(name).name()
+      case "orc" => OrcOptions.shortOrcCompressionCodecNames(name)
+    }
+  }
+
+  private def getTableCompressionCodec(path: String, format: String): Seq[String] = {
+    val hadoopConf = spark.sessionState.newHadoopConf()
+    val codecs = format.toLowerCase match {
+      case "parquet" => for {
+        footer <- readAllFootersWithoutSummaryFiles(new Path(path), hadoopConf)
+        block <- footer.getParquetMetadata.getBlocks.asScala
+        column <- block.getColumns.asScala
+      } yield column.getCodec.name()
+      case "orc" => new File(path).listFiles().filter { file =>
+        file.isFile && !file.getName.endsWith(".crc") && file.getName != "_SUCCESS"
+      }.map { orcFile =>
+        OrcFileOperator.getFileReader(orcFile.toPath.toString).get.getCompression.toString
+      }.toSeq
+    }
+    codecs.distinct
+  }
+
+  private def createTable(
+      rootDir: File,
+      tableName: String,
+      isPartitioned: Boolean,
+      format: String,
+      compressionCodec: Option[String]): Unit = {
+    val tblProperties = compressionCodec match {
+      case Some(prop) => s"TBLPROPERTIES('${getHiveCompressPropName(format)}'='$prop')"
+      case _ => ""
+    }
+    val partitionCreate = if (isPartitioned) "PARTITIONED BY (p string)" else ""
+    sql(
+      s"""
+        |CREATE TABLE $tableName(a int)
+        |$partitionCreate
+        |STORED AS $format
+        |LOCATION '${rootDir.toURI.toString.stripSuffix("/")}/$tableName'
+        |$tblProperties
+      """.stripMargin)
+  }
+
+  private def writeDataToTable(
+      tableName: String,
+      partition: Option[String]): Unit = {
+    val partitionInsert = partition.map(p => s"partition (p='$p')").mkString
+    sql(
+      s"""
+        |INSERT INTO TABLE $tableName
+        |$partitionInsert
+        |SELECT * FROM table_source
+      """.stripMargin)
+  }
+
+  private def getTableSize(path: String): Long = {
+    val dir = new File(path)
+    val files = dir.listFiles().filter(_.getName.startsWith("part-"))
+    files.map(_.length()).sum
+  }
+
+  private def getTablePartitionPath(dir: File, tableName: String, partition: Option[String]) = {
+    val partitionPath = partition.map(p => s"p=$p").mkString
+    s"${dir.getPath.stripSuffix("/")}/$tableName/$partitionPath"
+  }
+
+  private def getUncompressedDataSizeByFormat(
+      format: String, isPartitioned: Boolean): Long = {
+    var totalSize = 0L
+    val tableName = s"tbl_$format"
+    val codecName = normalizeCodecName(format, "uncompressed")
+    withSQLConf(getSparkCompressionConfName(format) -> codecName) {
+      withTempDir { tmpDir =>
+        withTable(tableName) {
+          createTable(tmpDir, tableName, isPartitioned, format, Option(codecName))
+          val partition = if (isPartitioned) Some("test") else None
+          writeDataToTable(tableName, partition)
+          val path = getTablePartitionPath(tmpDir, tableName, partition)
+          totalSize = getTableSize(path)
+        }
+      }
+    }
+    assert(totalSize > 0L)
+    totalSize
+  }
+
+  private def checkCompressionCodecForTable(
+      format: String,
+      isPartitioned: Boolean,
+      compressionCodec: Option[String])
+      (assertion: (String, Long) => Unit): Unit = {
+    val tableName = s"tbl_$format$isPartitioned"
+    withTempDir { tmpDir =>
+      withTable(tableName) {
+        createTable(tmpDir, tableName, isPartitioned, format, compressionCodec)
+        val partition = if (isPartitioned) Some("test") else None
+        writeDataToTable(tableName, partition)
+        val path = getTablePartitionPath(tmpDir, tableName, partition)
+        val relCompressionCodecs = getTableCompressionCodec(path, format)
+        assert(relCompressionCodecs.length == 1)
+        val tableSize = getTableSize(path)
+        assertion(relCompressionCodecs.head, tableSize)
+      }
+    }
+  }
+
+  private def checkTableCompressionCodecForCodecs(
+      format: String,
+      isPartitioned: Boolean,
+      convertMetastore: Boolean,
+      compressionCodecs: List[String],
+      tableCompressionCodecs: List[String])
+      (assertionCompressionCodec: (Option[String], String, String, Long) => Unit): Unit = {
+    withSQLConf(getConvertMetastoreConfName(format) -> convertMetastore.toString) {
+      tableCompressionCodecs.foreach { tableCompression =>
+        compressionCodecs.foreach { sessionCompressionCodec =>
+          withSQLConf(getSparkCompressionConfName(format) -> sessionCompressionCodec) {
+            // 'tableCompression = null' means no table-level compression
+            val compression = Option(tableCompression)
+            checkCompressionCodecForTable(format, isPartitioned, compression) {
+              case (realCompressionCodec, tableSize) => assertionCompressionCodec(compression,
+                sessionCompressionCodec, realCompressionCodec, tableSize)
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // When the amount of data is small, the compressed size may be larger than the
+  // uncompressed one, so we only check the size difference when the compression
+  // codec is not NONE or UNCOMPRESSED.
+  private def checkTableSize(
+      format: String,
+      compressionCodec: String,
+      isPartitioned: Boolean,
+      convertMetastore: Boolean,
+      tableSize: Long): Boolean = {
+    format match {
+      case "parquet" =>
+        val uncompressedSize = if (!convertMetastore || isPartitioned) {
+          getUncompressedDataSizeByFormat(format, isPartitioned = true)
+        } else {
+          getUncompressedDataSizeByFormat(format, isPartitioned = false)
+        }
+
+        if (compressionCodec == "UNCOMPRESSED") {
+          tableSize == uncompressedSize
+        } else {
+          tableSize != uncompressedSize
+        }
+      case "orc" =>
+        val uncompressedSize = if (!convertMetastore || isPartitioned) {
+          getUncompressedDataSizeByFormat(format, isPartitioned = true)
+        } else {
+          getUncompressedDataSizeByFormat(format, isPartitioned = false)
+        }
+
+        if (compressionCodec == "NONE") {
+          tableSize == uncompressedSize
+        } else {
+          tableSize != uncompressedSize
+        }
+      case _ => false
+    }
+  }
+
+  def checkForTableWithCompressProp(format: String, compressCodecs: List[String]): Unit = {
+    Seq(true, false).foreach { isPartitioned =>
+      Seq(true, false).foreach { convertMetastore =>
+        checkTableCompressionCodecForCodecs(
+          format,
+          isPartitioned,
+          convertMetastore,
+          compressionCodecs = compressCodecs,
+          tableCompressionCodecs = compressCodecs) {
+          case (tableCompressionCodec, sessionCompressionCodec, realCompressionCodec, tableSize) =>
+            // For a non-partitioned table with convertMetastore enabled, expect the
+            // session-level codec to take effect; in all other cases expect the
+            // table-level codec to take effect.
+            val expectCompressionCodec =
+              if (convertMetastore && !isPartitioned) sessionCompressionCodec
+              else tableCompressionCodec.get
+
+            assert(expectCompressionCodec == realCompressionCodec)
+            assert(checkTableSize(format, expectCompressionCodec,
+              isPartitioned, convertMetastore, tableSize))
+        }
+      }
+    }
+  }
+
+  def checkForTableWithoutCompressProp(format: String, compressCodecs: List[String]): Unit = {
+    Seq(true, false).foreach { isPartitioned =>
+      Seq(true, false).foreach { convertMetastore =>
+        checkTableCompressionCodecForCodecs(
+          format,
+          isPartitioned,
+          convertMetastore,
+          compressionCodecs = compressCodecs,
+          tableCompressionCodecs = List(null)) {
--- End diff --

If this is changed to `Nil`, the function below would need special handling for that case; passing `null` is how we get a `None` from `Option(tableCompression)`. Can we keep it?

```scala
  private def checkTableCompressionCodecForCodecs(
      format: String,
      isPartitioned: Boolean,
      convertMetastore: Boolean,
      compressionCodecs: List[String],
      tableCompressionCodecs: List[String])
      (assertionCompressionCodec: (Option[String], String, String, Long) => Unit): Unit = {
    withSQLConf(getConvertMetastoreConfName(format) -> convertMetastore.toString) {
      tableCompressionCodecs.foreach { tableCompression =>
        compressionCodecs.foreach { sessionCompressionCodec =>
          withSQLConf(getSparkCompressionConfName(format) -> sessionCompressionCodec) {
            // 'tableCompression = null' means no table-level compression
            val compression = Option(tableCompression)
            checkCompressionCodecForTable(format, isPartitioned, compression) {
              case (realCompressionCodec, tableSize) => assertionCompressionCodec(
                compression, sessionCompressionCodec, realCompressionCodec, tableSize)
            }
          }
        }
      }
    }
  }
```
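
For what it's worth, here is a minimal standalone sketch of the difference (the object name and values are illustrative, not part of the PR):

```scala
object NullVsNilSketch extends App {
  // Iterating List(null) runs the loop body exactly once, and
  // Option(null) collapses to None, so the "no table-level compression"
  // case flows through the same code path as Some(codec).
  val tableCompressionCodecs: List[String] = List(null)
  tableCompressionCodecs.foreach { tableCompression =>
    assert(Option(tableCompression).isEmpty)
  }

  // With Nil the body never runs, so the no-table-codec combination
  // would be silently skipped instead of tested.
  (Nil: List[String]).foreach(_ => sys.error("never reached"))

  println("List(null) yields one None iteration; Nil yields none")
}
```

That is, `List(null)` keeps the no-table-codec combination in the test matrix without any special branch in the loop.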