Github user fjh100456 commented on a diff in the pull request:

    https://github.com/apache/spark/pull/20087#discussion_r162520864

--- Diff: sql/hive/src/test/scala/org/apache/spark/sql/hive/CompressionCodecSuite.scala ---
@@ -0,0 +1,321 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.hive
+
+import java.io.File
+
+import scala.collection.JavaConverters._
+
+import org.apache.hadoop.fs.Path
+import org.apache.orc.OrcConf.COMPRESS
+import org.apache.parquet.hadoop.ParquetOutputFormat
+import org.scalatest.BeforeAndAfterAll
+
+import org.apache.spark.sql.execution.datasources.orc.OrcOptions
+import org.apache.spark.sql.execution.datasources.parquet.{ParquetOptions, ParquetTest}
+import org.apache.spark.sql.hive.orc.OrcFileOperator
+import org.apache.spark.sql.hive.test.TestHiveSingleton
+import org.apache.spark.sql.internal.SQLConf
+
+class CompressionCodecSuite extends TestHiveSingleton with ParquetTest with BeforeAndAfterAll {
+  import spark.implicits._
+
+  override def beforeAll(): Unit = {
+    super.beforeAll()
+    (0 until maxRecordNum).toDF("a").createOrReplaceTempView("table_source")
+  }
+
+  override def afterAll(): Unit = {
+    try {
+      spark.catalog.dropTempView("table_source")
+    } finally {
+      super.afterAll()
+    }
+  }
+
+  private val maxRecordNum = 500
+
+  private def getConvertMetastoreConfName(format: String): String = format.toLowerCase match {
+    case "parquet" => HiveUtils.CONVERT_METASTORE_PARQUET.key
+    case "orc" => HiveUtils.CONVERT_METASTORE_ORC.key
+  }
+
+  private def getSparkCompressionConfName(format: String): String = format.toLowerCase match {
+    case "parquet" => SQLConf.PARQUET_COMPRESSION.key
+    case "orc" => SQLConf.ORC_COMPRESSION.key
+  }
+
+  private def getHiveCompressPropName(format: String): String = format.toLowerCase match {
+    case "parquet" => ParquetOutputFormat.COMPRESSION
+    case "orc" => COMPRESS.getAttribute
+  }
+
+  private def normalizeCodecName(format: String, name: String): String = {
+    format.toLowerCase match {
+      case "parquet" => ParquetOptions.shortParquetCompressionCodecNames(name).name()
+      case "orc" => OrcOptions.shortOrcCompressionCodecNames(name)
+    }
+  }
+
+  private def getTableCompressionCodec(path: String, format: String): Seq[String] = {
+    val hadoopConf = spark.sessionState.newHadoopConf()
+    val codecs = format.toLowerCase match {
+      case "parquet" => for {
+        footer <- readAllFootersWithoutSummaryFiles(new Path(path), hadoopConf)
+        block <- footer.getParquetMetadata.getBlocks.asScala
+        column <- block.getColumns.asScala
+      } yield column.getCodec.name()
+      case "orc" => new File(path).listFiles().filter { file =>
+        file.isFile && !file.getName.endsWith(".crc") && file.getName != "_SUCCESS"
+      }.map { orcFile =>
+        OrcFileOperator.getFileReader(orcFile.toPath.toString).get.getCompression.toString
+      }.toSeq
+    }
+    codecs.distinct
+  }
+
+  private def createTable(
+      rootDir: File,
+      tableName: String,
+      isPartitioned: Boolean,
+      format: String,
+      compressionCodec: Option[String]): Unit = {
+    val tblProperties = compressionCodec match {
+      case Some(prop) => s"TBLPROPERTIES('${getHiveCompressPropName(format)}'='$prop')"
+      case _ => ""
+    }
+    val partitionCreate = if (isPartitioned) "PARTITIONED BY (p string)" else ""
+    sql(
+      s"""
+        |CREATE TABLE $tableName(a int)
+        |$partitionCreate
+        |STORED AS $format
+        |LOCATION '${rootDir.toURI.toString.stripSuffix("/")}/$tableName'
+        |$tblProperties
+      """.stripMargin)
+  }
+
+  private def writeDataToTable(
+      tableName: String,
+      partition: Option[String]): Unit = {
+    val partitionInsert = partition.map(p => s"partition (p='$p')").mkString
+    sql(
+      s"""
+        |INSERT INTO TABLE $tableName
+        |$partitionInsert
+        |SELECT * FROM table_source
+      """.stripMargin)
+  }
+
+  private def getTableSize(path: String): Long = {
+    val dir = new File(path)
+    val files = dir.listFiles().filter(_.getName.startsWith("part-"))
+    files.map(_.length()).sum
+  }
+
+  private def getTablePartitionPath(dir: File, tableName: String, partition: Option[String]) = {
+    val partitionPath = partition.map(p => s"p=$p").mkString
+    s"${dir.getPath.stripSuffix("/")}/$tableName/$partitionPath"
+  }
+
+  private def getUncompressedDataSizeByFormat(
+      format: String, isPartitioned: Boolean): Long = {
+    var totalSize = 0L
+    val tableName = s"tbl_$format"
+    val codecName = normalizeCodecName(format, "uncompressed")
+    withSQLConf(getSparkCompressionConfName(format) -> codecName) {
+      withTempDir { tmpDir =>
+        withTable(tableName) {
+          createTable(tmpDir, tableName, isPartitioned, format, Option(codecName))
+          val partition = if (isPartitioned) Some("test") else None
+          writeDataToTable(tableName, partition)
+          val path = getTablePartitionPath(tmpDir, tableName, partition)
+          totalSize = getTableSize(path)
+        }
+      }
+    }
+    assert(totalSize > 0L)
+    totalSize
+  }
+
+  private def checkCompressionCodecForTable(
+      format: String,
+      isPartitioned: Boolean,
+      compressionCodec: Option[String])
+      (assertion: (String, Long) => Unit): Unit = {
+    val tableName = s"tbl_$format$isPartitioned"
+    withTempDir { tmpDir =>
+      withTable(tableName) {
+        createTable(tmpDir, tableName, isPartitioned, format, compressionCodec)
+        val partition = if (isPartitioned) Some("test") else None
+        writeDataToTable(tableName, partition)
+        val path = getTablePartitionPath(tmpDir, tableName, partition)
+        val relCompressionCodecs = getTableCompressionCodec(path, format)
+        assert(relCompressionCodecs.length == 1)
+        val tableSize = getTableSize(path)
+        assertion(relCompressionCodecs.head, tableSize)
+      }
+    }
+  }
+
+  private def checkTableCompressionCodecForCodecs(
+      format: String,
+      isPartitioned: Boolean,
+      convertMetastore: Boolean,
+      compressionCodecs: List[String],
+      tableCompressionCodecs: List[String])
+      (assertionCompressionCodec: (Option[String], String, String, Long) => Unit): Unit = {
+    withSQLConf(getConvertMetastoreConfName(format) -> convertMetastore.toString) {
+      tableCompressionCodecs.foreach { tableCompression =>
+        compressionCodecs.foreach { sessionCompressionCodec =>
+          withSQLConf(getSparkCompressionConfName(format) -> sessionCompressionCodec) {
+            // 'tableCompression = null' means no table-level compression
+            val compression = Option(tableCompression)
+            checkCompressionCodecForTable(format, isPartitioned, compression) {
+              case (realCompressionCodec, tableSize) => assertionCompressionCodec(compression,
+                sessionCompressionCodec, realCompressionCodec, tableSize)
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // When the amount of data is small, the compressed size may be larger than the
+  // uncompressed one, so we only check the size difference when the compression
+  // codec is not NONE or UNCOMPRESSED.
+  private def checkTableSize(
+      format: String,
+      compressionCodec: String,
+      isPartitioned: Boolean,
+      convertMetastore: Boolean,
+      tableSize: Long): Boolean = {
+    format match {
+      case "parquet" =>
+        val uncompressedSize = if (!convertMetastore || isPartitioned) {
+          getUncompressedDataSizeByFormat(format, isPartitioned = true)
+        } else {
+          getUncompressedDataSizeByFormat(format, isPartitioned = false)
+        }
+
+        if (compressionCodec == "UNCOMPRESSED") {
+          tableSize == uncompressedSize
+        } else {
+          tableSize != uncompressedSize
+        }
+      case "orc" =>
+        val uncompressedSize = if (!convertMetastore || isPartitioned) {
+          getUncompressedDataSizeByFormat(format, isPartitioned = true)
+        } else {
+          getUncompressedDataSizeByFormat(format, isPartitioned = false)
+        }
+
+        if (compressionCodec == "NONE") {
+          tableSize == uncompressedSize
+        } else {
+          tableSize != uncompressedSize
+        }
+      case _ => false
+    }
+  }
+
+  def checkForTableWithCompressProp(format: String, compressCodecs: List[String]): Unit = {
+    Seq(true, false).foreach { isPartitioned =>
+      Seq(true, false).foreach { convertMetastore =>
+        checkTableCompressionCodecForCodecs(
+          format,
+          isPartitioned,
+          convertMetastore,
+          compressionCodecs = compressCodecs,
+          tableCompressionCodecs = compressCodecs) {
+          case (tableCompressionCodec, sessionCompressionCodec, realCompressionCodec, tableSize) =>
+            // For a non-partitioned table with convertMetastore enabled, expect the
+            // session-level codec to take effect; in all other cases expect the
+            // table-level codec to take effect.
+            val expectCompressionCodec =
+              if (convertMetastore && !isPartitioned) sessionCompressionCodec
+              else tableCompressionCodec.get
+
+            assert(expectCompressionCodec == realCompressionCodec)
+            assert(checkTableSize(format, expectCompressionCodec,
+              isPartitioned, convertMetastore, tableSize))
+        }
+      }
+    }
+  }
+
+  def checkForTableWithoutCompressProp(format: String, compressCodecs: List[String]): Unit = {
+    Seq(true, false).foreach { isPartitioned =>
+      Seq(true, false).foreach { convertMetastore =>
+        checkTableCompressionCodecForCodecs(
+          format,
+          isPartitioned,
+          convertMetastore,
+          compressionCodecs = compressCodecs,
+          tableCompressionCodecs = List(null)) {
--- End diff --

If this is changed to `Nil`, the function below would need special handling for that case; passing `null` is how we get a `None` from `Option(tableCompression)`. Can we keep it?

```scala
  private def checkTableCompressionCodecForCodecs(
      format: String,
      isPartitioned: Boolean,
      convertMetastore: Boolean,
      compressionCodecs: List[String],
      tableCompressionCodecs: List[String])
      (assertionCompressionCodec: (Option[String], String, String, Long) => Unit): Unit = {
    withSQLConf(getConvertMetastoreConfName(format) -> convertMetastore.toString) {
      tableCompressionCodecs.foreach { tableCompression =>
        compressionCodecs.foreach { sessionCompressionCodec =>
          withSQLConf(getSparkCompressionConfName(format) -> sessionCompressionCodec) {
            // 'tableCompression = null' means no table-level compression
            val compression = Option(tableCompression)
            checkCompressionCodecForTable(format, isPartitioned, compression) {
              case (realCompressionCodec, tableSize) => assertionCompressionCodec(
                compression, sessionCompressionCodec, realCompressionCodec, tableSize)
            }
          }
        }
      }
    }
  }
```
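
For what it's worth, here is a minimal standalone sketch of the difference (the object name and values are illustrative, not part of the PR):

```scala
object NullVsNilSketch extends App {
  // Iterating List(null) runs the loop body exactly once, and
  // Option(null) collapses to None, so the "no table-level compression"
  // case flows through the same code path as Some(codec).
  val tableCompressionCodecs: List[String] = List(null)
  tableCompressionCodecs.foreach { tableCompression =>
    assert(Option(tableCompression).isEmpty)
  }

  // With Nil the body never runs, so the no-table-codec combination
  // would be silently skipped instead of tested.
  (Nil: List[String]).foreach(_ => sys.error("never reached"))

  println("List(null) yields one None iteration; Nil yields none")
}
```

That is, `List(null)` keeps the no-table-codec combination in the test matrix without any special branch in the loop.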