[GitHub] spark pull request #16781: [SPARK-12297][SQL] Hive compatibility for Parquet...

squito Wed, 19 Apr 2017 21:17:43 -0700

Github user squito commented on a diff in the pull request:

    https://github.com/apache/spark/pull/16781#discussion_r112362704
  
    --- Diff: 
sql/hive/src/test/scala/org/apache/spark/sql/hive/ParquetHiveCompatibilitySuite.scala
 ---
    @@ -141,4 +160,326 @@ class ParquetHiveCompatibilitySuite extends 
ParquetCompatibilityTest with TestHi
           Row(Seq(Row(1))),
           "ARRAY<STRUCT<array_element: INT>>")
       }
    +
    +  val testTimezones = Seq(
    +    "UTC" -> "UTC",
    +    "LA" -> "America/Los_Angeles",
    +    "Berlin" -> "Europe/Berlin"
    +  )
    +  // Check creating parquet tables with timestamps, writing data into 
them, and reading it back out
    +  // under a variety of conditions:
    +  // * tables with explicit tz and those without
    +  // * altering table properties directly
    +  // * variety of timezones, local & non-local
    +  val sessionTimezones = testTimezones.map(_._2).map(Some(_)) ++ Seq(None)
    +  sessionTimezones.foreach { sessionTzOpt =>
    +    val sparkSession = spark.newSession()
    +    sessionTzOpt.foreach { tz => 
sparkSession.conf.set(SQLConf.SESSION_LOCAL_TIMEZONE.key, tz) }
    +    testCreateWriteRead(sparkSession, "no_tz", None, sessionTzOpt)
    +    val localTz = TimeZone.getDefault.getID()
    +    testCreateWriteRead(sparkSession, "local", Some(localTz), sessionTzOpt)
    +    // check with a variety of timezones.  The unit tests currently are 
configured to always use
    +    // America/Los_Angeles, but even if they didn't, we'd be sure to cover 
a non-local timezone.
    +    Seq(
    +      "UTC" -> "UTC",
    +      "LA" -> "America/Los_Angeles",
    +      "Berlin" -> "Europe/Berlin"
    +    ).foreach { case (tableName, zone) =>
    +      if (zone != localTz) {
    +        testCreateWriteRead(sparkSession, tableName, Some(zone), 
sessionTzOpt)
    +      }
    +    }
    +  }
    +
    +  private def testCreateWriteRead(
    +      sparkSession: SparkSession,
    +      baseTable: String,
    +      explicitTz: Option[String],
    +      sessionTzOpt: Option[String]): Unit = {
    +    testCreateAlterTablesWithTimezone(sparkSession, baseTable, explicitTz, 
sessionTzOpt)
    +    testWriteTablesWithTimezone(sparkSession, baseTable, explicitTz, 
sessionTzOpt)
    +    testReadTablesWithTimezone(sparkSession, baseTable, explicitTz, 
sessionTzOpt)
    +  }
    +
    +  private def checkHasTz(table: String, tz: Option[String]): Unit = {
    +    val tableMetadata = 
spark.sessionState.catalog.getTableMetadata(TableIdentifier(table))
    +    
assert(tableMetadata.properties.get(ParquetFileFormat.PARQUET_TIMEZONE_TABLE_PROPERTY)
 === tz)
    +  }
    +
    +  private def testCreateAlterTablesWithTimezone(
    +      spark: SparkSession,
    +      baseTable: String,
    +      explicitTz: Option[String],
    +      sessionTzOpt: Option[String]): Unit = {
    +    test(s"SPARK-12297: Create and Alter Parquet tables and timezones; 
explicitTz = $explicitTz; " +
    +      s"sessionTzOpt = $sessionTzOpt") {
    +      val key = ParquetFileFormat.PARQUET_TIMEZONE_TABLE_PROPERTY
    +      withTable(baseTable, s"like_$baseTable", s"select_$baseTable") {
    +        val localTz = TimeZone.getDefault()
    +        val localTzId = localTz.getID()
    +        // If we ever add a property to set the table timezone by default, 
defaultTz would change
    +        val defaultTz = None
    +        // check that created tables have correct TBLPROPERTIES
    +        val tblProperties = explicitTz.map {
    +          tz => raw"""TBLPROPERTIES ($key="$tz")"""
    +        }.getOrElse("")
    +        spark.sql(
    +          raw"""CREATE TABLE $baseTable (
    +                |  x int
    +                | )
    +                | STORED AS PARQUET
    +                | $tblProperties
    +            """.stripMargin)
    +        val expectedTableTz = explicitTz.orElse(defaultTz)
    +        checkHasTz(baseTable, expectedTableTz)
    +        spark.sql(s"CREATE TABLE like_$baseTable LIKE $baseTable")
    +        checkHasTz(s"like_$baseTable", expectedTableTz)
    +        spark.sql(
    +          raw"""CREATE TABLE select_$baseTable
    +                | STORED AS PARQUET
    +                | AS
    +                | SELECT * from $baseTable
    +            """.stripMargin)
    +        checkHasTz(s"select_$baseTable", defaultTz)
    +
    +        // check alter table, setting, unsetting, resetting the property
    +        spark.sql(
    +          raw"""ALTER TABLE $baseTable SET TBLPROPERTIES 
($key="America/Los_Angeles")""")
    +        checkHasTz(baseTable, Some("America/Los_Angeles"))
    +        spark.sql( raw"""ALTER TABLE $baseTable SET TBLPROPERTIES 
($key="UTC")""")
    +        checkHasTz(baseTable, Some("UTC"))
    +        spark.sql( raw"""ALTER TABLE $baseTable UNSET TBLPROPERTIES 
($key)""")
    +        checkHasTz(baseTable, None)
    +        explicitTz.foreach { tz =>
    +          spark.sql( raw"""ALTER TABLE $baseTable SET TBLPROPERTIES 
($key="$tz")""")
    +          checkHasTz(baseTable, expectedTableTz)
    +        }
    +      }
    +    }
    +  }
    +
    +  val desiredTimestampStrings = Seq(
    +    "2015-12-31 22:49:59.123",
    +    "2015-12-31 23:50:59.123",
    +    "2016-01-01 00:39:59.123",
    +    "2016-01-01 01:29:59.123"
    +  )
    +  // We don't want to mess with timezones inside the tests themselves, 
since we use a shared
    +  // spark context, and then we might be prone to issues from lazy vals 
for timezones.  Instead,
    +  // we manually adjust the timezone just to determine what the desired 
millis (since epoch, in utc)
    +  // is for various "wall-clock" times in different timezones, and then we 
can compare against those
    +  // in our tests.
    +  val originalTz = TimeZone.getDefault
    +  val timestampTimezoneToMillis = try {
    +    (for {
    +      timestampString <- desiredTimestampStrings
    +      timezone <- Seq("America/Los_Angeles", "Europe/Berlin", "UTC").map {
    +        TimeZone.getTimeZone(_)
    +      }
    +    } yield {
    +      TimeZone.setDefault(timezone)
    +      val timestamp = Timestamp.valueOf(timestampString)
    +      (timestampString, timezone.getID()) -> timestamp.getTime()
    +    }).toMap
    +  } finally {
    +    TimeZone.setDefault(originalTz)
    +  }
    +
    +  private def createRawData(spark: SparkSession): Dataset[(String, 
Timestamp)] = {
    +    val rowRdd = spark.sparkContext.parallelize(desiredTimestampStrings, 
1).map(Row(_))
    +    val schema = StructType(Seq(
    +      StructField("display", StringType, true)
    +    ))
    +    val df = spark.createDataFrame(rowRdd, schema)
    +    // this will get the millis corresponding to the display time given 
the current *session*
    +    // timezone.
    +    import spark.implicits._
    +    df.withColumn("ts", expr("cast(display as timestamp)")).as[(String, 
Timestamp)]
    +  }
    +
    +  private def testWriteTablesWithTimezone(
    +      spark: SparkSession,
    +      baseTable: String,
    +      explicitTz: Option[String],
    +      sessionTzOpt: Option[String]) : Unit = {
    +    val key = ParquetFileFormat.PARQUET_TIMEZONE_TABLE_PROPERTY
    +    test(s"SPARK-12297: Write to Parquet tables with Timestamps; 
explicitTz = $explicitTz; " +
    +        s"sessionTzOpt = $sessionTzOpt") {
    +
    +      withTable(s"saveAsTable_$baseTable", s"insert_$baseTable") {
    +        val sessionTzId = 
sessionTzOpt.getOrElse(TimeZone.getDefault().getID())
    +        // check that created tables have correct TBLPROPERTIES
    +        val tblProperties = explicitTz.map {
    +          tz => raw"""TBLPROPERTIES ($key="$tz")"""
    +        }.getOrElse("")
    +
    +
    +        val rawData = createRawData(spark)
    +        // Check writing data out.
    +        // We write data into our tables, and then check the raw parquet 
files to see whether
    +        // the correct conversion was applied.
    +        rawData.write.saveAsTable(s"saveAsTable_$baseTable")
    +        checkHasTz(s"saveAsTable_$baseTable", None)
    +        spark.sql(
    +          raw"""CREATE TABLE insert_$baseTable (
    +                |  display string,
    +                |  ts timestamp
    +                | )
    +                | STORED AS PARQUET
    +                | $tblProperties
    +               """.stripMargin)
    +        checkHasTz(s"insert_$baseTable", explicitTz)
    +        rawData.write.insertInto(s"insert_$baseTable")
    +        // no matter what, roundtripping via the table should leave the 
data unchanged
    +        val readFromTable = spark.table(s"insert_$baseTable").collect()
    +          .map { row => (row.getAs[String](0), 
row.getAs[Timestamp](1)).toString() }.sorted
    +        assert(readFromTable === 
rawData.collect().map(_.toString()).sorted)
    +
    +        // Now we load the raw parquet data on disk, and check if it was 
adjusted correctly.
    +        // Note that we only store the timezone in the table property, so 
when we read the
    +        // data this way, we're bypassing all of the conversion logic, and 
reading the raw
    +        // values in the parquet file.
    +        val onDiskLocation = spark.sessionState.catalog
    +          
.getTableMetadata(TableIdentifier(s"insert_$baseTable")).location.getPath
    +        // we test reading the data back with and without the vectorized 
reader, to make sure we
    +        // haven't broken reading parquet from non-hive tables, with both 
readers.
    +        Seq(false, true).foreach { vectorized =>
    +          spark.conf.set(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key, 
vectorized)
    +          val readFromDisk = spark.read.parquet(onDiskLocation).collect()
    +          val storageTzId = explicitTz.getOrElse(sessionTzId)
    +          readFromDisk.foreach { row =>
    +            val displayTime = row.getAs[String](0)
    +            val millis = row.getAs[Timestamp](1).getTime()
    +            val expectedMillis = timestampTimezoneToMillis((displayTime, 
storageTzId))
    +            assert(expectedMillis === millis, s"Display time 
'$displayTime' was stored " +
    +              s"incorrectly with sessionTz = ${sessionTzOpt}; Got $millis, 
expected " +
    +              s"$expectedMillis (delta = ${millis - expectedMillis})")
    +          }
    +        }
    +      }
    +    }
    +  }
    +
    +  private def testReadTablesWithTimezone(
    +      spark: SparkSession,
    +      baseTable: String,
    +      explicitTz: Option[String],
    +      sessionTzOpt: Option[String]): Unit = {
    +      val key = ParquetFileFormat.PARQUET_TIMEZONE_TABLE_PROPERTY
    +    test(s"SPARK-12297: Read from Parquet tables with Timestamps; 
explicitTz = $explicitTz; " +
    +      s"sessionTzOpt = $sessionTzOpt") {
    +      withTable(s"external_$baseTable") {
    +        // we intentionally save this data directly, without creating a 
table, so we can
    +        // see that the data is read back differently depending on table 
properties.
    +        // we'll save with adjusted millis, so that it should be the 
correct millis after reading
    +        // back.
    +        val rawData = createRawData(spark)
    +        // to avoid closing over entire class
    +        val timestampTimezoneToMillis = this.timestampTimezoneToMillis
    +        import spark.implicits._
    +        val adjustedRawData = (explicitTz match {
    +          case Some(tzId) =>
    +            rawData.map { case (displayTime, _) =>
    +              val storageMillis = timestampTimezoneToMillis((displayTime, 
tzId))
    +              (displayTime, new Timestamp(storageMillis))
    +            }
    +          case _ =>
    +            rawData
    +        }).withColumnRenamed("_1", "display").withColumnRenamed("_2", "ts")
    +        withTempPath { path =>
    +          adjustedRawData.write.parquet(path.getCanonicalPath)
    +          val options = Map("path" -> path.getCanonicalPath) ++
    +            explicitTz.map { tz => Map(key -> tz) }.getOrElse(Map())
    +
    +          spark.catalog.createTable(
    +            tableName = s"external_$baseTable",
    +            source = "parquet",
    +            schema = new StructType().add("display", StringType).add("ts", 
TimestampType),
    +            options = options
    +          )
    +          Seq(false, true).foreach { vectorized =>
    +            withClue(s"vectorized = $vectorized;") {
    +              
spark.conf.set(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key, vectorized)
    --- End diff --
    
    I was initially using `SQLTestUtils.withSQLConf`, but I discovered that it 
wasn't actually having any effect. I don't know whether that is because 
`TestHiveSingleton` does something strange, or because I'm doing something else 
weird in this test by creating many new spark sessions.  I switched to calling 
`spark.conf.set` directly because it was the only way I could get the conf 
changes applied consistently.
    
    Since I am creating new sessions, I don't think there is any risk of a 
failed test not cleaning up after itself and triggering failures in other tests 
outside of this suite.  But it still seems like I might be doing something wrong ...



---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

[GitHub] spark pull request #16781: [SPARK-12297][SQL] Hive compatibility for Parquet...

Reply via email to