o Spark version

GitBox Tue, 26 May 2020 20:30:11 -0700


cloud-fan commented on a change in pull request #28630:
URL: https://github.com/apache/spark/pull/28630#discussion_r429950987




##########
File path: 
sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala
##########
@@ -875,81 +876,149 @@ class ParquetIOSuite extends QueryTest with ParquetTest 
with SharedSparkSession
     }
   }
 
+  // It generates input files for the test below:
+  // "SPARK-31159: compatibility with Spark 2.4 in reading dates/timestamps"
+  ignore("SPARK-31806: generate test files for checking compatibility with 
Spark 2.4") {
+    val resourceDir = "sql/core/src/test/resources/test-data"
+    val version = "2_4_5"
+    val N = 8
+    def save(
+        in: Seq[(String, String)],
+        t: String,
+        dstFile: String,
+        options: Map[String, String] = Map.empty): Unit = {
+      withTempDir { dir =>
+        in.toDF("dict", "plain")
+          .select($"dict".cast(t), $"plain".cast(t))
+          .repartition(1)
+          .write
+          .mode("overwrite")
+          .options(options)
+          .parquet(dir.getCanonicalPath)
+        Files.copy(
+          
dir.listFiles().filter(_.getName.endsWith(".snappy.parquet")).head.toPath,
+          Paths.get(resourceDir, dstFile),
+          StandardCopyOption.REPLACE_EXISTING)
+      }
+    }
+    DateTimeTestUtils.withDefaultTimeZone(DateTimeTestUtils.LA) {
+      withSQLConf(
+        SQLConf.SESSION_LOCAL_TIMEZONE.key -> DateTimeTestUtils.LA.getId,
+        SQLConf.LEGACY_PARQUET_REBASE_MODE_IN_WRITE.key -> "CORRECTED") {

Review comment:
       why do we need this? we only run this test in 2.x to generate files.

##########
File path: 
sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala
##########
@@ -875,81 +876,149 @@ class ParquetIOSuite extends QueryTest with ParquetTest 
with SharedSparkSession
     }
   }
 
+  // It generates input files for the test below:
+  // "SPARK-31159: compatibility with Spark 2.4 in reading dates/timestamps"
+  ignore("SPARK-31806: generate test files for checking compatibility with 
Spark 2.4") {
+    val resourceDir = "sql/core/src/test/resources/test-data"
+    val version = "2_4_5"
+    val N = 8
+    def save(
+        in: Seq[(String, String)],
+        t: String,
+        dstFile: String,
+        options: Map[String, String] = Map.empty): Unit = {
+      withTempDir { dir =>
+        in.toDF("dict", "plain")
+          .select($"dict".cast(t), $"plain".cast(t))
+          .repartition(1)
+          .write
+          .mode("overwrite")
+          .options(options)
+          .parquet(dir.getCanonicalPath)
+        Files.copy(
+          
dir.listFiles().filter(_.getName.endsWith(".snappy.parquet")).head.toPath,
+          Paths.get(resourceDir, dstFile),
+          StandardCopyOption.REPLACE_EXISTING)
+      }
+    }
+    DateTimeTestUtils.withDefaultTimeZone(DateTimeTestUtils.LA) {
+      withSQLConf(
+        SQLConf.SESSION_LOCAL_TIMEZONE.key -> DateTimeTestUtils.LA.getId,
+        SQLConf.LEGACY_PARQUET_REBASE_MODE_IN_WRITE.key -> "CORRECTED") {
+        save(
+          (1 to N).map(i => ("1001-01-01", s"1001-01-0$i")),
+          "date",
+          s"before_1582_date_v$version.snappy.parquet")
+        withSQLConf(SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE.key -> 
"TIMESTAMP_MILLIS") {
+          save(
+            (1 to N).map(i => ("1001-01-01 01:02:03.123", s"1001-01-0$i 
01:02:03.123")),
+            "timestamp",
+            s"before_1582_timestamp_millis_v$version.snappy.parquet")
+        }
+        val usTs = (1 to N).map(i => ("1001-01-01 01:02:03.123456", 
s"1001-01-0$i 01:02:03.123456"))
+        withSQLConf(SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE.key -> 
"TIMESTAMP_MICROS") {
+          save(usTs, "timestamp", 
s"before_1582_timestamp_micros_v$version.snappy.parquet")
+        }
+        withSQLConf(SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE.key -> "INT96") {
+          save(
+            usTs,
+            "timestamp",
+            s"before_1582_timestamp_int96_plain_v$version.snappy.parquet",

Review comment:
       why we have 2 files for plain and dictionary-encoding for int96? other 
types just have one file and 2 columns.
   
   if it's caused by some parquet limitation, let's write a comment to explain 
it. and maybe we only need one column for each file?

##########
File path: 
sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala
##########
@@ -875,81 +876,149 @@ class ParquetIOSuite extends QueryTest with ParquetTest 
with SharedSparkSession
     }
   }
 
+  // It generates input files for the test below:
+  // "SPARK-31159: compatibility with Spark 2.4 in reading dates/timestamps"
+  ignore("SPARK-31806: generate test files for checking compatibility with 
Spark 2.4") {
+    val resourceDir = "sql/core/src/test/resources/test-data"
+    val version = "2_4_5"
+    val N = 8
+    def save(
+        in: Seq[(String, String)],
+        t: String,
+        dstFile: String,
+        options: Map[String, String] = Map.empty): Unit = {
+      withTempDir { dir =>
+        in.toDF("dict", "plain")
+          .select($"dict".cast(t), $"plain".cast(t))
+          .repartition(1)
+          .write
+          .mode("overwrite")
+          .options(options)
+          .parquet(dir.getCanonicalPath)
+        Files.copy(
+          
dir.listFiles().filter(_.getName.endsWith(".snappy.parquet")).head.toPath,
+          Paths.get(resourceDir, dstFile),
+          StandardCopyOption.REPLACE_EXISTING)
+      }
+    }
+    DateTimeTestUtils.withDefaultTimeZone(DateTimeTestUtils.LA) {
+      withSQLConf(
+        SQLConf.SESSION_LOCAL_TIMEZONE.key -> DateTimeTestUtils.LA.getId,
+        SQLConf.LEGACY_PARQUET_REBASE_MODE_IN_WRITE.key -> "CORRECTED") {
+        save(
+          (1 to N).map(i => ("1001-01-01", s"1001-01-0$i")),
+          "date",
+          s"before_1582_date_v$version.snappy.parquet")
+        withSQLConf(SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE.key -> 
"TIMESTAMP_MILLIS") {
+          save(
+            (1 to N).map(i => ("1001-01-01 01:02:03.123", s"1001-01-0$i 
01:02:03.123")),
+            "timestamp",
+            s"before_1582_timestamp_millis_v$version.snappy.parquet")
+        }
+        val usTs = (1 to N).map(i => ("1001-01-01 01:02:03.123456", 
s"1001-01-0$i 01:02:03.123456"))
+        withSQLConf(SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE.key -> 
"TIMESTAMP_MICROS") {
+          save(usTs, "timestamp", 
s"before_1582_timestamp_micros_v$version.snappy.parquet")

Review comment:
       for timestamp it should be "before_1900"?

##########
File path: 
sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala
##########
@@ -875,81 +876,149 @@ class ParquetIOSuite extends QueryTest with ParquetTest 
with SharedSparkSession
     }
   }
 
+  // It generates input files for the test below:
+  // "SPARK-31159: compatibility with Spark 2.4 in reading dates/timestamps"
+  ignore("SPARK-31806: generate test files for checking compatibility with 
Spark 2.4") {
+    val resourceDir = "sql/core/src/test/resources/test-data"
+    val version = "2_4_5"
+    val N = 8
+    def save(
+        in: Seq[(String, String)],
+        t: String,
+        dstFile: String,
+        options: Map[String, String] = Map.empty): Unit = {
+      withTempDir { dir =>
+        in.toDF("dict", "plain")
+          .select($"dict".cast(t), $"plain".cast(t))
+          .repartition(1)
+          .write
+          .mode("overwrite")
+          .options(options)
+          .parquet(dir.getCanonicalPath)
+        Files.copy(
+          
dir.listFiles().filter(_.getName.endsWith(".snappy.parquet")).head.toPath,
+          Paths.get(resourceDir, dstFile),
+          StandardCopyOption.REPLACE_EXISTING)
+      }
+    }
+    DateTimeTestUtils.withDefaultTimeZone(DateTimeTestUtils.LA) {
+      withSQLConf(
+        SQLConf.SESSION_LOCAL_TIMEZONE.key -> DateTimeTestUtils.LA.getId,
+        SQLConf.LEGACY_PARQUET_REBASE_MODE_IN_WRITE.key -> "CORRECTED") {
+        save(
+          (1 to N).map(i => ("1001-01-01", s"1001-01-0$i")),
+          "date",
+          s"before_1582_date_v$version.snappy.parquet")
+        withSQLConf(SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE.key -> 
"TIMESTAMP_MILLIS") {
+          save(
+            (1 to N).map(i => ("1001-01-01 01:02:03.123", s"1001-01-0$i 
01:02:03.123")),
+            "timestamp",
+            s"before_1582_timestamp_millis_v$version.snappy.parquet")
+        }
+        val usTs = (1 to N).map(i => ("1001-01-01 01:02:03.123456", 
s"1001-01-0$i 01:02:03.123456"))
+        withSQLConf(SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE.key -> 
"TIMESTAMP_MICROS") {
+          save(usTs, "timestamp", 
s"before_1582_timestamp_micros_v$version.snappy.parquet")
+        }
+        withSQLConf(SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE.key -> "INT96") {
+          save(
+            usTs,
+            "timestamp",
+            s"before_1582_timestamp_int96_plain_v$version.snappy.parquet",

Review comment:
       why we have 2 files for plain and dictionary-encoding for int96? other 
types just have one file and 2 columns.
   
   if it's caused by some parquet limitation, let's write a comment to explain 
it.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

[GitHub] [spark] cloud-fan commented on a change in pull request #28630: [SPARK-31806][SQL][TESTS] Check reading date/timestamp from legacy parquet: dictionary encoding, w/o Spark version

Reply via email to