cloud-fan commented on a change in pull request #31319: URL: https://github.com/apache/spark/pull/31319#discussion_r565018052
########## File path: sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala ########## @@ -3868,6 +3869,57 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark assert(unions.size == 1) } + + test("SPARK-34212 Parquet should read decimals correctly") { + // a is int-decimal (4 bytes), b is long-decimal (8 bytes), c is binary-decimal (16 bytes) + val df = sql("SELECT 1.0 a, CAST(1.23 AS DECIMAL(17, 2)) b, CAST(1.23 AS DECIMAL(36, 2)) c") + + withTempPath { path => + df.write.parquet(path.toString) + + withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false") { + val schema1 = "a DECIMAL(3, 2), b DECIMAL(18, 3), c DECIMAL(37, 3)" + checkAnswer(spark.read.schema(schema1).parquet(path.toString), df) + val schema2 = "a DECIMAL(3, 0), b DECIMAL(18, 1), c DECIMAL(37, 1)" + checkAnswer(spark.read.schema(schema2).parquet(path.toString), Row(1, 1.2, 1.2)) + } + + withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "true") { + val e1 = intercept[SparkException] { + spark.read.schema("a DECIMAL(3, 2)").parquet(path.toString).collect() + }.getCause.getCause + assert(e1.isInstanceOf[SchemaColumnConvertNotSupportedException]) + + val e2 = intercept[SparkException] { + spark.read.schema("b DECIMAL(18, 1)").parquet(path.toString).collect() + }.getCause.getCause + assert(e2.isInstanceOf[SchemaColumnConvertNotSupportedException]) + + val e3 = intercept[SparkException] { + spark.read.schema("c DECIMAL(37, 1)").parquet(path.toString).collect() + }.getCause.getCause + assert(e3.isInstanceOf[SchemaColumnConvertNotSupportedException]) + } + } + + withTempPath { path => + val df2 = sql(s"SELECT 1 a, ${Int.MaxValue + 1L} b, CAST(2 AS BINARY) c") Review comment: I'm not very sure about the last one. When the data is binary in Parquet files, we don't know what the binary means, and it is hard to read it out as a decimal. For example, what happens if the data is `CAST(1.2 AS BINARY)`? 
########## File path: sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala ########## @@ -3868,6 +3869,57 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark assert(unions.size == 1) } + + test("SPARK-34212 Parquet should read decimals correctly") { + // a is int-decimal (4 bytes), b is long-decimal (8 bytes), c is binary-decimal (16 bytes) + val df = sql("SELECT 1.0 a, CAST(1.23 AS DECIMAL(17, 2)) b, CAST(1.23 AS DECIMAL(36, 2)) c") + + withTempPath { path => + df.write.parquet(path.toString) + + withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false") { + val schema1 = "a DECIMAL(3, 2), b DECIMAL(18, 3), c DECIMAL(37, 3)" + checkAnswer(spark.read.schema(schema1).parquet(path.toString), df) + val schema2 = "a DECIMAL(3, 0), b DECIMAL(18, 1), c DECIMAL(37, 1)" + checkAnswer(spark.read.schema(schema2).parquet(path.toString), Row(1, 1.2, 1.2)) + } + + withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "true") { + val e1 = intercept[SparkException] { + spark.read.schema("a DECIMAL(3, 2)").parquet(path.toString).collect() + }.getCause.getCause + assert(e1.isInstanceOf[SchemaColumnConvertNotSupportedException]) + + val e2 = intercept[SparkException] { + spark.read.schema("b DECIMAL(18, 1)").parquet(path.toString).collect() + }.getCause.getCause + assert(e2.isInstanceOf[SchemaColumnConvertNotSupportedException]) + + val e3 = intercept[SparkException] { + spark.read.schema("c DECIMAL(37, 1)").parquet(path.toString).collect() + }.getCause.getCause + assert(e3.isInstanceOf[SchemaColumnConvertNotSupportedException]) + } + } + + withTempPath { path => + val df2 = sql(s"SELECT 1 a, ${Int.MaxValue + 1L} b, CAST(2 AS BINARY) c") Review comment: I'm not very sure about the last one. When the data is binary in Parquet files, we don't know what the binary means, and it is hard to read it out as a decimal. For example, what happens if the data is `CAST('abc' AS BINARY)`? 
---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org