This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new a30983d [SPARK-27512][SQL] Avoid to replace ',' in CSV's decimal type inference for backward compatibility a30983d is described below commit a30983db575de5c87b3a4698b223229327fd65cf Author: HyukjinKwon <gurwls...@apache.org> AuthorDate: Wed Apr 24 16:22:07 2019 +0900 [SPARK-27512][SQL] Avoid to replace ',' in CSV's decimal type inference for backward compatibility ## What changes were proposed in this pull request? The code below currently infers as decimal but previously it was inferred as string. **In branch-2.4**, type inference path for decimal and parsing data are different. https://github.com/apache/spark/blob/2a8343121e62aabe5c69d1e20fbb2c01e2e520e7/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala#L153 https://github.com/apache/spark/blob/c284c4e1f6f684ca8db1cc446fdcc43b46e3413c/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchema.scala#L125 So the code below: ```scala scala> spark.read.option("delimiter", "|").option("inferSchema", "true").csv(Seq("1,2").toDS).printSchema() ``` produced string as its type. ``` root |-- _c0: string (nullable = true) ``` **In the current master**, it now infers decimal as below: ``` root |-- _c0: decimal(2,0) (nullable = true) ``` It happened after https://github.com/apache/spark/pull/22979 because, now after this PR, we only have one way to parse decimal: https://github.com/apache/spark/blob/7a83d71403edf7d24fa5efc0ef913f3ce76d88b8/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExprUtils.scala#L92 **After the fix:** ``` root |-- _c0: string (nullable = true) ``` This PR proposes to restore the previous behaviour back in `CSVInferSchema`. ## How was this patch tested? Manually tested and unit tests were added. Closes #24437 from HyukjinKwon/SPARK-27512. Authored-by: HyukjinKwon <gurwls...@apache.org> Signed-off-by: HyukjinKwon <gurwls...@apache.org> --- .../scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala | 7 ++++++- .../org/apache/spark/sql/catalyst/csv/CSVInferSchemaSuite.scala | 4 +++- .../org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala | 7 +++++++ 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala index ae9f057..03cc3cb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.catalyst.csv +import java.util.Locale + import scala.util.control.Exception.allCatch import org.apache.spark.rdd.RDD @@ -32,7 +34,10 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable { options.zoneId, options.locale) - private val decimalParser = { + private val decimalParser = if (options.locale == Locale.US) { + // Special handling the default locale for backward compatibility + s: String => new java.math.BigDecimal(s) + } else { ExprUtils.getDecimalParser(options.locale) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchemaSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchemaSuite.scala index c2b525a..24d909e 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchemaSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchemaSuite.scala @@ -185,6 +185,8 @@ class CSVInferSchemaSuite extends SparkFunSuite with SQLHelper { assert(inferSchema.inferField(NullType, input) == expectedType) } - Seq("en-US", "ko-KR", "ru-RU", "de-DE").foreach(checkDecimalInfer(_, DecimalType(7, 0))) + // input like '1,0' is inferred as strings for backward compatibility. + Seq("en-US").foreach(checkDecimalInfer(_, StringType)) + Seq("ko-KR", "ru-RU", "de-DE").foreach(checkDecimalInfer(_, DecimalType(7, 0))) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index 6584a30..90deade 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -2052,4 +2052,11 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te checkAnswer(rows, expectedRows) } + + test("SPARK-27512: Decimal type inference should not handle ',' for backward compatibility") { + assert(spark.read + .option("delimiter", "|") + .option("inferSchema", "true") + .csv(Seq("1,2").toDS).schema.head.dataType === StringType) + } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org