This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch branch-3.4 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.4 by this push: new f985d716e164 [SPARK-45433][SQL][3.4] Fix CSV/JSON schema inference when timestamps do not match specified timestampFormat f985d716e164 is described below commit f985d716e164885575ec7f36a7782694411da024 Author: Jia Fan <fanjiaemi...@qq.com> AuthorDate: Thu Oct 12 17:09:48 2023 +0500 [SPARK-45433][SQL][3.4] Fix CSV/JSON schema inference when timestamps do not match specified timestampFormat ### What changes were proposed in this pull request? This is a backport PR of #43243. It fixes a schema inference bug that occurs when timestamps do not match the specified timestampFormat. Please check #43243 for details. ### Why are the changes needed? To fix the schema inference bug on 3.4. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Added a new test. ### Was this patch authored or co-authored using generative AI tooling? Closes #43343 from Hisoka-X/backport-SPARK-45433-inference-schema. Authored-by: Jia Fan <fanjiaemi...@qq.com> Signed-off-by: Max Gekk <max.g...@gmail.com> --- .../org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala | 8 ++++++-- .../org/apache/spark/sql/catalyst/json/JsonInferSchema.scala | 7 +++++-- .../apache/spark/sql/catalyst/csv/CSVInferSchemaSuite.scala | 10 ++++++++++ .../apache/spark/sql/catalyst/json/JsonInferSchemaSuite.scala | 8 ++++++++ 4 files changed, 29 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala index 51586a0065e9..dd8ac3985f19 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala @@ -28,6 +28,7 @@ import org.apache.spark.sql.catalyst.util.{DateFormatter, TimestampFormatter} import org.apache.spark.sql.catalyst.util.LegacyDateFormats.FAST_DATE_FORMAT import 
org.apache.spark.sql.errors.QueryExecutionErrors import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.internal.SQLConf.LegacyBehaviorPolicy import org.apache.spark.sql.types._ class CSVInferSchema(val options: CSVOptions) extends Serializable { @@ -202,8 +203,11 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable { // We can only parse the value as TimestampNTZType if it does not have zone-offset or // time-zone component and can be parsed with the timestamp formatter. // Otherwise, it is likely to be a timestamp with timezone. - if (timestampNTZFormatter.parseWithoutTimeZoneOptional(field, false).isDefined) { - SQLConf.get.timestampType + val timestampType = SQLConf.get.timestampType + if ((SQLConf.get.legacyTimeParserPolicy == LegacyBehaviorPolicy.LEGACY || + timestampType == TimestampNTZType) && + timestampNTZFormatter.parseWithoutTimeZoneOptional(field, false).isDefined) { + timestampType } else { tryParseTimestamp(field) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonInferSchema.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonInferSchema.scala index 5385afe8c935..7e4767750fd3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonInferSchema.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonInferSchema.scala @@ -33,6 +33,7 @@ import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.catalyst.util.LegacyDateFormats.FAST_DATE_FORMAT import org.apache.spark.sql.errors.QueryExecutionErrors import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.internal.SQLConf.LegacyBehaviorPolicy import org.apache.spark.sql.types._ import org.apache.spark.util.Utils @@ -148,11 +149,13 @@ private[sql] class JsonInferSchema(options: JSONOptions) extends Serializable { val bigDecimal = decimalParser(field) DecimalType(bigDecimal.precision, bigDecimal.scale) } + val timestampType = 
SQLConf.get.timestampType if (options.prefersDecimal && decimalTry.isDefined) { decimalTry.get - } else if (options.inferTimestamp && + } else if (options.inferTimestamp && (SQLConf.get.legacyTimeParserPolicy == + LegacyBehaviorPolicy.LEGACY || timestampType == TimestampNTZType) && timestampNTZFormatter.parseWithoutTimeZoneOptional(field, false).isDefined) { - SQLConf.get.timestampType + timestampType } else if (options.inferTimestamp && timestampFormatter.parseOptional(field).isDefined) { TimestampType diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchemaSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchemaSuite.scala index acedf7998c2d..fb91200557a6 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchemaSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchemaSuite.scala @@ -263,4 +263,14 @@ class CSVInferSchemaSuite extends SparkFunSuite with SQLHelper { inferSchema = new CSVInferSchema(options) assert(inferSchema.inferField(DateType, "2012_12_12") == DateType) } + + test("SPARK-45433: inferring the schema when timestamps do not match specified timestampFormat" + + " with only one row") { + val options = new CSVOptions( + Map("timestampFormat" -> "yyyy-MM-dd'T'HH:mm:ss"), + columnPruning = false, + defaultTimeZoneId = "UTC") + val inferSchema = new CSVInferSchema(options) + assert(inferSchema.inferField(NullType, "2884-06-24T02:45:51.138") == StringType) + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/json/JsonInferSchemaSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/json/JsonInferSchemaSuite.scala index 8290b38e3393..81a4858dce82 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/json/JsonInferSchemaSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/json/JsonInferSchemaSuite.scala @@ -112,4 +112,12 @@ class 
JsonInferSchemaSuite extends SparkFunSuite with SQLHelper { checkType(Map("inferTimestamp" -> "true"), json, TimestampType) checkType(Map("inferTimestamp" -> "false"), json, StringType) } + + test("SPARK-45433: inferring the schema when timestamps do not match specified timestampFormat" + + " with only one row") { + checkType( + Map("timestampFormat" -> "yyyy-MM-dd'T'HH:mm:ss", "inferTimestamp" -> "true"), + """{"a": "2884-06-24T02:45:51.138"}""", + StringType) + } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org