bersprockets commented on code in PR #36871: URL: https://github.com/apache/spark/pull/36871#discussion_r906513077
########## sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala: ########## @@ -197,34 +198,46 @@ class UnivocityParser( Decimal(decimalParser(datum), dt.precision, dt.scale) } - case _: TimestampType => (d: String) => + case _: DateType => (d: String) => nullSafeDatum(d, name, nullable, options) { datum => try { - timestampFormatter.parse(datum) + dateFormatter.parse(datum) } catch { case NonFatal(e) => // If fails to parse, then tries the way used in 2.0 and 1.x for backwards // compatibility. val str = DateTimeUtils.cleanLegacyTimestampStr(UTF8String.fromString(datum)) - DateTimeUtils.stringToTimestamp(str, options.zoneId).getOrElse(throw e) + DateTimeUtils.stringToDate(str).getOrElse(throw e) } } - case _: TimestampNTZType => (d: String) => - nullSafeDatum(d, name, nullable, options) { datum => - timestampNTZFormatter.parseWithoutTimeZone(datum, false) - } - - case _: DateType => (d: String) => + case _: TimestampType => (d: String) => nullSafeDatum(d, name, nullable, options) { datum => try { - dateFormatter.parse(datum) + timestampFormatter.parse(datum) } catch { case NonFatal(e) => // If fails to parse, then tries the way used in 2.0 and 1.x for backwards // compatibility. val str = DateTimeUtils.cleanLegacyTimestampStr(UTF8String.fromString(datum)) - DateTimeUtils.stringToDate(str).getOrElse(throw e) + DateTimeUtils.stringToTimestamp(str, options.zoneId).getOrElse { + // There may be date type entries in timestamp column due to schema inference + if (options.inferDate) { + daysToMicros(dateFormatter.parse(datum), options.zoneId) + } else { + throw(e) + } + } + } + } + + case _: TimestampNTZType => (d: String) => + nullSafeDatum(d, name, nullable, options) { datum => + try { + timestampNTZFormatter.parseWithoutTimeZone(datum, false) + } catch { + case NonFatal(e) if (options.inferDate) => + daysToMicros(dateFormatter.parse(datum), options.zoneId) Review Comment: I think zoneId should probably be UTC for timestamp_ntz. 
Otherwise, you end up with oddities like this: ``` scala> sql("set spark.sql.timestampType=TIMESTAMP_NTZ") res0: org.apache.spark.sql.DataFrame = [key: string, value: string] scala> val options = Map( "inferSchema" -> "true", "timestampFormat" -> "yyyy/MM/dd HH:mm:ss", "timestampNTZFormat" -> "yyyy-MM-dd'T'HH:mm:ss", "dateFormat" -> "yyyy-MM-dd", "inferDate" -> "true") options: scala.collection.immutable.Map[String,String] = Map(inferSchema -> true, timestampFormat -> yyyy/MM/dd HH:mm:ss, timestampNTZFormat -> yyyy-MM-dd'T'HH:mm:ss, dateFormat -> yyyy-MM-dd, inferDate -> true) scala> scala> val csvInput = Seq("2022-01-01T00:00:00", "2022-06-22").toDS() csvInput: org.apache.spark.sql.Dataset[String] = [value: string] scala> val df = spark.read.options(options).csv(csvInput) df: org.apache.spark.sql.DataFrame = [_c0: timestamp_ntz] scala> df.show(false) +-------------------+ |_c0 | +-------------------+ |2022-01-01 00:00:00| |2022-06-22 07:00:00| +-------------------+ scala> ``` Note that `2022-06-22` becomes `2022-06-22 07:00:00`. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org For queries about this service, please contact Infrastructure at: users@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org For additional commands, e-mail: reviews-help@spark.apache.org