(spark) branch master updated: [SPARK-48215][SQL] Extending support for collated strings on date_format expression

wenchen Wed, 22 May 2024 04:28:22 -0700

This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new e04d3d7c430a [SPARK-48215][SQL] Extending support for collated strings 
on date_format expression
e04d3d7c430a is described below

commit e04d3d7c430a1fa446f0379680f619b8b14b5eb5
Author: Nebojsa Savic <nebojsa.sa...@databricks.com>
AuthorDate: Wed May 22 04:28:06 2024 -0700

    [SPARK-48215][SQL] Extending support for collated strings on date_format 
expression
    
    ### What changes were proposed in this pull request?
    We are extending support for collated strings on the date_format function, 
since it currently throws a DATATYPE_MISMATCH exception when collated strings 
are passed as the "format" parameter. 
https://docs.databricks.com/en/sql/language-manual/functions/date_format.html
    
    ### Why are the changes needed?
    An exception is thrown on invocation when collated strings are passed as 
arguments to date_format.
    
    ### Does this PR introduce _any_ user-facing change?
    No user-facing changes; this only extends existing support.
    
    ### How was this patch tested?
    Tests are added with this PR.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    No.
    
    Closes #46561 from nebojsa-db/SPARK-48215.
    
    Authored-by: Nebojsa Savic <nebojsa.sa...@databricks.com>
    Signed-off-by: Wenchen Fan <wenc...@databricks.com>
---
 .../catalyst/expressions/datetimeExpressions.scala |  5 ++--
 .../spark/sql/CollationSQLExpressionsSuite.scala   | 32 ++++++++++++++++++++++
 2 files changed, 35 insertions(+), 2 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala
index 081a42f5608e..8caf8c5d48c2 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala
@@ -36,6 +36,7 @@ import org.apache.spark.sql.catalyst.util.DateTimeUtils._
 import org.apache.spark.sql.catalyst.util.LegacyDateFormats.SIMPLE_DATE_FORMAT
 import org.apache.spark.sql.errors.{QueryCompilationErrors, 
QueryExecutionErrors}
 import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.internal.types.StringTypeAnyCollation
 import org.apache.spark.sql.types._
 import org.apache.spark.sql.types.DayTimeIntervalType.DAY
 import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String}
@@ -951,9 +952,9 @@ case class DateFormatClass(left: Expression, right: 
Expression, timeZoneId: Opti
 
   def this(left: Expression, right: Expression) = this(left, right, None)
 
-  override def dataType: DataType = StringType
+  override def dataType: DataType = SQLConf.get.defaultStringType
 
-  override def inputTypes: Seq[AbstractDataType] = Seq(TimestampType, 
StringType)
+  override def inputTypes: Seq[AbstractDataType] = Seq(TimestampType, 
StringTypeAnyCollation)
 
   override def withTimeZone(timeZoneId: String): TimeZoneAwareExpression =
     copy(timeZoneId = Option(timeZoneId))
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala
 
b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala
index 0d48f9f0a88d..828245bb3fdd 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala
@@ -1600,6 +1600,38 @@ class CollationSQLExpressionsSuite
     })
   }
 
+  test("DateFormat expression with collation") {
+    case class DateFormatTestCase[R](date: String, format: String, collation: 
String, result: R)
+    val testCases = Seq(
+      DateFormatTestCase("2021-01-01", "yyyy-MM-dd", "UTF8_BINARY", 
"2021-01-01"),
+      DateFormatTestCase("2021-01-01", "yyyy-dd", "UTF8_BINARY_LCASE", 
"2021-01"),
+      DateFormatTestCase("2021-01-01", "yyyy-MM-dd", "UNICODE", "2021-01-01"),
+      DateFormatTestCase("2021-01-01", "yyyy", "UNICODE_CI", "2021")
+    )
+
+    for {
+      collateDate <- Seq(true, false)
+      collateFormat <- Seq(true, false)
+    } {
+      testCases.foreach(t => {
+        val dateArg = if (collateDate) s"collate('${t.date}', 
'${t.collation}')" else s"'${t.date}'"
+        val formatArg =
+          if (collateFormat) {
+            s"collate('${t.format}', '${t.collation}')"
+          } else {
+            s"'${t.format}'"
+          }
+
+        withSQLConf(SqlApiConf.DEFAULT_COLLATION -> t.collation) {
+          val query = s"SELECT date_format(${dateArg}, ${formatArg})"
+          // Result & data type
+          checkAnswer(sql(query), Row(t.result))
+          
assert(sql(query).schema.fields.head.dataType.sameType(StringType(t.collation)))
+        }
+      })
+    }
+  }
+
   // TODO: Add more tests for other SQL expressions
 
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

(spark) branch master updated: [SPARK-48215][SQL] Extending support for collated strings on date_format expression

Reply via email to