Repository: spark
Updated Branches:
  refs/heads/branch-1.4 451c8722a -> 65981619b


[SPARK-8420] [SQL] Fix comparison of timestamps/dates with strings (branch-1.4)

This is a branch-1.4 backport of https://github.com/apache/spark/pull/6888.

Below is the original description.

In earlier versions of Spark SQL we cast `TimestampType` and `DateType` to
`StringType` whenever either was involved in a binary comparison with a
`StringType`. This allowed comparing a timestamp with a partial date, as a
user would expect (a short sketch follows these examples):
 - `time > "2014-06-10"`
 - `time > "2014"`
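
A minimal plain-Scala sketch of why this works (illustration only, not part
of this patch): ISO-formatted timestamp strings sort lexicographically in
chronological order, and a partial date is simply a prefix of the full string.

    import java.sql.Timestamp

    // Illustration only: lexicographic order on ISO timestamp strings
    // coincides with chronological order.
    val t = Timestamp.valueOf("2014-06-10 12:30:00").toString
    // t == "2014-06-10 12:30:00.0"
    t > "2014-06-10"  // true: t extends the prefix "2014-06-10"
    t > "2014"        // true: first four chars tie, t is longer
    t > "2015"        // false: '4' < '5' at index 3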

In 1.4.0 we instead tried to cast the string to a timestamp. However, since a
partial date is not a valid complete timestamp, the cast produces `null`,
which causes the tuple to be filtered out.
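
For illustration (plain Scala, not part of this patch), the failure mode is
easy to reproduce: a partial date does not parse as a complete timestamp, and
Spark SQL's string-to-timestamp cast maps the parse failure to `null`.

    import java.sql.Timestamp
    import scala.util.Try

    // Illustration only: partial dates are not complete timestamps.
    Try(Timestamp.valueOf("2014")).isSuccess                 // false
    Try(Timestamp.valueOf("2014-06-10")).isSuccess           // false: no time part
    Try(Timestamp.valueOf("2014-06-10 00:00:00")).isSuccess  // true
    // The SQL cast turns such parse failures into null, and a null
    // predicate never passes a filter, so every row is dropped.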

This PR restores the earlier behavior. Note that we still special-case
equality so that these comparisons are not affected by the omission of
trailing zeros in the printed subsecond precision.
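
A minimal sketch of why equality is special-cased (plain Scala, illustration
only): a timestamp rendered as a string always carries at least one subsecond
digit, which a user's literal typically omits, so comparing the two as strings
would spuriously report inequality even when the instants are identical.

    import java.sql.Timestamp

    val ts = Timestamp.valueOf("2015-01-01 00:00:00")
    ts.toString                                     // "2015-01-01 00:00:00.0"
    ts.toString == "2015-01-01 00:00:00"            // false as strings
    ts == Timestamp.valueOf("2015-01-01 00:00:00")  // true as timestamps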

Author: Michael Armbrust <michael@databricks.com>

Closes #6888 from marmbrus/timeCompareString and squashes the following commits:

bdef29c [Michael Armbrust] test partial date
1f09adf [Michael Armbrust] special handling of equality
1172c60 [Michael Armbrust] more test fixing
4dfc412 [Michael Armbrust] fix tests
aaa9508 [Michael Armbrust] newline
04d908f [Michael Armbrust] [SPARK-8420][SQL] Fix comparison of timestamps/dates with strings

Conflicts:
        sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
        sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala

Author: Michael Armbrust <mich...@databricks.com>

Closes #6914 from yhuai/timeCompareString-1.4 and squashes the following commits:

9882915 [Michael Armbrust] [SPARK-8420] [SQL] Fix comparison of timestamps/dates with strings


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/65981619
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/65981619
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/65981619

Branch: refs/heads/branch-1.4
Commit: 65981619b26da03f0c5133133e318a180235e96d
Parents: 451c872
Author: Michael Armbrust <mich...@databricks.com>
Authored: Mon Jun 22 10:45:33 2015 -0700
Committer: Yin Huai <yh...@databricks.com>
Committed: Mon Jun 22 10:45:33 2015 -0700

----------------------------------------------------------------------
 .../catalyst/analysis/HiveTypeCoercion.scala    | 17 ++++--
 .../sql/catalyst/expressions/predicates.scala   |  9 ++++
 .../apache/spark/sql/DataFrameDateSuite.scala   | 56 ++++++++++++++++++++
 .../org/apache/spark/sql/SQLQuerySuite.scala    |  4 ++
 .../scala/org/apache/spark/sql/TestData.scala   |  6 ---
 .../columnar/InMemoryColumnarQuerySuite.scala   |  7 ++-
 6 files changed, 88 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/65981619/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
index fa7968e..6d0f4a0 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
@@ -242,7 +242,16 @@ trait HiveTypeCoercion {
       case a: BinaryArithmetic if a.right.dataType == StringType =>
         a.makeCopy(Array(a.left, Cast(a.right, DoubleType)))
 
-      // we should cast all timestamp/date/string compare into string compare
+      // For equality between string and timestamp we cast the string to a timestamp
+      // so that things like rounding of subsecond precision do not affect the comparison.
+      case p @ Equality(left @ StringType(), right @ TimestampType()) =>
+        p.makeCopy(Array(Cast(left, TimestampType), right))
+      case p @ Equality(left @ TimestampType(), right @ StringType()) =>
+        p.makeCopy(Array(left, Cast(right, TimestampType)))
+
+      // We should cast all relative timestamp/date/string comparisons into string comparisons.
+      // This behaves as a user would expect because timestamp strings sort lexicographically.
+      // i.e. TimeStamp(2013-01-01 00:00 ...) < "2014" = true
       case p: BinaryComparison if p.left.dataType == StringType &&
                                   p.right.dataType == DateType =>
         p.makeCopy(Array(p.left, Cast(p.right, StringType)))
@@ -251,10 +260,12 @@ trait HiveTypeCoercion {
         p.makeCopy(Array(Cast(p.left, StringType), p.right))
       case p: BinaryComparison if p.left.dataType == StringType &&
                                   p.right.dataType == TimestampType =>
-        p.makeCopy(Array(Cast(p.left, TimestampType), p.right))
+        p.makeCopy(Array(p.left, Cast(p.right, StringType)))
       case p: BinaryComparison if p.left.dataType == TimestampType &&
                                   p.right.dataType == StringType =>
-        p.makeCopy(Array(p.left, Cast(p.right, TimestampType)))
+        p.makeCopy(Array(Cast(p.left, StringType), p.right))
+
+      // Comparisons between dates and timestamps.
       case p: BinaryComparison if p.left.dataType == TimestampType &&
                                   p.right.dataType == DateType =>
         p.makeCopy(Array(Cast(p.left, StringType), Cast(p.right, StringType)))

http://git-wip-us.apache.org/repos/asf/spark/blob/65981619/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
----------------------------------------------------------------------
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
index 1d72a9e..fdd29bf 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
@@ -175,6 +175,15 @@ abstract class BinaryComparison extends BinaryExpression with Predicate {
   self: Product =>
 }
 
+/** An extractor that matches both standard 3VL equality and null-safe equality. */
+private[sql] object Equality {
+  def unapply(e: BinaryComparison): Option[(Expression, Expression)] = e match {
+    case EqualTo(l, r) => Some((l, r))
+    case EqualNullSafe(l, r) => Some((l, r))
+    case _ => None
+  }
+}
+
 case class EqualTo(left: Expression, right: Expression) extends BinaryComparison {
   override def symbol: String = "="
 

http://git-wip-us.apache.org/repos/asf/spark/blob/65981619/sql/core/src/test/scala/org/apache/spark/sql/DataFrameDateSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameDateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameDateSuite.scala
new file mode 100644
index 0000000..a4719a3
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameDateSuite.scala
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql
+
+import java.sql.{Date, Timestamp}
+
+class DataFrameDateTimeSuite extends QueryTest {
+
+  private lazy val ctx = org.apache.spark.sql.test.TestSQLContext
+  import ctx.implicits._
+
+  test("timestamp comparison with date strings") {
+    val df = Seq(
+      (1, Timestamp.valueOf("2015-01-01 00:00:00")),
+      (2, Timestamp.valueOf("2014-01-01 00:00:00"))).toDF("i", "t")
+
+    checkAnswer(
+      df.select("t").filter($"t" <= "2014-06-01"),
+      Row(Timestamp.valueOf("2014-01-01 00:00:00")) :: Nil)
+
+
+    checkAnswer(
+      df.select("t").filter($"t" >= "2014-06-01"),
+      Row(Timestamp.valueOf("2015-01-01 00:00:00")) :: Nil)
+  }
+
+  test("date comparison with date strings") {
+    val df = Seq(
+      (1, Date.valueOf("2015-01-01")),
+      (2, Date.valueOf("2014-01-01"))).toDF("i", "t")
+
+    checkAnswer(
+      df.select("t").filter($"t" <= "2014-06-01"),
+      Row(Date.valueOf("2014-01-01")) :: Nil)
+
+
+    checkAnswer(
+      df.select("t").filter($"t" >= "2015"),
+      Row(Date.valueOf("2015-01-01")) :: Nil)
+  }
+}

http://git-wip-us.apache.org/repos/asf/spark/blob/65981619/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index bf18bf8..8a0679e 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -19,6 +19,8 @@ package org.apache.spark.sql
 
 import org.scalatest.BeforeAndAfterAll
 
+import java.sql.Timestamp
+
 import org.apache.spark.sql.catalyst.DefaultParserDialect
 import org.apache.spark.sql.catalyst.errors.DialectException
 import org.apache.spark.sql.execution.GeneratedAggregate
@@ -311,6 +313,8 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll {
   }
 
   test("SPARK-3173 Timestamp support in the parser") {
+    (0 to 3).map(i => Tuple1(new Timestamp(i))).toDF("time").registerTempTable("timestamps")
+
     checkAnswer(sql(
       "SELECT time FROM timestamps WHERE time='1969-12-31 16:00:00.0'"),
       Row(java.sql.Timestamp.valueOf("1969-12-31 16:00:00")))

http://git-wip-us.apache.org/repos/asf/spark/blob/65981619/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala b/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala
index 725a18b..520a862 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/TestData.scala
@@ -174,12 +174,6 @@ object TestData {
       "3, C3, true, null" ::
       "4, D4, true, 2147483644" :: Nil)
 
-  case class TimestampField(time: Timestamp)
-  val timestamps = TestSQLContext.sparkContext.parallelize((0 to 3).map { i =>
-    TimestampField(new Timestamp(i))
-  })
-  timestamps.toDF().registerTempTable("timestamps")
-
   case class IntField(i: Int)
   // An RDD with 4 elements and 8 partitions
   val withEmptyParts = TestSQLContext.sparkContext.parallelize((1 to 4).map(IntField), 8)

http://git-wip-us.apache.org/repos/asf/spark/blob/65981619/sql/core/src/test/scala/org/apache/spark/sql/columnar/InMemoryColumnarQuerySuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/InMemoryColumnarQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/InMemoryColumnarQuerySuite.scala
index 56591d9..72ad769 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/InMemoryColumnarQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/InMemoryColumnarQuerySuite.scala
@@ -90,15 +90,18 @@ class InMemoryColumnarQuerySuite extends QueryTest {
   }
 
   test("SPARK-2729 regression: timestamp data type") {
+    val timestamps = (0 to 3).map(i => Tuple1(new Timestamp(i))).toDF("time")
+    timestamps.registerTempTable("timestamps")
+
     checkAnswer(
       sql("SELECT time FROM timestamps"),
-      timestamps.collect().toSeq.map(Row.fromTuple))
+      timestamps.collect().toSeq)
 
     cacheTable("timestamps")
 
     checkAnswer(
       sql("SELECT time FROM timestamps"),
-      timestamps.collect().toSeq.map(Row.fromTuple))
+      timestamps.collect().toSeq)
   }
 
   test("SPARK-3320 regression: batched column buffer building should work with 
empty partitions") {

