This is an automated email from the ASF dual-hosted git repository. weichenxu123 pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.0 by this push: new cb890d9 Revert "[SPARK-30791][SQL][PYTHON] Add 'sameSemantics' and 'sementicHash' methods in Dataset" cb890d9 is described below commit cb890d96bc38860988dba97efaf6d88cc8c09288 Author: WeichenXu <weichen...@databricks.com> AuthorDate: Tue Feb 18 10:41:49 2020 +0800 Revert "[SPARK-30791][SQL][PYTHON] Add 'sameSemantics' and 'sementicHash' methods in Dataset" This reverts commit ba9141592d0f0ce23c207efb21ae84ac7cc4670a. --- python/pyspark/sql/dataframe.py | 46 ---------------------- python/pyspark/sql/tests/test_dataframe.py | 5 --- .../main/scala/org/apache/spark/sql/Dataset.scala | 28 ------------- .../scala/org/apache/spark/sql/DatasetSuite.scala | 15 ------- 4 files changed, 94 deletions(-) diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 8325b68..2432b81 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -2153,52 +2153,6 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin): "should have been DataFrame." % type(result) return result - @since(3.1) - def sameSemantics(self, other): - """ - Returns `True` when the logical query plans inside both :class:`DataFrame`\\s are equal and - therefore return same results. - - .. note:: The equality comparison here is simplified by tolerating the cosmetic differences - such as attribute names. - - .. note:: This API can compare both :class:`DataFrame`\\s very fast but can still return - `False` on the :class:`DataFrame` that return the same results, for instance, from - different plans. Such false negative semantic can be useful when caching as an example. - - .. 
note:: DeveloperApi - - >>> df1 = spark.range(10) - >>> df2 = spark.range(10) - >>> df1.withColumn("col1", df1.id * 2).sameSemantics(df2.withColumn("col1", df2.id * 2)) - True - >>> df1.withColumn("col1", df1.id * 2).sameSemantics(df2.withColumn("col1", df2.id + 2)) - False - >>> df1.withColumn("col1", df1.id * 2).sameSemantics(df2.withColumn("col0", df2.id * 2)) - True - """ - if not isinstance(other, DataFrame): - raise ValueError("other parameter should be of DataFrame; however, got %s" - % type(other)) - return self._jdf.sameSemantics(other._jdf) - - @since(3.1) - def semanticHash(self): - """ - Returns a hash code of the logical query plan against this :class:`DataFrame`. - - .. note:: Unlike the standard hash code, the hash is calculated against the query plan - simplified by tolerating the cosmetic differences such as attribute names. - - .. note:: DeveloperApi - - >>> spark.range(10).selectExpr("id as col0").semanticHash() # doctest: +SKIP - 1855039936 - >>> spark.range(10).selectExpr("id as col1").semanticHash() # doctest: +SKIP - 1855039936 - """ - return self._jdf.semanticHash() - where = copy_func( filter, sinceversion=1.3, diff --git a/python/pyspark/sql/tests/test_dataframe.py b/python/pyspark/sql/tests/test_dataframe.py index 942cd4b..d738449 100644 --- a/python/pyspark/sql/tests/test_dataframe.py +++ b/python/pyspark/sql/tests/test_dataframe.py @@ -782,11 +782,6 @@ class DataFrameTests(ReusedSQLTestCase): break self.assertEqual(df.take(8), result) - def test_same_semantics_error(self): - with QuietTest(self.sc): - with self.assertRaisesRegexp(ValueError, "should be of DataFrame.*int"): - self.spark.range(10).sameSemantics(1) - class QueryExecutionListenerTests(unittest.TestCase, SQLTestUtils): # These tests are separate because it uses 'spark.sql.queryExecutionListeners' which is diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 5cd2583..42f3535 100644 --- 
a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -3310,34 +3310,6 @@ class Dataset[T] private[sql]( files.toSet.toArray } - /** - * Returns `true` when the logical query plans inside both [[Dataset]]s are equal and - * therefore return same results. - * - * @note The equality comparison here is simplified by tolerating the cosmetic differences - * such as attribute names. - * @note This API can compare both [[Dataset]]s very fast but can still return `false` on - * the [[Dataset]] that return the same results, for instance, from different plans. Such - * false negative semantic can be useful when caching as an example. - * @since 3.1.0 - */ - @DeveloperApi - def sameSemantics(other: Dataset[T]): Boolean = { - queryExecution.analyzed.sameResult(other.queryExecution.analyzed) - } - - /** - * Returns a `hashCode` of the logical query plan against this [[Dataset]]. - * - * @note Unlike the standard `hashCode`, the hash is calculated against the query plan - * simplified by tolerating the cosmetic differences such as attribute names. 
- * @since 3.1.0 - */ - @DeveloperApi - def semanticHash(): Int = { - queryExecution.analyzed.semanticHash() - } - //////////////////////////////////////////////////////////////////////////// // For Python API //////////////////////////////////////////////////////////////////////////// diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala index b4ed4ec..b0bd612 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala @@ -1909,21 +1909,6 @@ class DatasetSuite extends QueryTest assert(active eq SparkSession.getActiveSession.get) } - - test("SPARK-30791: sameSemantics and semanticHash work") { - val df1 = Seq((1, 2), (4, 5)).toDF("col1", "col2") - val df2 = Seq((1, 2), (4, 5)).toDF("col1", "col2") - val df3 = Seq((0, 2), (4, 5)).toDF("col1", "col2") - val df4 = Seq((0, 2), (4, 5)).toDF("col0", "col2") - - assert(df1.sameSemantics(df2) === true) - assert(df1.sameSemantics(df3) === false) - assert(df3.sameSemantics(df4) === true) - - assert(df1.semanticHash === df2.semanticHash) - assert(df1.semanticHash !== df3.semanticHash) - assert(df3.semanticHash === df4.semanticHash) - } } object AssertExecutionId { --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org For additional commands, e-mail: commits-help@spark.apache.org