liangz1 commented on a change in pull request #27565: [WIP][SPARK-30791][SQL][PYTHON] Add 'sameSemantics' and 'sementicHash' methods in Dataset
URL: https://github.com/apache/spark/pull/27565#discussion_r379882384


##########
File path: python/pyspark/sql/dataframe.py
##########
@@ -2153,6 +2153,59 @@ def transform(self, func):
                 "should have been DataFrame." % type(result)
             return result

+    @since(3.1)
+    def sameSemantics(self, other):
+        """
+        Returns `True` when the logical query plans inside both :class:`DataFrame`\\s are equal
+        and therefore return the same results.
+
+        .. note:: The equality comparison here is simplified by tolerating cosmetic differences
+            such as attribute names.
+
+        .. note:: This API can compare both :class:`DataFrame`\\s very quickly but can still
+            return `False` on :class:`DataFrame`\\s that return the same results, for instance,
+            from different plans. Such false negatives can be acceptable, e.g., when this is
+            used for caching.
+
+        >>> df1 = spark.range(100)
+        >>> df2 = spark.range(100)
+        >>> df3 = spark.range(100)
+        >>> df4 = spark.range(100)
+        >>> df1.withColumn("col1", df1.id * 2).sameSemantics(df2.withColumn("col1", df2.id * 2))
+        True
+        >>> df1.withColumn("col1", df1.id * 2).sameSemantics(df3.withColumn("col1", df3.id + 2))
+        False
+        >>> df1.withColumn("col1", df1.id * 2).sameSemantics(df4.withColumn("col0", df4.id * 2))
+        True
+        """
+        if not isinstance(other, DataFrame):
+            raise ValueError("other parameter should be a DataFrame; however, got %s"
+                             % type(other))
+        return self._jdf.sameSemantics(other._jdf)
+
+    @since(3.1)
+    def semanticHash(self):
+        """
+        Returns a hash code of the logical query plan of this :class:`DataFrame`.
+
+        .. note:: Unlike the standard hash code, the hash is calculated against the query plan
+            simplified by tolerating cosmetic differences such as attribute names.
+
+        >>> df1 = spark.range(100)
+        >>> df2 = spark.range(100)
+        >>> df3 = spark.range(100)
+        >>> df4 = spark.range(100)
+        >>> df1.withColumn("col1", df1.id * 2).semanticHash() == \\
+        ...     df2.withColumn("col1", df2.id * 2).semanticHash()
+        True
+        >>> df1.withColumn("col1", df1.id * 2).semanticHash() == \\
+        ...     df3.withColumn("col1", df3.id + 2).semanticHash()
+        False

Review comment:
   Same behavior for a DataFrame from `spark.read.load()`:
   ```
   >>> df4 = spark.read.load(csv_file_path, format="csv", inferSchema="true", header="true")
   >>> df4.schema
   StructType(List(StructField(bool_col,BooleanType,true),StructField(float_col,DoubleType,true),StructField(double_col,DoubleType,true),StructField(int_col,IntegerType,true),StructField(long_col,IntegerType,true)))
   >>> df4.withColumn("col1", df4.int_col * 2).semanticHash()
   -1746346451
   >>> df4.withColumn("col1", df4.int_col + 2).semanticHash()
   -1746346451
   ```
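   Aside: the docstring's note about caching suggests a pattern like the minimal sketch below, which is not part of the PR. It uses `semanticHash()` to bucket plans and `sameSemantics()` to confirm a match before reusing a result; the confirmation step matters because different plans can share a hash, as the `spark.read.load()` output above shows. The `cached_collect` helper and `_plan_cache` dict are hypothetical names for illustration only:
   ```
   from pyspark.sql import SparkSession

   spark = SparkSession.builder.getOrCreate()

   # Hypothetical plan-level result cache: semanticHash() -> [(DataFrame, rows)].
   _plan_cache = {}

   def cached_collect(df):
       """Collect `df`, reusing the result of any semantically equal cached plan."""
       key = df.semanticHash()
       for cached_df, rows in _plan_cache.get(key, []):
           # Different plans may collide on the hash, so verify before reusing.
           if df.sameSemantics(cached_df):
               return rows
       rows = df.collect()
       _plan_cache.setdefault(key, []).append((df, rows))
       return rows

   df1 = spark.range(100)
   df2 = spark.range(100)
   rows1 = cached_collect(df1.withColumn("col1", df1.id * 2))  # computed
   rows2 = cached_collect(df2.withColumn("col1", df2.id * 2))  # served from cache
   ```
   Because `sameSemantics` tolerates only false negatives, a cache built this way can at worst recompute a result; it should never return rows for a plan that is not semantically equal.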
########## File path: python/pyspark/sql/dataframe.py ########## @@ -2153,6 +2153,59 @@ def transform(self, func): "should have been DataFrame." % type(result) return result + @since(3.1) + def sameSemantics(self, other): + """ + Returns `True` when the logical query plans inside both :class:`DataFrame`\\s are equal and + therefore return same results. + + .. note:: The equality comparison here is simplified by tolerating the cosmetic differences + such as attribute names. + + .. note::This API can compare both :class:`DataFrame`\\s very fast but can still return + `False` on the :class:`DataFrame` that return the same results, for instance, from + different plans. Such false negative semantic can be useful when caching as an example. + + >>> df1 = spark.range(100) + >>> df2 = spark.range(100) + >>> df3 = spark.range(100) + >>> df4 = spark.range(100) + >>> df1.withColumn("col1", df1.id * 2).sameSemantics(df2.withColumn("col1", df2.id * 2)) + True + >>> df1.withColumn("col1", df1.id * 2).sameSemantics(df3.withColumn("col1", df3.id + 2)) + False + >>> df1.withColumn("col1", df1.id * 2).sameSemantics(df4.withColumn("col0", df4.id * 2)) + True + """ + if not isinstance(other, DataFrame): + raise ValueError("other parameter should be of DataFrame; however, got %s" + % type(other)) + return self._jdf.sameSemantics(other._jdf) + + @since(3.1) + def semanticHash(self): + """ + Returns a hash code of the logical query plan against this :class:`DataFrame`. + + .. note:: Unlike the standard hash code, the hash is calculated against the query plan + simplified by tolerating the cosmetic differences such as attribute names. + + >>> df1 = spark.range(100) + >>> df2 = spark.range(100) + >>> df3 = spark.range(100) + >>> df4 = spark.range(100) + >>> df1.withColumn("col1", df1.id * 2).semanticHash() == \ + df2.withColumn("col1", df2.id * 2).semanticHash() + True + >>> df1.withColumn("col1", df1.id * 2).semanticHash() == \ + df3.withColumn("col1", df3.id + 2).semanticHash() + False Review comment: Same behavior for dataframe from `spark.read.load()` ``` >>> df4=spark.read.load(csv_file_path, format="csv", inferSchema="true", header="true") >>> df4.schema StructType(List(StructField(bool_col,BooleanType,true),StructField(float_col,DoubleType,true),StructField(double_col,DoubleType,true),StructField(int_col,IntegerType,true),StructField(long_col,IntegerType,true))) >>> df4.withColumn("col1", df4.int_col *2).semanticHash() -1746346451 >>> df4.withColumn("col1", df4.int_col +2).semanticHash() -1746346451 ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org