This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new 840620fbda1 [SPARK-41656][CONNECT][TESTS] Enable doctests in pyspark.sql.connect.dataframe
840620fbda1 is described below

commit 840620fbda1fa7b01c8bea8d327a8b5d96f9f9ad
Author: Sandeep Singh <sand...@techaddict.me>
AuthorDate: Tue Jan 3 11:10:11 2023 +0900

    [SPARK-41656][CONNECT][TESTS] Enable doctests in pyspark.sql.connect.dataframe

    ### What changes were proposed in this pull request?
    This PR proposes to enable doctests in pyspark.sql.connect.dataframe, which is virtually the same as pyspark.sql.dataframe.

    ### Why are the changes needed?
    To ensure PySpark compatibility and test coverage.

    ### Does this PR introduce any user-facing change?
    No, doctests only.

    ### How was this patch tested?
    New doctests added.

    Closes #39346 from techaddict/SPARK-41656-pyspark.sql.connect.dataframe.

    Authored-by: Sandeep Singh <sand...@techaddict.me>
    Signed-off-by: Hyukjin Kwon <gurwls...@apache.org>
---
 dev/sparktestsupport/modules.py         |  1 +
 python/pyspark/sql/connect/dataframe.py | 99 ++++++++++++++++++++++++++++++++-
 python/pyspark/sql/dataframe.py         | 10 ++--
 3 files changed, 104 insertions(+), 6 deletions(-)

diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index 0eeb3dd9218..2c399174c13 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -510,6 +510,7 @@ pyspark_connect = Module(
         "pyspark.sql.connect.window",
         "pyspark.sql.connect.column",
         "pyspark.sql.connect.readwriter",
+        "pyspark.sql.connect.dataframe",
         # unittests
         "pyspark.sql.tests.connect.test_connect_plan",
         "pyspark.sql.tests.connect.test_connect_basic",
diff --git a/python/pyspark/sql/connect/dataframe.py b/python/pyspark/sql/connect/dataframe.py
index 95582e86390..0a69b6317f8 100644
--- a/python/pyspark/sql/connect/dataframe.py
+++ b/python/pyspark/sql/connect/dataframe.py
@@ -35,7 +35,7 @@ import pandas
 import warnings

 from collections.abc import Iterable

-from pyspark import _NoValue
+from pyspark import _NoValue, SparkContext, SparkConf
 from pyspark._globals import _NoValueType
 from pyspark.sql.types import DataType, StructType, Row
@@ -1373,3 +1373,100 @@ class DataFrameStatFunctions:


 DataFrameStatFunctions.__doc__ = PySparkDataFrameStatFunctions.__doc__
+
+
+def _test() -> None:
+    import os
+    import sys
+    import doctest
+    from pyspark.sql import SparkSession as PySparkSession
+    from pyspark.testing.connectutils import should_test_connect, connect_requirement_message
+
+    os.chdir(os.environ["SPARK_HOME"])
+
+    if should_test_connect:
+        import pyspark.sql.connect.dataframe
+
+        globs = pyspark.sql.connect.dataframe.__dict__.copy()
+        # Works around to create a regular Spark session
+        sc = SparkContext("local[4]", "sql.connect.dataframe tests", conf=SparkConf())
+        globs["_spark"] = PySparkSession(
+            sc, options={"spark.app.name": "sql.connect.dataframe tests"}
+        )
+
+        # TODO(SPARK-41819): Implement RDD.getNumPartitions
+        del pyspark.sql.connect.dataframe.DataFrame.coalesce.__doc__
+        del pyspark.sql.connect.dataframe.DataFrame.repartition.__doc__
+
+        # TODO(SPARK-41820): Fix SparkConnectException: requirement failed
+        del pyspark.sql.connect.dataframe.DataFrame.createOrReplaceGlobalTempView.__doc__
+        del pyspark.sql.connect.dataframe.DataFrame.createOrReplaceTempView.__doc__
+
+        # TODO(SPARK-41821): Fix DataFrame.describe
+        del pyspark.sql.connect.dataframe.DataFrame.describe.__doc__
+
+        # TODO(SPARK-41823): ambiguous column names
+        del pyspark.sql.connect.dataframe.DataFrame.drop.__doc__
+        del pyspark.sql.connect.dataframe.DataFrame.join.__doc__
+
+        # TODO(SPARK-41824): DataFrame.explain format is different
+        del pyspark.sql.connect.dataframe.DataFrame.explain.__doc__
+        del pyspark.sql.connect.dataframe.DataFrame.hint.__doc__
+
+        # TODO(SPARK-41825): Dataframe.show formatting int as double
+        del pyspark.sql.connect.dataframe.DataFrame.fillna.__doc__
+        del pyspark.sql.connect.dataframe.DataFrameNaFunctions.replace.__doc__
+        del pyspark.sql.connect.dataframe.DataFrameNaFunctions.fill.__doc__
+        del pyspark.sql.connect.dataframe.DataFrame.replace.__doc__
+        del pyspark.sql.connect.dataframe.DataFrame.intersect.__doc__
+
+        # TODO(SPARK-41826): Implement Dataframe.readStream
+        del pyspark.sql.connect.dataframe.DataFrame.isStreaming.__doc__
+
+        # TODO(SPARK-41827): groupBy requires all cols be Column or str
+        del pyspark.sql.connect.dataframe.DataFrame.groupBy.__doc__
+
+        # TODO(SPARK-41828): Implement creating empty DataFrame
+        del pyspark.sql.connect.dataframe.DataFrame.isEmpty.__doc__
+
+        # TODO(SPARK-41829): Add Dataframe sort ordering
+        del pyspark.sql.connect.dataframe.DataFrame.sort.__doc__
+        del pyspark.sql.connect.dataframe.DataFrame.sortWithinPartitions.__doc__
+
+        # TODO(SPARK-41830): fix sample parameters
+        del pyspark.sql.connect.dataframe.DataFrame.sample.__doc__
+
+        # TODO(SPARK-41831): fix transform to accept ColumnReference
+        del pyspark.sql.connect.dataframe.DataFrame.transform.__doc__
+
+        # TODO(SPARK-41832): fix unionByName
+        del pyspark.sql.connect.dataframe.DataFrame.unionByName.__doc__
+
+        # TODO(SPARK-41818): Support saveAsTable
+        del pyspark.sql.connect.dataframe.DataFrame.write.__doc__
+
+        # Creates a remote Spark session.
+        os.environ["SPARK_REMOTE"] = "sc://localhost"
+        globs["spark"] = PySparkSession.builder.remote("sc://localhost").getOrCreate()
+
+        (failure_count, test_count) = doctest.testmod(
+            pyspark.sql.connect.dataframe,
+            globs=globs,
+            optionflags=doctest.ELLIPSIS
+            | doctest.NORMALIZE_WHITESPACE
+            | doctest.IGNORE_EXCEPTION_DETAIL,
+        )
+
+        globs["spark"].stop()
+        globs["_spark"].stop()
+        if failure_count:
+            sys.exit(-1)
+    else:
+        print(
+            f"Skipping pyspark.sql.connect.dataframe doctests: {connect_requirement_message}",
+            file=sys.stderr,
+        )
+
+
+if __name__ == "__main__":
+    _test()
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 7f05d49bded..e3646cd7d95 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -189,7 +189,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
         --------
         >>> df = spark.range(1)
         >>> type(df.sparkSession)
-        <class 'pyspark.sql.session.SparkSession'>
+        <class '...session.SparkSession'>
         """
         return self._session
@@ -233,7 +233,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
         --------
         >>> df = spark.sql("SELECT 1 AS c1, int(NULL) AS c2")
         >>> type(df.na)
-        <class 'pyspark.sql.dataframe.DataFrameNaFunctions'>
+        <class '...dataframe.DataFrameNaFunctions'>

         Replace the missing values as 2.

@@ -264,7 +264,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):
         >>> import pyspark.sql.functions as f
         >>> df = spark.range(3).withColumn("c", f.expr("id + 1"))
         >>> type(df.stat)
-        <class 'pyspark.sql.dataframe.DataFrameStatFunctions'>
+        <class '...dataframe.DataFrameStatFunctions'>
         >>> df.stat.corr("id", "c")
         1.0
         """
@@ -355,7 +355,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):

         Throw an exception if the table already exists.

-        >>> df.createTempView("people")  # doctest: +IGNORE_EXCEPTION_DETAIL
+        >>> df.createTempView("people")  # doctest: +IGNORE_EXCEPTION_DETAIL, +SKIP
         Traceback (most recent call last):
         ...
         AnalysisException: "Temporary table 'people' already exists;"
@@ -438,7 +438,7 @@ class DataFrame(PandasMapOpsMixin, PandasConversionMixin):

         Throws an exception if the global temporary view already exists.

-        >>> df.createGlobalTempView("people")  # doctest: +IGNORE_EXCEPTION_DETAIL
+        >>> df.createGlobalTempView("people")  # doctest: +IGNORE_EXCEPTION_DETAIL, +SKIP
         Traceback (most recent call last):
         ...
         AnalysisException: "Temporary table 'people' already exists;"

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
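
[Editor's note] The patch above disables not-yet-supported doctests by deleting
the method's __doc__ attribute. The mechanism is plain Python: doctest only
collects examples from docstrings, so removing the docstring removes the test.
A minimal self-contained sketch of the idea -- the Widget class below is a
made-up stand-in, not part of PySpark:

    import doctest


    class Widget:
        def double(self, x):
            """
            >>> Widget().double(2)
            4
            """
            return x * 2


    # Deleting the docstring removes the embedded example, so doctest
    # collects nothing for this method -- the same trick _test() uses to
    # skip Connect doctests blocked on the open TODO tickets above.
    del Widget.double.__doc__

    if __name__ == "__main__":
        failure_count, test_count = doctest.testmod()
        print(test_count)  # prints 0: no doctests were found to run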
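[Editor's note] The <class '...session.SparkSession'> edits in
python/pyspark/sql/dataframe.py lean on doctest's ELLIPSIS flag (enabled in the
testmod() call above): "..." in an expected output matches any text, so one
expected line covers both the classic and the Connect class names. A short
demonstration using doctest's own output checker:

    import doctest

    checker = doctest.OutputChecker()
    want = "<class '...session.SparkSession'>\n"

    # With ELLIPSIS, both class names satisfy the same expected line.
    for got in (
        "<class 'pyspark.sql.session.SparkSession'>\n",
        "<class 'pyspark.sql.connect.session.SparkSession'>\n",
    ):
        print(checker.check_output(want, got, doctest.ELLIPSIS))  # True, True

    # Without the flag, "..." is taken literally and the match fails.
    print(checker.check_output(want, "<class 'pyspark.sql.session.SparkSession'>\n", 0))  # False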
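[Editor's note] The +SKIP directives added to the createTempView and
createGlobalTempView examples tell doctest not to execute those examples at
all, presumably because their failure mode depends on session state that the
shared Connect test setup does not reproduce. A sketch of the directive's
behavior, with a made-up demo() function:

    import doctest


    def demo():
        """
        The first example runs; the second is skipped outright.

        >>> 1 + 1
        2
        >>> raise RuntimeError("never executed")  # doctest: +SKIP
        """


    if __name__ == "__main__":
        failed, attempted = doctest.testmod()
        print(failed)  # prints 0: the skipped example never ran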