[spark] branch master updated: [SPARK-36894][SPARK-37077][PYTHON] Synchronize RDD.toDF annotations with SparkSession and SQLContext .createDataFrame variants
This is an automated email from the ASF dual-hosted git repository.

sarutak pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new b874bf5  [SPARK-36894][SPARK-37077][PYTHON] Synchronize RDD.toDF annotations with SparkSession and SQLContext .createDataFrame variants
b874bf5 is described below

commit b874bf5dca4f1b7272f458350eb153e7b272f8c8
Author: zero323
AuthorDate: Thu Nov 4 02:06:48 2021 +0900

[SPARK-36894][SPARK-37077][PYTHON] Synchronize RDD.toDF annotations with SparkSession and SQLContext .createDataFrame variants

### What changes were proposed in this pull request?

This pull request synchronizes the `RDD.toDF` annotations with the `SparkSession.createDataFrame` and `SQLContext.createDataFrame` variants.

Additionally, it fixes a recent regression in `SQLContext.createDataFrame` (SPARK-37077), where an `RDD` was no longer considered a valid input.

### Why are the changes needed?

- Adds support for providing a `str` schema.
- Adds support for converting `RDD`s of "atomic" values, if a schema is provided.

Additionally, it introduces a `TypeVar` representing the supported "atomic" values. This was done to avoid an issue with manual data tests, where the following

```python
sc.parallelize([1]).toDF(schema=IntegerType())
```

results in

```
error: No overload variant of "toDF" of "RDD" matches argument type "IntegerType"  [call-overload]
note: Possible overload variants:
note:     def toDF(self, schema: Union[List[str], Tuple[str, ...], None] = ..., sampleRatio: Optional[float] = ...) -> DataFrame
note:     def toDF(self, schema: Union[StructType, str, None] = ...) -> DataFrame
```

when a `Union` type is used (this problem doesn't surface when a non-self bound is used).

### Does this PR introduce _any_ user-facing change?

Type checker only. Please note that these annotations serve primarily to support documentation, as checks on `self` types are still very limited.

### How was this patch tested?

Existing tests and manual data tests.

__Note__: Updated the data tests to reflect the new expected traceback after the reversal in #34477.

Closes #34478 from zero323/SPARK-36894.

Authored-by: zero323
Signed-off-by: Kousuke Saruta
---
 python/pyspark/rdd.pyi                           | 15 ++---
 python/pyspark/sql/_typing.pyi                   | 11 +++
 python/pyspark/sql/context.py                    | 38 ++
 python/pyspark/sql/session.py                    | 40 +---
 python/pyspark/sql/tests/typing/test_session.yml |  8 ++---
 5 files changed, 71 insertions(+), 41 deletions(-)

diff --git a/python/pyspark/rdd.pyi b/python/pyspark/rdd.pyi
index a810a2c..84481d3 100644
--- a/python/pyspark/rdd.pyi
+++ b/python/pyspark/rdd.pyi
@@ -55,8 +55,8 @@ from pyspark.resource.requests import (  # noqa: F401
 from pyspark.resource.profile import ResourceProfile
 from pyspark.statcounter import StatCounter
 from pyspark.sql.dataframe import DataFrame
-from pyspark.sql.types import StructType
-from pyspark.sql._typing import RowLike
+from pyspark.sql.types import AtomicType, StructType
+from pyspark.sql._typing import AtomicValue, RowLike
 from py4j.java_gateway import JavaObject  # type: ignore[import]
 
 T = TypeVar("T")
@@ -445,11 +445,18 @@ class RDD(Generic[T]):
     @overload
     def toDF(
         self: RDD[RowLike],
-        schema: Optional[List[str]] = ...,
+        schema: Optional[Union[List[str], Tuple[str, ...]]] = ...,
         sampleRatio: Optional[float] = ...,
     ) -> DataFrame: ...
     @overload
-    def toDF(self: RDD[RowLike], schema: Optional[StructType] = ...) -> DataFrame: ...
+    def toDF(
+        self: RDD[RowLike], schema: Optional[Union[StructType, str]] = ...
+    ) -> DataFrame: ...
+    @overload
+    def toDF(
+        self: RDD[AtomicValue],
+        schema: Union[AtomicType, str],
+    ) -> DataFrame: ...
 
 class RDDBarrier(Generic[T]):
     rdd: RDD[T]

diff --git a/python/pyspark/sql/_typing.pyi b/python/pyspark/sql/_typing.pyi
index 1a3bd8f..b6b4606 100644
--- a/python/pyspark/sql/_typing.pyi
+++ b/python/pyspark/sql/_typing.pyi
@@ -42,6 +42,17 @@ AtomicDataTypeOrString = Union[pyspark.sql.types.AtomicType, str]
 DataTypeOrString = Union[pyspark.sql.types.DataType, str]
 OptionalPrimitiveType = Optional[PrimitiveType]
 
+AtomicValue = TypeVar(
+    "AtomicValue",
+    datetime.datetime,
+    datetime.date,
+    decimal.Decimal,
+    bool,
+    str,
+    int,
+    float,
+)
+
 RowLike = TypeVar("RowLike", List[Any], Tuple[Any, ...], pyspark.sql.types.Row)
 
 class SupportsOpen(Protocol):

diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py
index 7d27c55..eba2087 100644
--- a/python/pyspark/sql/context.py
+++ b/python/pyspark/sql/context.py
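As a quick illustration of what the widened `toDF` signatures are meant to accept, here is a minimal, self-contained sketch (not part of the commit; the app name and sample data are arbitrary) that exercises each overload against public PySpark APIs:

```python
from pyspark.sql import Row, SparkSession
from pyspark.sql.types import IntegerType

spark = SparkSession.builder.master("local[1]").appName("toDF-overloads").getOrCreate()
sc = spark.sparkContext

# First overload, widened: column names may now be a tuple, not just a list.
df1 = sc.parallelize([(1, "a"), (2, "b")]).toDF(("id", "val"))

# Second overload, widened: a DDL string is accepted alongside StructType.
df2 = sc.parallelize([Row(id=1, val="a"), Row(id=2, val="b")]).toDF("id bigint, val string")

# New third overload: an RDD of "atomic" values with an explicit AtomicType --
# the exact call from the commit message that previously failed to type-check.
df3 = sc.parallelize([1, 2, 3]).toDF(schema=IntegerType())

df3.show()
spark.stop()
```

All three calls already worked at runtime; the commit only teaches the type checker about them (and, via the SPARK-37077 fix, restores `RDD` as a valid `SQLContext.createDataFrame` input).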
[spark] branch master updated: [SPARK-36894][SPARK-37077][PYTHON] Synchronize RDD.toDF annotations with SparkSession and SQLContext .createDataFrame variants
This is an automated email from the ASF dual-hosted git repository.

zero323 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 855da09  [SPARK-36894][SPARK-37077][PYTHON] Synchronize RDD.toDF annotations with SparkSession and SQLContext .createDataFrame variants
855da09 is described below

commit 855da09f02f3007a2c36e7a738d4dc81fd95569a
Author: zero323
AuthorDate: Wed Nov 3 13:10:48 2021 +0100

[SPARK-36894][SPARK-37077][PYTHON] Synchronize RDD.toDF annotations with SparkSession and SQLContext .createDataFrame variants

### What changes were proposed in this pull request?

This pull request synchronizes the `RDD.toDF` annotations with the `SparkSession.createDataFrame` and `SQLContext.createDataFrame` variants.

Additionally, it fixes a recent regression in `SQLContext.createDataFrame` (SPARK-37077), where an `RDD` was no longer considered a valid input.

### Why are the changes needed?

- Adds support for providing a `str` schema.
- Adds support for converting `RDD`s of "atomic" values, if a schema is provided.

Additionally, it introduces a `TypeVar` representing the supported "atomic" values. This was done to avoid an issue with manual data tests, where the following

```python
sc.parallelize([1]).toDF(schema=IntegerType())
```

results in

```
error: No overload variant of "toDF" of "RDD" matches argument type "IntegerType"  [call-overload]
note: Possible overload variants:
note:     def toDF(self, schema: Union[List[str], Tuple[str, ...], None] = ..., sampleRatio: Optional[float] = ...) -> DataFrame
note:     def toDF(self, schema: Union[StructType, str, None] = ...) -> DataFrame
```

when a `Union` type is used (this problem doesn't surface when a non-self bound is used).

### Does this PR introduce _any_ user-facing change?

Type checker only. Please note that these annotations serve primarily to support documentation, as checks on `self` types are still very limited.

### How was this patch tested?

Existing tests and manual data tests.

Closes #34146 from zero323/SPARK-36894.

Authored-by: zero323
Signed-off-by: zero323
---
 python/pyspark/rdd.pyi         | 15 +++
 python/pyspark/sql/_typing.pyi | 11 +++
 python/pyspark/sql/context.py  | 38 --
 python/pyspark/sql/session.py  | 40 +---
 4 files changed, 67 insertions(+), 37 deletions(-)

diff --git a/python/pyspark/rdd.pyi b/python/pyspark/rdd.pyi
index a810a2c..84481d3 100644
--- a/python/pyspark/rdd.pyi
+++ b/python/pyspark/rdd.pyi
@@ -55,8 +55,8 @@ from pyspark.resource.requests import (  # noqa: F401
 from pyspark.resource.profile import ResourceProfile
 from pyspark.statcounter import StatCounter
 from pyspark.sql.dataframe import DataFrame
-from pyspark.sql.types import StructType
-from pyspark.sql._typing import RowLike
+from pyspark.sql.types import AtomicType, StructType
+from pyspark.sql._typing import AtomicValue, RowLike
 from py4j.java_gateway import JavaObject  # type: ignore[import]
 
 T = TypeVar("T")
@@ -445,11 +445,18 @@ class RDD(Generic[T]):
     @overload
     def toDF(
         self: RDD[RowLike],
-        schema: Optional[List[str]] = ...,
+        schema: Optional[Union[List[str], Tuple[str, ...]]] = ...,
         sampleRatio: Optional[float] = ...,
     ) -> DataFrame: ...
     @overload
-    def toDF(self: RDD[RowLike], schema: Optional[StructType] = ...) -> DataFrame: ...
+    def toDF(
+        self: RDD[RowLike], schema: Optional[Union[StructType, str]] = ...
+    ) -> DataFrame: ...
+    @overload
+    def toDF(
+        self: RDD[AtomicValue],
+        schema: Union[AtomicType, str],
+    ) -> DataFrame: ...
 
 class RDDBarrier(Generic[T]):
     rdd: RDD[T]

diff --git a/python/pyspark/sql/_typing.pyi b/python/pyspark/sql/_typing.pyi
index 1a3bd8f..b6b4606 100644
--- a/python/pyspark/sql/_typing.pyi
+++ b/python/pyspark/sql/_typing.pyi
@@ -42,6 +42,17 @@ AtomicDataTypeOrString = Union[pyspark.sql.types.AtomicType, str]
 DataTypeOrString = Union[pyspark.sql.types.DataType, str]
 OptionalPrimitiveType = Optional[PrimitiveType]
 
+AtomicValue = TypeVar(
+    "AtomicValue",
+    datetime.datetime,
+    datetime.date,
+    decimal.Decimal,
+    bool,
+    str,
+    int,
+    float,
+)
+
 RowLike = TypeVar("RowLike", List[Any], Tuple[Any, ...], pyspark.sql.types.Row)
 
 class SupportsOpen(Protocol):

diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py
index 7d27c55..eba2087 100644
--- a/python/pyspark/sql/context.py
+++ b/python/pyspark/sql/context.py
@@ -48,13 +48,11 @@ from pyspark.conf import SparkConf
 
 if TYPE_CHECKING:
     from pyspark.sql._typing import (
-        UserDefinedFunctionLike,
+        AtomicValue,
         RowLike,
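For background on the `Union` versus value-restricted `TypeVar` distinction the commit message leans on, here is a standalone sketch (a hypothetical `Box` class, not Spark code): because the generic parameter is invariant, a `self` type written with a `Union` never matches a concrete `Box[int]`, while a value-restricted `TypeVar` is solved to one of its constraint types and does — which is why `AtomicValue` is a `TypeVar` rather than a `Union`:

```python
from typing import Generic, TypeVar, Union

T = TypeVar("T")
# Value-restricted TypeVar, analogous to AtomicValue: the checker
# solves it to exactly one of the listed constraint types.
AtomicLike = TypeVar("AtomicLike", int, float, str, bool)
AtomicUnion = Union[int, float, str, bool]

class Box(Generic[T]):
    def __init__(self, value: T) -> None:
        self.value = value

    def via_typevar(self: "Box[AtomicLike]") -> str:
        return repr(self.value)

    def via_union(self: "Box[AtomicUnion]") -> str:
        return repr(self.value)

b = Box(1)        # inferred as Box[int]
b.via_typevar()   # OK: AtomicLike is solved to int
b.via_union()     # mypy: invalid self argument -- Box[int] is not
                  # Box[Union[int, float, str, bool]], since T is invariant
```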