This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch branch-3.2 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.2 by this push: new e9cc811 [SPARK-36146][PYTHON][INFRA][TESTS] Upgrade Python version from 3.6 to 3.9 in GitHub Actions' linter/docs e9cc811 is described below commit e9cc81151d3bbe7f2d53d522316e5f6038edb3b8 Author: Hyukjin Kwon <gurwls...@apache.org> AuthorDate: Thu Jul 15 08:01:54 2021 -0700 [SPARK-36146][PYTHON][INFRA][TESTS] Upgrade Python version from 3.6 to 3.9 in GitHub Actions' linter/docs This PR proposes to use Python 3.9 for the documentation build and the linter in GitHub Actions. This PR also contains fixes for the mypy errors surfaced by the Python 3.9 upgrade: ``` python/pyspark/sql/pandas/_typing/protocols/frame.pyi:64: error: Name "np.ndarray" is not defined python/pyspark/sql/pandas/_typing/protocols/frame.pyi:91: error: Name "np.recarray" is not defined python/pyspark/sql/pandas/_typing/protocols/frame.pyi:165: error: Name "np.ndarray" is not defined python/pyspark/pandas/categorical.py:82: error: Item "dtype[Any]" of "Union[dtype[Any], Any]" has no attribute "categories" python/pyspark/pandas/categorical.py:109: error: Item "dtype[Any]" of "Union[dtype[Any], Any]" has no attribute "ordered" python/pyspark/ml/linalg/__init__.pyi:184: error: Return type "ndarray[Any, Any]" of "toArray" incompatible with return type "NoReturn" in supertype "Matrix" python/pyspark/ml/linalg/__init__.pyi:217: error: Return type "ndarray[Any, Any]" of "toArray" incompatible with return type "NoReturn" in supertype "Matrix" python/pyspark/pandas/typedef/typehints.py:163: error: Module has no attribute "bool"; maybe "bool_" or "bool8"? python/pyspark/pandas/typedef/typehints.py:174: error: Module has no attribute "float"; maybe "float_", "cfloat", or "float96"? python/pyspark/pandas/typedef/typehints.py:180: error: Module has no attribute "int"; maybe "uint", "rint", or "intp"? 
python/pyspark/pandas/ml.py:81: error: Value of type variable "_DTypeScalar_co" of "dtype" cannot be "object" python/pyspark/pandas/indexing.py:1649: error: Module has no attribute "int"; maybe "uint", "rint", or "intp"? python/pyspark/pandas/indexing.py:1656: error: Module has no attribute "int"; maybe "uint", "rint", or "intp"? python/pyspark/pandas/frame.py:4969: error: Function "numpy.array" is not valid as a type python/pyspark/pandas/frame.py:4969: note: Perhaps you need "Callable[...]" or a callback protocol? python/pyspark/pandas/frame.py:4970: error: Function "numpy.array" is not valid as a type python/pyspark/pandas/frame.py:4970: note: Perhaps you need "Callable[...]" or a callback protocol? python/pyspark/pandas/frame.py:7402: error: "List[Any]" has no attribute "tolist" python/pyspark/pandas/series.py:1030: error: Module has no attribute "_NoValue" python/pyspark/pandas/series.py:1031: error: Module has no attribute "_NoValue" python/pyspark/pandas/indexes/category.py:159: error: Item "dtype[Any]" of "Union[dtype[Any], Any]" has no attribute "categories" python/pyspark/pandas/indexes/category.py:180: error: Item "dtype[Any]" of "Union[dtype[Any], Any]" has no attribute "ordered" python/pyspark/pandas/namespace.py:2036: error: Argument 1 to "column_name" has incompatible type "float"; expected "str" python/pyspark/pandas/mlflow.py:59: error: Incompatible types in assignment (expression has type "Type[floating[Any]]", variable has type "str") python/pyspark/pandas/data_type_ops/categorical_ops.py:43: error: Item "dtype[Any]" of "Union[dtype[Any], Any]" has no attribute "categories" python/pyspark/pandas/data_type_ops/categorical_ops.py:43: error: Item "dtype[Any]" of "Union[dtype[Any], Any]" has no attribute "ordered" python/pyspark/pandas/data_type_ops/categorical_ops.py:56: error: Item "dtype[Any]" of "Union[dtype[Any], Any]" has no attribute "categories" python/pyspark/pandas/tests/test_typedef.py:70: error: Name "np.float" is not defined 
python/pyspark/pandas/tests/test_typedef.py:77: error: Name "np.float" is not defined python/pyspark/pandas/tests/test_typedef.py:85: error: Name "np.float" is not defined python/pyspark/pandas/tests/test_typedef.py:100: error: Name "np.float" is not defined python/pyspark/pandas/tests/test_typedef.py:108: error: Name "np.float" is not defined python/pyspark/mllib/clustering.pyi:152: error: Incompatible types in assignment (expression has type "ndarray[Any, Any]", base class "KMeansModel" defined the type as "List[ndarray[Any, Any]]") python/pyspark/mllib/classification.pyi:93: error: Signature of "predict" incompatible with supertype "LinearClassificationModel" Found 32 errors in 15 files (checked 315 source files) 1 ``` Python 3.6 is deprecated as of SPARK-35938. There is no user-facing change: static analysis results may differ because of some updated type hints, but the changes are non-breaking. I manually checked this with a GitHub Actions build in a forked repository. Closes #33356 from HyukjinKwon/SPARK-36146. Authored-by: Hyukjin Kwon <gurwls...@apache.org> Signed-off-by: Dongjoon Hyun <dongj...@apache.org> (cherry picked from commit a71dd6af2f4fb82c1b7ef978be27ecb4e86429cd) Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> --- .github/workflows/build_and_test.yml | 11 +++-------- python/pyspark/ml/linalg/__init__.pyi | 2 +- python/pyspark/mllib/_typing.pyi | 2 +- python/pyspark/mllib/classification.pyi | 2 +- python/pyspark/mllib/clustering.pyi | 6 +++--- python/pyspark/pandas/categorical.py | 6 +++--- .../pandas/data_type_ops/categorical_ops.py | 16 ++++++++++------ python/pyspark/pandas/frame.py | 10 +++++++--- python/pyspark/pandas/indexes/category.py | 8 ++++---- python/pyspark/pandas/indexing.py | 4 ++-- python/pyspark/pandas/ml.py | 2 +- python/pyspark/pandas/mlflow.py | 8 +++++--- python/pyspark/pandas/namespace.py | 6 +++--- python/pyspark/pandas/series.py | 5 +++-- python/pyspark/pandas/tests/test_typedef.py | 22 +++++++++++----------- python/pyspark/pandas/typedef/typehints.py | 6 +++--- 
.../pyspark/sql/pandas/_typing/protocols/frame.pyi | 2 +- 17 files changed, 62 insertions(+), 56 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 74fc6ba..49e105d 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -354,18 +354,13 @@ jobs: key: docs-maven-${{ hashFiles('**/pom.xml') }} restore-keys: | docs-maven- - - name: Install Python 3.6 - uses: actions/setup-python@v2 - with: - python-version: 3.6 - architecture: x64 - name: Install Python linter dependencies run: | # TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes. # See also https://github.com/sphinx-doc/sphinx/issues/7551. # Jinja2 3.0.0+ causes error when building with Sphinx. # See also https://issues.apache.org/jira/browse/SPARK-35375. - python3.6 -m pip install flake8 pydata_sphinx_theme 'mypy==0.910' numpydoc 'jinja2<3.0.0' 'black==21.5b2' + python3.9 -m pip install flake8 pydata_sphinx_theme 'mypy==0.910' numpydoc 'jinja2<3.0.0' 'black==21.5b2' - name: Install R linter dependencies and SparkR run: | apt-get install -y libcurl4-openssl-dev libgit2-dev libssl-dev libxml2-dev @@ -384,8 +379,8 @@ jobs: # See also https://github.com/sphinx-doc/sphinx/issues/7551. # Jinja2 3.0.0+ causes error when building with Sphinx. # See also https://issues.apache.org/jira/browse/SPARK-35375. 
- python3.6 -m pip install 'sphinx<3.1.0' mkdocs numpy pydata_sphinx_theme ipython nbsphinx numpydoc 'jinja2<3.0.0' - python3.6 -m pip install sphinx_plotly_directive 'pyarrow<5.0.0' pandas 'plotly>=4.8' + python3.9 -m pip install 'sphinx<3.1.0' mkdocs numpy pydata_sphinx_theme ipython nbsphinx numpydoc 'jinja2<3.0.0' + python3.9 -m pip install sphinx_plotly_directive 'pyarrow<5.0.0' pandas 'plotly>=4.8' apt-get update -y apt-get install -y ruby ruby-dev Rscript -e "install.packages(c('devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2'), repos='https://cloud.r-project.org/')" diff --git a/python/pyspark/ml/linalg/__init__.pyi b/python/pyspark/ml/linalg/__init__.pyi index b4fba88..a302825 100644 --- a/python/pyspark/ml/linalg/__init__.pyi +++ b/python/pyspark/ml/linalg/__init__.pyi @@ -164,7 +164,7 @@ class Matrix: def __init__( self, numRows: int, numCols: int, isTransposed: bool = ... ) -> None: ... - def toArray(self) -> NoReturn: ... + def toArray(self) -> ndarray: ... class DenseMatrix(Matrix): values: Any diff --git a/python/pyspark/mllib/_typing.pyi b/python/pyspark/mllib/_typing.pyi index 213a699..22469e8 100644 --- a/python/pyspark/mllib/_typing.pyi +++ b/python/pyspark/mllib/_typing.pyi @@ -20,4 +20,4 @@ from typing import List, Tuple, Union from pyspark.mllib.linalg import Vector from numpy import ndarray # noqa: F401 -VectorLike = Union[Vector, List[float], Tuple[float, ...]] +VectorLike = Union[ndarray, Vector, List[float], Tuple[float, ...]] diff --git a/python/pyspark/mllib/classification.pyi b/python/pyspark/mllib/classification.pyi index 967b0a9..89229e2 100644 --- a/python/pyspark/mllib/classification.pyi +++ b/python/pyspark/mllib/classification.pyi @@ -90,7 +90,7 @@ class LogisticRegressionWithLBFGS: class SVMModel(LinearClassificationModel): def __init__(self, weights: Vector, intercept: float) -> None: ... - @overload + @overload # type: ignore def predict(self, x: VectorLike) -> float64: ... 
@overload def predict(self, x: RDD[VectorLike]) -> RDD[float64]: ... diff --git a/python/pyspark/mllib/clustering.pyi b/python/pyspark/mllib/clustering.pyi index b4f3496..7b3673a 100644 --- a/python/pyspark/mllib/clustering.pyi +++ b/python/pyspark/mllib/clustering.pyi @@ -35,7 +35,7 @@ from pyspark.streaming.dstream import DStream T = TypeVar("T") class BisectingKMeansModel(JavaModelWrapper): - centers: List[ndarray] + centers: List[VectorLike] def __init__(self, java_model: JavaObject) -> None: ... @property def clusterCenters(self) -> List[ndarray]: ... @@ -62,7 +62,7 @@ class BisectingKMeans: ) -> BisectingKMeansModel: ... class KMeansModel(Saveable, Loader[KMeansModel]): - centers: List[ndarray] + centers: List[VectorLike] def __init__(self, centers: List[VectorLike]) -> None: ... @property def clusterCenters(self) -> List[ndarray]: ... @@ -149,7 +149,7 @@ class StreamingKMeansModel(KMeansModel): ) -> None: ... @property def clusterWeights(self) -> List[float64]: ... - centers: ndarray + centers: List[VectorLike] def update( self, data: RDD[VectorLike], decayFactor: float, timeUnit: str ) -> StreamingKMeansModel: ... diff --git a/python/pyspark/pandas/categorical.py b/python/pyspark/pandas/categorical.py index 230b45b..6b772e2 100644 --- a/python/pyspark/pandas/categorical.py +++ b/python/pyspark/pandas/categorical.py @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, cast import pandas as pd from pandas.api.types import CategoricalDtype @@ -79,7 +79,7 @@ class CategoricalAccessor(object): >>> s.cat.categories Index(['a', 'b', 'c'], dtype='object') """ - return self._data.dtype.categories + return cast(CategoricalDtype, self._data.dtype).categories @categories.setter def categories(self, categories: pd.Index) -> None: @@ -106,7 +106,7 @@ class CategoricalAccessor(object): >>> s.cat.ordered False """ - return self._data.dtype.ordered + return cast(CategoricalDtype, self._data.dtype).ordered @property def codes(self) -> "ps.Series": diff --git a/python/pyspark/pandas/data_type_ops/categorical_ops.py b/python/pyspark/pandas/data_type_ops/categorical_ops.py index fb5666d..b0f32cb 100644 --- a/python/pyspark/pandas/data_type_ops/categorical_ops.py +++ b/python/pyspark/pandas/data_type_ops/categorical_ops.py @@ -16,7 +16,7 @@ # from itertools import chain -from typing import Any, Union +from typing import Any, Union, cast import pandas as pd from pandas.api.types import CategoricalDtype @@ -41,8 +41,12 @@ class CategoricalOps(DataTypeOps): def restore(self, col: pd.Series) -> pd.Series: """Restore column when to_pandas.""" - return pd.Categorical.from_codes( - col, categories=self.dtype.categories, ordered=self.dtype.ordered + return pd.Series( + pd.Categorical.from_codes( + col, + categories=cast(CategoricalDtype, self.dtype).categories, + ordered=cast(CategoricalDtype, self.dtype).ordered, + ) ) def prepare(self, col: pd.Series) -> pd.Series: @@ -52,10 +56,10 @@ class CategoricalOps(DataTypeOps): def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike: dtype, _ = pandas_on_spark_type(dtype) - if isinstance(dtype, CategoricalDtype) and dtype.categories is None: + if isinstance(dtype, CategoricalDtype) and cast(CategoricalDtype, dtype).categories is None: return index_ops.copy() - categories = index_ops.dtype.categories + 
categories = cast(CategoricalDtype, index_ops.dtype).categories if len(categories) == 0: scol = SF.lit(None) else: @@ -84,7 +88,7 @@ class CategoricalOps(DataTypeOps): def _non_equality_comparison_input_check(left: IndexOpsLike, right: Any) -> None: - if not left.dtype.ordered: + if not cast(CategoricalDtype, left.dtype).ordered: raise TypeError("Unordered Categoricals can only compare equality or not.") if isinstance(right, IndexOpsMixin) and isinstance(right.dtype, CategoricalDtype): if hash(left.dtype) != hash(right.dtype): diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py index 6189e17..deb1484 100644 --- a/python/pyspark/pandas/frame.py +++ b/python/pyspark/pandas/frame.py @@ -4966,8 +4966,8 @@ defaultdict(<class 'list'>, {'col..., 'col...})] @staticmethod def from_records( - data: Union[np.array, List[tuple], dict, pd.DataFrame], - index: Union[str, list, np.array] = None, + data: Union[np.ndarray, List[tuple], dict, pd.DataFrame], + index: Union[str, list, np.ndarray] = None, exclude: list = None, columns: list = None, coerce_float: bool = False, @@ -7399,7 +7399,11 @@ defaultdict(<class 'list'>, {'col..., 'col...})] SF.lit(False).alias(self._internal.data_spark_column_names[i]) ) elif is_list_like(values): - values = values.tolist() if isinstance(values, np.ndarray) else list(values) + values = ( + cast(np.ndarray, values).tolist() + if isinstance(values, np.ndarray) + else list(values) + ) data_spark_columns += [ self._internal.spark_column_for(label) .isin(values) diff --git a/python/pyspark/pandas/indexes/category.py b/python/pyspark/pandas/indexes/category.py index 9976da0..0ece940 100644 --- a/python/pyspark/pandas/indexes/category.py +++ b/python/pyspark/pandas/indexes/category.py @@ -15,10 +15,10 @@ # limitations under the License. 
# from functools import partial -from typing import Any, no_type_check +from typing import Any, no_type_check, cast import pandas as pd -from pandas.api.types import is_hashable +from pandas.api.types import is_hashable, CategoricalDtype from pyspark import pandas as ps from pyspark.pandas.indexes.base import Index @@ -156,7 +156,7 @@ class CategoricalIndex(Index): >>> idx.categories Index(['a', 'b', 'c'], dtype='object') """ - return self.dtype.categories + return cast(CategoricalDtype, self.dtype).categories @categories.setter def categories(self, categories: pd.Index) -> None: @@ -177,7 +177,7 @@ class CategoricalIndex(Index): >>> idx.ordered False """ - return self.dtype.ordered + return cast(CategoricalDtype, self.dtype).ordered def __getattr__(self, item: str) -> Any: if hasattr(MissingPandasLikeCategoricalIndex, item): diff --git a/python/pyspark/pandas/indexing.py b/python/pyspark/pandas/indexing.py index f374a6f..d36c453 100644 --- a/python/pyspark/pandas/indexing.py +++ b/python/pyspark/pandas/indexing.py @@ -1646,14 +1646,14 @@ class iLocIndexer(LocIndexerLike): ) -> Tuple[Optional[Column], Optional[int], Optional[int]]: sdf = self._internal.spark_frame - if any(isinstance(key, (int, np.int, np.int64, np.int32)) and key < 0 for key in rows_sel): + if any(isinstance(key, (int, np.int64, np.int32)) and key < 0 for key in rows_sel): offset = sdf.count() else: offset = 0 new_rows_sel = [] for key in list(rows_sel): - if not isinstance(key, (int, np.int, np.int64, np.int32)): + if not isinstance(key, (int, np.int64, np.int32)): raise TypeError( "cannot do positional indexing with these indexers [{}] of {}".format( key, type(key) diff --git a/python/pyspark/pandas/ml.py b/python/pyspark/pandas/ml.py index f0554dd..a4f1d7a 100644 --- a/python/pyspark/pandas/ml.py +++ b/python/pyspark/pandas/ml.py @@ -78,7 +78,7 @@ def to_numeric_df(psdf: "ps.DataFrame") -> Tuple[pyspark.sql.DataFrame, List[Lab """ # TODO, it should be more robust. 
accepted_types = { - np.dtype(dt) + np.dtype(dt) # type: ignore for dt in [np.int8, np.int16, np.int32, np.int64, np.float32, np.float64, np.bool_] } numeric_column_labels = [ diff --git a/python/pyspark/pandas/mlflow.py b/python/pyspark/pandas/mlflow.py index 6178bac..719db40 100644 --- a/python/pyspark/pandas/mlflow.py +++ b/python/pyspark/pandas/mlflow.py @@ -25,7 +25,7 @@ import pandas as pd import numpy as np from typing import Any -from pyspark.pandas._typing import Label # noqa: F401 (SPARK-34943) +from pyspark.pandas._typing import Label, Dtype # noqa: F401 (SPARK-34943) from pyspark.pandas.utils import lazy_property, default_session from pyspark.pandas.frame import DataFrame from pyspark.pandas.series import Series, first_series @@ -42,7 +42,7 @@ class PythonModelWrapper(object): """ - def __init__(self, model_uri: str, return_type_hint: str): + def __init__(self, model_uri: str, return_type_hint: Union[str, type, Dtype]): self._model_uri = model_uri self._return_type_hint = return_type_hint @@ -109,7 +109,9 @@ class PythonModelWrapper(object): raise ValueError("unknown data type: {}".format(type(data).__name__)) -def load_model(model_uri: str, predict_type: str = "infer") -> PythonModelWrapper: +def load_model( + model_uri: str, predict_type: Union[str, type, Dtype] = "infer" +) -> PythonModelWrapper: """ Loads an MLflow model into an wrapper that can be used both for pandas and pandas-on-Spark DataFrame. 
diff --git a/python/pyspark/pandas/namespace.py b/python/pyspark/pandas/namespace.py index 7089486..a46926d 100644 --- a/python/pyspark/pandas/namespace.py +++ b/python/pyspark/pandas/namespace.py @@ -2020,11 +2020,11 @@ def get_dummies( if drop_first: values = values[1:] - def column_name(value: str) -> str: + def column_name(v: Any) -> Name: if prefix is None or cast(List[str], prefix)[i] == "": - return value + return v else: - return "{}{}{}".format(cast(List[str], prefix)[i], prefix_sep, value) + return "{}{}{}".format(cast(List[str], prefix)[i], prefix_sep, v) for value in values: remaining_columns.append( diff --git a/python/pyspark/pandas/series.py b/python/pyspark/pandas/series.py index f3a1198..bb1bc5c 100644 --- a/python/pyspark/pandas/series.py +++ b/python/pyspark/pandas/series.py @@ -1027,8 +1027,9 @@ class Series(Frame, IndexOpsMixin, Generic[T]): current = current.when(self.spark.column == SF.lit(to_replace), value) if hasattr(arg, "__missing__"): - tmp_val = arg[np._NoValue] - del arg[np._NoValue] # Remove in case it's set in defaultdict. + tmp_val = arg[np._NoValue] # type: ignore + # Remove in case it's set in defaultdict. 
+ del arg[np._NoValue] # type: ignore current = current.otherwise(SF.lit(tmp_val)) else: current = current.otherwise(SF.lit(None).cast(self.spark.data_type)) diff --git a/python/pyspark/pandas/tests/test_typedef.py b/python/pyspark/pandas/tests/test_typedef.py index 795ffe7..6b644f4 100644 --- a/python/pyspark/pandas/tests/test_typedef.py +++ b/python/pyspark/pandas/tests/test_typedef.py @@ -67,14 +67,14 @@ class TypeHintTests(unittest.TestCase): self.assertEqual(inferred.dtype, np.int64) self.assertEqual(inferred.spark_type, LongType()) - def func() -> pd.Series[np.float]: + def func() -> pd.Series[float]: pass inferred = infer_return_type(func) self.assertEqual(inferred.dtype, np.float64) self.assertEqual(inferred.spark_type, DoubleType()) - def func() -> "pd.DataFrame[np.float, str]": + def func() -> "pd.DataFrame[np.float_, str]": pass expected = StructType([StructField("c0", DoubleType()), StructField("c1", StringType())]) @@ -82,7 +82,7 @@ class TypeHintTests(unittest.TestCase): self.assertEqual(inferred.dtypes, [np.float64, np.unicode_]) self.assertEqual(inferred.spark_type, expected) - def func() -> "pandas.DataFrame[np.float]": + def func() -> "pandas.DataFrame[float]": pass expected = StructType([StructField("c0", DoubleType())]) @@ -97,7 +97,7 @@ class TypeHintTests(unittest.TestCase): self.assertEqual(inferred.dtype, np.int64) self.assertEqual(inferred.spark_type, LongType()) - def func() -> pd.DataFrame[np.float, str]: + def func() -> pd.DataFrame[np.float64, str]: pass expected = StructType([StructField("c0", DoubleType()), StructField("c1", StringType())]) @@ -105,7 +105,7 @@ class TypeHintTests(unittest.TestCase): self.assertEqual(inferred.dtypes, [np.float64, np.unicode_]) self.assertEqual(inferred.spark_type, expected) - def func() -> pd.DataFrame[np.float]: + def func() -> pd.DataFrame[np.float_]: pass expected = StructType([StructField("c0", DoubleType())]) @@ -152,7 +152,7 @@ class TypeHintTests(unittest.TestCase): "Type inference from pandas 
instances is supported with Python 3.7+", ) def test_infer_schema_with_names_pandas_instances(self): - def func() -> 'pd.DataFrame["a" : np.float, "b":str]': # noqa: F405 + def func() -> 'pd.DataFrame["a" : np.float_, "b":str]': # noqa: F405 pass expected = StructType([StructField("a", DoubleType()), StructField("b", StringType())]) @@ -160,7 +160,7 @@ class TypeHintTests(unittest.TestCase): self.assertEqual(inferred.dtypes, [np.float64, np.unicode_]) self.assertEqual(inferred.spark_type, expected) - def func() -> "pd.DataFrame['a': np.float, 'b': int]": # noqa: F405 + def func() -> "pd.DataFrame['a': float, 'b': int]": # noqa: F405 pass expected = StructType([StructField("a", DoubleType()), StructField("b", LongType())]) @@ -206,7 +206,7 @@ class TypeHintTests(unittest.TestCase): ) def test_infer_schema_with_names_pandas_instances_negative(self): def try_infer_return_type(): - def f() -> 'pd.DataFrame["a" : np.float : 1, "b":str:2]': # noqa: F405 + def f() -> 'pd.DataFrame["a" : np.float_ : 1, "b":str:2]': # noqa: F405 pass infer_return_type(f) @@ -225,7 +225,7 @@ class TypeHintTests(unittest.TestCase): self.assertRaisesRegex(TypeError, "not understood", try_infer_return_type) def try_infer_return_type(): - def f() -> 'pd.DataFrame["a" : np.float : 1, "b":str:2]': # noqa: F405 + def f() -> 'pd.DataFrame["a" : float : 1, "b":str:2]': # noqa: F405 pass infer_return_type(f) @@ -253,7 +253,7 @@ class TypeHintTests(unittest.TestCase): def test_infer_schema_with_names_negative(self): def try_infer_return_type(): - def f() -> 'ps.DataFrame["a" : np.float : 1, "b":str:2]': # noqa: F405 + def f() -> 'ps.DataFrame["a" : float : 1, "b":str:2]': # noqa: F405 pass infer_return_type(f) @@ -272,7 +272,7 @@ class TypeHintTests(unittest.TestCase): self.assertRaisesRegex(TypeError, "not understood", try_infer_return_type) def try_infer_return_type(): - def f() -> 'ps.DataFrame["a" : np.float : 1, "b":str:2]': # noqa: F405 + def f() -> 'ps.DataFrame["a" : np.float_ : 1, "b":str:2]': 
# noqa: F405 pass infer_return_type(f) diff --git a/python/pyspark/pandas/typedef/typehints.py b/python/pyspark/pandas/typedef/typehints.py index 00415dc..1321313 100644 --- a/python/pyspark/pandas/typedef/typehints.py +++ b/python/pyspark/pandas/typedef/typehints.py @@ -160,7 +160,7 @@ def as_spark_type(tpe: Union[str, type, Dtype], *, raise_error: bool = True) -> elif tpe in (bytes, np.character, np.bytes_, np.string_): return types.BinaryType() # BooleanType - elif tpe in (bool, np.bool, "bool", "?"): + elif tpe in (bool, np.bool_, "bool", "?"): return types.BooleanType() # DateType elif tpe in (datetime.date,): @@ -171,13 +171,13 @@ def as_spark_type(tpe: Union[str, type, Dtype], *, raise_error: bool = True) -> elif tpe in (decimal.Decimal,): # TODO: considering about the precision & scale for decimal type. return types.DecimalType(38, 18) - elif tpe in (float, np.float, np.float64, "float", "float64", "double"): + elif tpe in (float, np.float_, np.float64, "float", "float64", "double"): return types.DoubleType() elif tpe in (np.float32, "float32", "f"): return types.FloatType() elif tpe in (np.int32, "int32", "i"): return types.IntegerType() - elif tpe in (int, np.int, np.int64, "int", "int64", "long"): + elif tpe in (int, np.int64, "int", "int64", "long"): return types.LongType() elif tpe in (np.int16, "int16", "short"): return types.ShortType() diff --git a/python/pyspark/sql/pandas/_typing/protocols/frame.pyi b/python/pyspark/sql/pandas/_typing/protocols/frame.pyi index 9148e7a..e219c1c 100644 --- a/python/pyspark/sql/pandas/_typing/protocols/frame.pyi +++ b/python/pyspark/sql/pandas/_typing/protocols/frame.pyi @@ -22,7 +22,7 @@ # - Add Protocol as a base class # - Replace imports with Any -import numpy.ma as np # type: ignore[import] +import numpy as np # type: ignore[import] from typing import Any, Hashable, IO, Iterable, List, Optional, Sequence, Tuple, Union from typing_extensions import Protocol from .series import SeriesLike 
--------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org