This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 5db51ef [SPARK-35721][PYTHON] Path level discover for python unittests 5db51ef is described below commit 5db51efa1a0d98ad74a94f7b73bcb1161817e0a5 Author: Yikun Jiang <yikunk...@gmail.com> AuthorDate: Tue Jun 29 17:56:13 2021 +0900 [SPARK-35721][PYTHON] Path level discover for python unittests ### What changes were proposed in this pull request? Add path-level discovery for python unittests. ### Why are the changes needed? Currently we need to specify the python test cases manually when we add a new testcase. Sometimes we forget to add the testcase to the module list, so the testcase is not executed. Such as: - pyspark-core pyspark.tests.test_pin_thread Thus we need some auto-discovery mechanism to find all testcases rather than specifying every case manually. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Add the below code at the end of `dev/sparktestsupport/modules.py` ```python for m in sorted(all_modules): for g in sorted(m.python_test_goals): print(m.name, g) ``` Compare the result before and after: https://www.diffchecker.com/iO3FvhKL Closes #32867 from Yikun/SPARK_DISCOVER_TEST. 
Authored-by: Yikun Jiang <yikunk...@gmail.com> Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> --- dev/sparktestsupport/modules.py | 212 +++++++-------------- python/pyspark/pandas/tests/indexes/test_base.py | 5 + .../pyspark/pandas/tests/indexes/test_datetime.py | 5 + python/pyspark/pandas/tests/test_dataframe.py | 5 + python/pyspark/pandas/tests/test_groupby.py | 5 + python/pyspark/pandas/tests/test_indexing.py | 5 + .../pandas/tests/test_ops_on_diff_frames.py | 5 + .../tests/test_ops_on_diff_frames_groupby.py | 5 + python/pyspark/pandas/tests/test_series.py | 5 + python/pyspark/pandas/tests/test_stats.py | 5 + 10 files changed, 117 insertions(+), 140 deletions(-) diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index 2ba3390..80cd3a4 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -15,14 +15,72 @@ # limitations under the License. # +from collections.abc import Iterable from functools import total_ordering import itertools import os import re +import unittest +import sys + +from sparktestsupport import SPARK_HOME all_modules = [] +def _get_module_from_name(name): + __import__(name) + return sys.modules[name] + + +def _discover_python_unittests(*paths, discover_slow=False): + """Discover the python module which contains unittests under paths. + + Such as: + ['pyspark/tests'], it will return the set of module name under the path of pyspark/tests, like + {'pyspark.tests.test_appsubmit', 'pyspark.tests.test_broadcast', ...} + + Parameters + ---------- + paths : str + Paths of modules to be discovered. 
+ discover_slow : bool + If True, will only discover slow tests + If False, will discover all tests except slow tests + + Returns + ------- + A set of complete test module name discovered under specified paths + """ + + def add_test_module(testcases, modules, slow): + """Append the testcases module names to modules set""" + if isinstance(testcases, Iterable): + for test_case in testcases: + add_test_module(test_case, modules, slow) + else: + name = testcases.__module__ + module = _get_module_from_name(name) + if slow and hasattr(module, 'is_slow_test'): + modules.add(name) + if not slow and not hasattr(module, 'is_slow_test'): + modules.add(name) + + if not paths: + return [] + modules = set() + pyspark_path = os.path.join(SPARK_HOME, "python") + for path in paths: + # Discover the unittest in every path + testcases = unittest.defaultTestLoader.discover( + os.path.join(pyspark_path, path), + top_level_dir=pyspark_path + ) + add_test_module(testcases, modules, discover_slow) + + return sorted(list(modules)) + + @total_ordering class Module(object): """ @@ -388,24 +446,7 @@ pyspark_core = Module( "pyspark.profiler", "pyspark.shuffle", "pyspark.util", - # unittests - "pyspark.tests.test_appsubmit", - "pyspark.tests.test_broadcast", - "pyspark.tests.test_conf", - "pyspark.tests.test_context", - "pyspark.tests.test_daemon", - "pyspark.tests.test_install_spark", - "pyspark.tests.test_join", - "pyspark.tests.test_profiler", - "pyspark.tests.test_rdd", - "pyspark.tests.test_rddbarrier", - "pyspark.tests.test_readwrite", - "pyspark.tests.test_serializers", - "pyspark.tests.test_shuffle", - "pyspark.tests.test_taskcontext", - "pyspark.tests.test_util", - "pyspark.tests.test_worker", - ] + ] + _discover_python_unittests("pyspark/tests"), ) pyspark_sql = Module( @@ -437,32 +478,7 @@ pyspark_sql = Module( "pyspark.sql.pandas.serializers", "pyspark.sql.pandas.typehints", "pyspark.sql.pandas.utils", - # unittests - "pyspark.sql.tests.test_arrow", - 
"pyspark.sql.tests.test_catalog", - "pyspark.sql.tests.test_column", - "pyspark.sql.tests.test_conf", - "pyspark.sql.tests.test_context", - "pyspark.sql.tests.test_dataframe", - "pyspark.sql.tests.test_datasources", - "pyspark.sql.tests.test_functions", - "pyspark.sql.tests.test_group", - "pyspark.sql.tests.test_pandas_cogrouped_map", - "pyspark.sql.tests.test_pandas_grouped_map", - "pyspark.sql.tests.test_pandas_map", - "pyspark.sql.tests.test_pandas_udf", - "pyspark.sql.tests.test_pandas_udf_grouped_agg", - "pyspark.sql.tests.test_pandas_udf_scalar", - "pyspark.sql.tests.test_pandas_udf_typehints", - "pyspark.sql.tests.test_pandas_udf_window", - "pyspark.sql.tests.test_readwriter", - "pyspark.sql.tests.test_serde", - "pyspark.sql.tests.test_session", - "pyspark.sql.tests.test_streaming", - "pyspark.sql.tests.test_types", - "pyspark.sql.tests.test_udf", - "pyspark.sql.tests.test_utils", - ] + ] + _discover_python_unittests("pyspark/sql/tests"), ) @@ -474,10 +490,7 @@ pyspark_resource = Module( source_file_regexes=[ "python/pyspark/resource" ], - python_test_goals=[ - # unittests - "pyspark.resource.tests.test_resources", - ] + python_test_goals=_discover_python_unittests("pyspark/resource/tests"), ) @@ -494,12 +507,7 @@ pyspark_streaming = Module( python_test_goals=[ # doctests "pyspark.streaming.util", - # unittests - "pyspark.streaming.tests.test_context", - "pyspark.streaming.tests.test_dstream", - "pyspark.streaming.tests.test_kinesis", - "pyspark.streaming.tests.test_listener", - ] + ] + _discover_python_unittests("pyspark/streaming/tests"), ) @@ -525,17 +533,10 @@ pyspark_mllib = Module( "pyspark.mllib.stat.KernelDensity", "pyspark.mllib.tree", "pyspark.mllib.util", - # unittests - "pyspark.mllib.tests.test_algorithms", - "pyspark.mllib.tests.test_feature", - "pyspark.mllib.tests.test_linalg", - "pyspark.mllib.tests.test_stat", - "pyspark.mllib.tests.test_streaming_algorithms", - "pyspark.mllib.tests.test_util", - ], + ] + 
_discover_python_unittests("pyspark/mllib/tests"), excluded_python_implementations=[ "PyPy" # Skip these tests under PyPy since they require numpy and it isn't available there - ] + ], ) @@ -559,27 +560,13 @@ pyspark_ml = Module( "pyspark.ml.regression", "pyspark.ml.stat", "pyspark.ml.tuning", - # unittests - "pyspark.ml.tests.test_algorithms", - "pyspark.ml.tests.test_base", - "pyspark.ml.tests.test_evaluation", - "pyspark.ml.tests.test_feature", - "pyspark.ml.tests.test_image", - "pyspark.ml.tests.test_linalg", - "pyspark.ml.tests.test_param", - "pyspark.ml.tests.test_persistence", - "pyspark.ml.tests.test_pipeline", - "pyspark.ml.tests.test_stat", - "pyspark.ml.tests.test_training_summary", - "pyspark.ml.tests.test_tuning", - "pyspark.ml.tests.test_util", - "pyspark.ml.tests.test_wrapper", - ], + ] + _discover_python_unittests("pyspark/ml/tests"), excluded_python_implementations=[ "PyPy" # Skip these tests under PyPy since they require numpy and it isn't available there - ] + ], ) + pyspark_pandas = Module( name="pyspark-pandas", dependencies=[pyspark_core, pyspark_sql], @@ -614,59 +601,14 @@ pyspark_pandas = Module( "pyspark.pandas.spark.accessors", "pyspark.pandas.spark.utils", "pyspark.pandas.typedef.typehints", - # unittests - "pyspark.pandas.tests.data_type_ops.test_base", - "pyspark.pandas.tests.data_type_ops.test_binary_ops", - "pyspark.pandas.tests.data_type_ops.test_boolean_ops", - "pyspark.pandas.tests.data_type_ops.test_categorical_ops", - "pyspark.pandas.tests.data_type_ops.test_complex_ops", - "pyspark.pandas.tests.data_type_ops.test_date_ops", - "pyspark.pandas.tests.data_type_ops.test_datetime_ops", - "pyspark.pandas.tests.data_type_ops.test_decimal_ops", - "pyspark.pandas.tests.data_type_ops.test_null_ops", - "pyspark.pandas.tests.data_type_ops.test_num_ops", - "pyspark.pandas.tests.data_type_ops.test_string_ops", - "pyspark.pandas.tests.data_type_ops.test_udt_ops", - "pyspark.pandas.tests.indexes.test_category", - 
"pyspark.pandas.tests.plot.test_frame_plot", - "pyspark.pandas.tests.plot.test_frame_plot_matplotlib", - "pyspark.pandas.tests.plot.test_frame_plot_plotly", - "pyspark.pandas.tests.plot.test_series_plot", - "pyspark.pandas.tests.plot.test_series_plot_matplotlib", - "pyspark.pandas.tests.plot.test_series_plot_plotly", - "pyspark.pandas.tests.test_categorical", - "pyspark.pandas.tests.test_config", - "pyspark.pandas.tests.test_csv", - "pyspark.pandas.tests.test_dataframe_conversion", - "pyspark.pandas.tests.test_dataframe_spark_io", - "pyspark.pandas.tests.test_default_index", - "pyspark.pandas.tests.test_expanding", - "pyspark.pandas.tests.test_extension", - "pyspark.pandas.tests.test_frame_spark", - "pyspark.pandas.tests.test_indexops_spark", - "pyspark.pandas.tests.test_internal", - "pyspark.pandas.tests.test_namespace", - "pyspark.pandas.tests.test_numpy_compat", - "pyspark.pandas.tests.test_ops_on_diff_frames_groupby_expanding", - "pyspark.pandas.tests.test_ops_on_diff_frames_groupby_rolling", - "pyspark.pandas.tests.test_repr", - "pyspark.pandas.tests.test_reshape", - "pyspark.pandas.tests.test_rolling", - "pyspark.pandas.tests.test_series_conversion", - "pyspark.pandas.tests.test_series_datetime", - "pyspark.pandas.tests.test_series_string", - "pyspark.pandas.tests.test_spark_functions", - "pyspark.pandas.tests.test_sql", - "pyspark.pandas.tests.test_typedef", - "pyspark.pandas.tests.test_utils", - "pyspark.pandas.tests.test_window", - ], + ] + _discover_python_unittests("pyspark/pandas/tests"), excluded_python_implementations=[ "PyPy" # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and - # they aren't available there - ] + # they aren't available there + ], ) + pyspark_pandas_slow = Module( name="pyspark-pandas-slow", dependencies=[pyspark_core, pyspark_sql], @@ -678,17 +620,7 @@ pyspark_pandas_slow = Module( "pyspark.pandas.frame", "pyspark.pandas.generic", "pyspark.pandas.series", - # unittests - 
"pyspark.pandas.tests.indexes.test_base", - "pyspark.pandas.tests.indexes.test_datetime", - "pyspark.pandas.tests.test_dataframe", - "pyspark.pandas.tests.test_groupby", - "pyspark.pandas.tests.test_indexing", - "pyspark.pandas.tests.test_ops_on_diff_frames", - "pyspark.pandas.tests.test_ops_on_diff_frames_groupby", - "pyspark.pandas.tests.test_series", - "pyspark.pandas.tests.test_stats", - ], + ] + _discover_python_unittests("pyspark/pandas/tests", discover_slow=True), excluded_python_implementations=[ "PyPy" # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and # they aren't available there diff --git a/python/pyspark/pandas/tests/indexes/test_base.py b/python/pyspark/pandas/tests/indexes/test_base.py index 3d35cfc..2faad5e 100644 --- a/python/pyspark/pandas/tests/indexes/test_base.py +++ b/python/pyspark/pandas/tests/indexes/test_base.py @@ -34,6 +34,11 @@ from pyspark.pandas.missing.indexes import ( from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils, SPARK_CONF_ARROW_ENABLED +# This is used in run-tests.py to discover the slow test. See more in the doc of +# _discover_python_unittests of dev/sparktestsupport/modules.py +is_slow_test = True + + class IndexesTest(PandasOnSparkTestCase, TestUtils): @property def pdf(self): diff --git a/python/pyspark/pandas/tests/indexes/test_datetime.py b/python/pyspark/pandas/tests/indexes/test_datetime.py index 8a55e2e..7934012 100644 --- a/python/pyspark/pandas/tests/indexes/test_datetime.py +++ b/python/pyspark/pandas/tests/indexes/test_datetime.py @@ -25,6 +25,11 @@ import pyspark.pandas as ps from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils +# This is used in run-tests.py to discover the slow test. 
See more in the doc of +# _discover_python_unittests of dev/sparktestsupport/modules.py +is_slow_test = True + + class DatetimeIndexTest(PandasOnSparkTestCase, TestUtils): @property def fixed_freqs(self): diff --git a/python/pyspark/pandas/tests/test_dataframe.py b/python/pyspark/pandas/tests/test_dataframe.py index e54b783..858863c 100644 --- a/python/pyspark/pandas/tests/test_dataframe.py +++ b/python/pyspark/pandas/tests/test_dataframe.py @@ -50,6 +50,11 @@ from pyspark.testing.sqlutils import SQLTestUtils from pyspark.pandas.utils import name_like_string +# This is used in run-tests.py to discover the slow test. See more in the doc of +# _discover_python_unittests of dev/sparktestsupport/modules.py +is_slow_test = True + + class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): @property def pdf(self): diff --git a/python/pyspark/pandas/tests/test_groupby.py b/python/pyspark/pandas/tests/test_groupby.py index 1bc182d..5ec93ca 100644 --- a/python/pyspark/pandas/tests/test_groupby.py +++ b/python/pyspark/pandas/tests/test_groupby.py @@ -34,6 +34,11 @@ from pyspark.pandas.groupby import is_multi_agg_with_relabel from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils +# This is used in run-tests.py to discover the slow test. See more in the doc of +# _discover_python_unittests of dev/sparktestsupport/modules.py +is_slow_test = True + + class GroupByTest(PandasOnSparkTestCase, TestUtils): def test_groupby_simple(self): pdf = pd.DataFrame( diff --git a/python/pyspark/pandas/tests/test_indexing.py b/python/pyspark/pandas/tests/test_indexing.py index b74cf90..056d404 100644 --- a/python/pyspark/pandas/tests/test_indexing.py +++ b/python/pyspark/pandas/tests/test_indexing.py @@ -27,6 +27,11 @@ from pyspark.pandas.exceptions import SparkPandasIndexingError from pyspark.testing.pandasutils import ComparisonTestBase, PandasOnSparkTestCase, compare_both +# This is used in run-tests.py to discover the slow test. 
See more in the doc of +# _discover_python_unittests of dev/sparktestsupport/modules.py +is_slow_test = True + + class BasicIndexingTest(ComparisonTestBase): @property def pdf(self): diff --git a/python/pyspark/pandas/tests/test_ops_on_diff_frames.py b/python/pyspark/pandas/tests/test_ops_on_diff_frames.py index 12e87b2..db8beb7 100644 --- a/python/pyspark/pandas/tests/test_ops_on_diff_frames.py +++ b/python/pyspark/pandas/tests/test_ops_on_diff_frames.py @@ -35,6 +35,11 @@ from pyspark.pandas.typedef.typehints import ( ) +# This is used in run-tests.py to discover the slow test. See more in the doc of +# _discover_python_unittests of dev/sparktestsupport/modules.py +is_slow_test = True + + class OpsOnDiffFramesEnabledTest(PandasOnSparkTestCase, SQLTestUtils): @classmethod def setUpClass(cls): diff --git a/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py b/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py index 70c3089..97efcf8 100644 --- a/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py +++ b/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py @@ -25,6 +25,11 @@ from pyspark.testing.pandasutils import PandasOnSparkTestCase from pyspark.testing.sqlutils import SQLTestUtils +# This is used in run-tests.py to discover the slow test. See more in the doc of +# _discover_python_unittests of dev/sparktestsupport/modules.py +is_slow_test = True + + class OpsOnDiffFramesGroupByTest(PandasOnSparkTestCase, SQLTestUtils): @classmethod def setUpClass(cls): diff --git a/python/pyspark/pandas/tests/test_series.py b/python/pyspark/pandas/tests/test_series.py index 3bb06215..9d792b7 100644 --- a/python/pyspark/pandas/tests/test_series.py +++ b/python/pyspark/pandas/tests/test_series.py @@ -44,6 +44,11 @@ from pyspark.pandas.typedef.typehints import ( ) +# This is used in run-tests.py to discover the slow test. 
See more in the doc of +# _discover_python_unittests of dev/sparktestsupport/modules.py +is_slow_test = True + + class SeriesTest(PandasOnSparkTestCase, SQLTestUtils): @property def pser(self): diff --git a/python/pyspark/pandas/tests/test_stats.py b/python/pyspark/pandas/tests/test_stats.py index 1a38665..06b35e0 100644 --- a/python/pyspark/pandas/tests/test_stats.py +++ b/python/pyspark/pandas/tests/test_stats.py @@ -31,6 +31,11 @@ from pyspark.testing.pandasutils import PandasOnSparkTestCase, SPARK_CONF_ARROW_ from pyspark.testing.sqlutils import SQLTestUtils +# This is used in run-tests.py to discover the slow test. See more in the doc of +# _discover_python_unittests of dev/sparktestsupport/modules.py +is_slow_test = True + + class StatsTest(PandasOnSparkTestCase, SQLTestUtils): def _test_stat_functions(self, pdf_or_pser, psdf_or_psser): functions = ["max", "min", "mean", "sum", "count"] --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org