This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 5db51ef  [SPARK-35721][PYTHON] Path level discover for python unittests
5db51ef is described below

commit 5db51efa1a0d98ad74a94f7b73bcb1161817e0a5
Author: Yikun Jiang <yikunk...@gmail.com>
AuthorDate: Tue Jun 29 17:56:13 2021 +0900

    [SPARK-35721][PYTHON] Path level discover for python unittests
    
    ### What changes were proposed in this pull request?
    Add path level discover for python unittests.
    
    ### Why are the changes needed?
    Now we need to specify the python test cases manually when we add a new 
testcase. Sometimes, we forget to add the testcase to the module list, and then 
the testcase is not executed.
    
    Such as:
    - pyspark-core pyspark.tests.test_pin_thread
    
    Thus we need some auto-discovery mechanism to find all testcases rather than 
specifying every case manually.
    
    ### Does this PR introduce _any_ user-facing change?
    No
    
    ### How was this patch tested?
    Add below code in end of `dev/sparktestsupport/modules.py`
    ```python
    for m in sorted(all_modules):
        for g in sorted(m.python_test_goals):
            print(m.name, g)
    ```
    Compare the result before and after:
    https://www.diffchecker.com/iO3FvhKL
    
    Closes #32867 from Yikun/SPARK_DISCOVER_TEST.
    
    Authored-by: Yikun Jiang <yikunk...@gmail.com>
    Signed-off-by: Hyukjin Kwon <gurwls...@apache.org>
---
 dev/sparktestsupport/modules.py                    | 212 +++++++--------------
 python/pyspark/pandas/tests/indexes/test_base.py   |   5 +
 .../pyspark/pandas/tests/indexes/test_datetime.py  |   5 +
 python/pyspark/pandas/tests/test_dataframe.py      |   5 +
 python/pyspark/pandas/tests/test_groupby.py        |   5 +
 python/pyspark/pandas/tests/test_indexing.py       |   5 +
 .../pandas/tests/test_ops_on_diff_frames.py        |   5 +
 .../tests/test_ops_on_diff_frames_groupby.py       |   5 +
 python/pyspark/pandas/tests/test_series.py         |   5 +
 python/pyspark/pandas/tests/test_stats.py          |   5 +
 10 files changed, 117 insertions(+), 140 deletions(-)

diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index 2ba3390..80cd3a4 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -15,14 +15,72 @@
 # limitations under the License.
 #
 
+from collections.abc import Iterable
 from functools import total_ordering
 import itertools
 import os
 import re
+import unittest
+import sys
+
+from sparktestsupport import SPARK_HOME
 
 all_modules = []
 
 
+def _get_module_from_name(name):
+    __import__(name)
+    return sys.modules[name]
+
+
+def _discover_python_unittests(*paths, discover_slow=False):
+    """Discover the python module which contains unittests under paths.
+
+    Such as:
+    ['pyspark/tests'], it will return the set of module name under the path of 
pyspark/tests, like
+    {'pyspark.tests.test_appsubmit', 'pyspark.tests.test_broadcast', ...}
+
+    Parameters
+    ----------
+    paths : str
+        Paths of modules to be discovered.
+    discover_slow : bool
+        If True, will only discover slow tests
+        If False, will discover all tests except slow tests
+
+    Returns
+    -------
+    A set of complete test module name discovered under specified paths
+    """
+
+    def add_test_module(testcases, modules, slow):
+        """Append the testcases module names to modules set"""
+        if isinstance(testcases, Iterable):
+            for test_case in testcases:
+                add_test_module(test_case, modules, slow)
+        else:
+            name = testcases.__module__
+            module = _get_module_from_name(name)
+            if slow and hasattr(module, 'is_slow_test'):
+                modules.add(name)
+            if not slow and not hasattr(module, 'is_slow_test'):
+                modules.add(name)
+
+    if not paths:
+        return []
+    modules = set()
+    pyspark_path = os.path.join(SPARK_HOME, "python")
+    for path in paths:
+        # Discover the unittest in every path
+        testcases = unittest.defaultTestLoader.discover(
+            os.path.join(pyspark_path, path),
+            top_level_dir=pyspark_path
+        )
+        add_test_module(testcases, modules, discover_slow)
+
+    return sorted(list(modules))
+
+
 @total_ordering
 class Module(object):
     """
@@ -388,24 +446,7 @@ pyspark_core = Module(
         "pyspark.profiler",
         "pyspark.shuffle",
         "pyspark.util",
-        # unittests
-        "pyspark.tests.test_appsubmit",
-        "pyspark.tests.test_broadcast",
-        "pyspark.tests.test_conf",
-        "pyspark.tests.test_context",
-        "pyspark.tests.test_daemon",
-        "pyspark.tests.test_install_spark",
-        "pyspark.tests.test_join",
-        "pyspark.tests.test_profiler",
-        "pyspark.tests.test_rdd",
-        "pyspark.tests.test_rddbarrier",
-        "pyspark.tests.test_readwrite",
-        "pyspark.tests.test_serializers",
-        "pyspark.tests.test_shuffle",
-        "pyspark.tests.test_taskcontext",
-        "pyspark.tests.test_util",
-        "pyspark.tests.test_worker",
-    ]
+    ] + _discover_python_unittests("pyspark/tests"),
 )
 
 pyspark_sql = Module(
@@ -437,32 +478,7 @@ pyspark_sql = Module(
         "pyspark.sql.pandas.serializers",
         "pyspark.sql.pandas.typehints",
         "pyspark.sql.pandas.utils",
-        # unittests
-        "pyspark.sql.tests.test_arrow",
-        "pyspark.sql.tests.test_catalog",
-        "pyspark.sql.tests.test_column",
-        "pyspark.sql.tests.test_conf",
-        "pyspark.sql.tests.test_context",
-        "pyspark.sql.tests.test_dataframe",
-        "pyspark.sql.tests.test_datasources",
-        "pyspark.sql.tests.test_functions",
-        "pyspark.sql.tests.test_group",
-        "pyspark.sql.tests.test_pandas_cogrouped_map",
-        "pyspark.sql.tests.test_pandas_grouped_map",
-        "pyspark.sql.tests.test_pandas_map",
-        "pyspark.sql.tests.test_pandas_udf",
-        "pyspark.sql.tests.test_pandas_udf_grouped_agg",
-        "pyspark.sql.tests.test_pandas_udf_scalar",
-        "pyspark.sql.tests.test_pandas_udf_typehints",
-        "pyspark.sql.tests.test_pandas_udf_window",
-        "pyspark.sql.tests.test_readwriter",
-        "pyspark.sql.tests.test_serde",
-        "pyspark.sql.tests.test_session",
-        "pyspark.sql.tests.test_streaming",
-        "pyspark.sql.tests.test_types",
-        "pyspark.sql.tests.test_udf",
-        "pyspark.sql.tests.test_utils",
-    ]
+    ] + _discover_python_unittests("pyspark/sql/tests"),
 )
 
 
@@ -474,10 +490,7 @@ pyspark_resource = Module(
     source_file_regexes=[
         "python/pyspark/resource"
     ],
-    python_test_goals=[
-        # unittests
-        "pyspark.resource.tests.test_resources",
-    ]
+    python_test_goals=_discover_python_unittests("pyspark/resource/tests"),
 )
 
 
@@ -494,12 +507,7 @@ pyspark_streaming = Module(
     python_test_goals=[
         # doctests
         "pyspark.streaming.util",
-        # unittests
-        "pyspark.streaming.tests.test_context",
-        "pyspark.streaming.tests.test_dstream",
-        "pyspark.streaming.tests.test_kinesis",
-        "pyspark.streaming.tests.test_listener",
-    ]
+    ] + _discover_python_unittests("pyspark/streaming/tests"),
 )
 
 
@@ -525,17 +533,10 @@ pyspark_mllib = Module(
         "pyspark.mllib.stat.KernelDensity",
         "pyspark.mllib.tree",
         "pyspark.mllib.util",
-        # unittests
-        "pyspark.mllib.tests.test_algorithms",
-        "pyspark.mllib.tests.test_feature",
-        "pyspark.mllib.tests.test_linalg",
-        "pyspark.mllib.tests.test_stat",
-        "pyspark.mllib.tests.test_streaming_algorithms",
-        "pyspark.mllib.tests.test_util",
-    ],
+    ] + _discover_python_unittests("pyspark/mllib/tests"),
     excluded_python_implementations=[
         "PyPy"  # Skip these tests under PyPy since they require numpy and it 
isn't available there
-    ]
+    ],
 )
 
 
@@ -559,27 +560,13 @@ pyspark_ml = Module(
         "pyspark.ml.regression",
         "pyspark.ml.stat",
         "pyspark.ml.tuning",
-        # unittests
-        "pyspark.ml.tests.test_algorithms",
-        "pyspark.ml.tests.test_base",
-        "pyspark.ml.tests.test_evaluation",
-        "pyspark.ml.tests.test_feature",
-        "pyspark.ml.tests.test_image",
-        "pyspark.ml.tests.test_linalg",
-        "pyspark.ml.tests.test_param",
-        "pyspark.ml.tests.test_persistence",
-        "pyspark.ml.tests.test_pipeline",
-        "pyspark.ml.tests.test_stat",
-        "pyspark.ml.tests.test_training_summary",
-        "pyspark.ml.tests.test_tuning",
-        "pyspark.ml.tests.test_util",
-        "pyspark.ml.tests.test_wrapper",
-    ],
+    ] + _discover_python_unittests("pyspark/ml/tests"),
     excluded_python_implementations=[
         "PyPy"  # Skip these tests under PyPy since they require numpy and it 
isn't available there
-    ]
+    ],
 )
 
+
 pyspark_pandas = Module(
     name="pyspark-pandas",
     dependencies=[pyspark_core, pyspark_sql],
@@ -614,59 +601,14 @@ pyspark_pandas = Module(
         "pyspark.pandas.spark.accessors",
         "pyspark.pandas.spark.utils",
         "pyspark.pandas.typedef.typehints",
-        # unittests
-        "pyspark.pandas.tests.data_type_ops.test_base",
-        "pyspark.pandas.tests.data_type_ops.test_binary_ops",
-        "pyspark.pandas.tests.data_type_ops.test_boolean_ops",
-        "pyspark.pandas.tests.data_type_ops.test_categorical_ops",
-        "pyspark.pandas.tests.data_type_ops.test_complex_ops",
-        "pyspark.pandas.tests.data_type_ops.test_date_ops",
-        "pyspark.pandas.tests.data_type_ops.test_datetime_ops",
-        "pyspark.pandas.tests.data_type_ops.test_decimal_ops",
-        "pyspark.pandas.tests.data_type_ops.test_null_ops",
-        "pyspark.pandas.tests.data_type_ops.test_num_ops",
-        "pyspark.pandas.tests.data_type_ops.test_string_ops",
-        "pyspark.pandas.tests.data_type_ops.test_udt_ops",
-        "pyspark.pandas.tests.indexes.test_category",
-        "pyspark.pandas.tests.plot.test_frame_plot",
-        "pyspark.pandas.tests.plot.test_frame_plot_matplotlib",
-        "pyspark.pandas.tests.plot.test_frame_plot_plotly",
-        "pyspark.pandas.tests.plot.test_series_plot",
-        "pyspark.pandas.tests.plot.test_series_plot_matplotlib",
-        "pyspark.pandas.tests.plot.test_series_plot_plotly",
-        "pyspark.pandas.tests.test_categorical",
-        "pyspark.pandas.tests.test_config",
-        "pyspark.pandas.tests.test_csv",
-        "pyspark.pandas.tests.test_dataframe_conversion",
-        "pyspark.pandas.tests.test_dataframe_spark_io",
-        "pyspark.pandas.tests.test_default_index",
-        "pyspark.pandas.tests.test_expanding",
-        "pyspark.pandas.tests.test_extension",
-        "pyspark.pandas.tests.test_frame_spark",
-        "pyspark.pandas.tests.test_indexops_spark",
-        "pyspark.pandas.tests.test_internal",
-        "pyspark.pandas.tests.test_namespace",
-        "pyspark.pandas.tests.test_numpy_compat",
-        "pyspark.pandas.tests.test_ops_on_diff_frames_groupby_expanding",
-        "pyspark.pandas.tests.test_ops_on_diff_frames_groupby_rolling",
-        "pyspark.pandas.tests.test_repr",
-        "pyspark.pandas.tests.test_reshape",
-        "pyspark.pandas.tests.test_rolling",
-        "pyspark.pandas.tests.test_series_conversion",
-        "pyspark.pandas.tests.test_series_datetime",
-        "pyspark.pandas.tests.test_series_string",
-        "pyspark.pandas.tests.test_spark_functions",
-        "pyspark.pandas.tests.test_sql",
-        "pyspark.pandas.tests.test_typedef",
-        "pyspark.pandas.tests.test_utils",
-        "pyspark.pandas.tests.test_window",
-    ],
+    ] + _discover_python_unittests("pyspark/pandas/tests"),
     excluded_python_implementations=[
         "PyPy"  # Skip these tests under PyPy since they require numpy, 
pandas, and pyarrow and
-                # they aren't available there
-    ]
+        # they aren't available there
+    ],
 )
 
+
 pyspark_pandas_slow = Module(
     name="pyspark-pandas-slow",
     dependencies=[pyspark_core, pyspark_sql],
@@ -678,17 +620,7 @@ pyspark_pandas_slow = Module(
         "pyspark.pandas.frame",
         "pyspark.pandas.generic",
         "pyspark.pandas.series",
-        # unittests
-        "pyspark.pandas.tests.indexes.test_base",
-        "pyspark.pandas.tests.indexes.test_datetime",
-        "pyspark.pandas.tests.test_dataframe",
-        "pyspark.pandas.tests.test_groupby",
-        "pyspark.pandas.tests.test_indexing",
-        "pyspark.pandas.tests.test_ops_on_diff_frames",
-        "pyspark.pandas.tests.test_ops_on_diff_frames_groupby",
-        "pyspark.pandas.tests.test_series",
-        "pyspark.pandas.tests.test_stats",
-    ],
+    ] + _discover_python_unittests("pyspark/pandas/tests", discover_slow=True),
     excluded_python_implementations=[
         "PyPy"  # Skip these tests under PyPy since they require numpy, 
pandas, and pyarrow and
         # they aren't available there
diff --git a/python/pyspark/pandas/tests/indexes/test_base.py 
b/python/pyspark/pandas/tests/indexes/test_base.py
index 3d35cfc..2faad5e 100644
--- a/python/pyspark/pandas/tests/indexes/test_base.py
+++ b/python/pyspark/pandas/tests/indexes/test_base.py
@@ -34,6 +34,11 @@ from pyspark.pandas.missing.indexes import (
 from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils, 
SPARK_CONF_ARROW_ENABLED
 
 
+# This is used in run-tests.py to discover the slow test. See more in the doc 
of
+# _discover_python_unittests of dev/sparktestsupport/modules.py
+is_slow_test = True
+
+
 class IndexesTest(PandasOnSparkTestCase, TestUtils):
     @property
     def pdf(self):
diff --git a/python/pyspark/pandas/tests/indexes/test_datetime.py 
b/python/pyspark/pandas/tests/indexes/test_datetime.py
index 8a55e2e..7934012 100644
--- a/python/pyspark/pandas/tests/indexes/test_datetime.py
+++ b/python/pyspark/pandas/tests/indexes/test_datetime.py
@@ -25,6 +25,11 @@ import pyspark.pandas as ps
 from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils
 
 
+# This is used in run-tests.py to discover the slow test. See more in the doc 
of
+# _discover_python_unittests of dev/sparktestsupport/modules.py
+is_slow_test = True
+
+
 class DatetimeIndexTest(PandasOnSparkTestCase, TestUtils):
     @property
     def fixed_freqs(self):
diff --git a/python/pyspark/pandas/tests/test_dataframe.py 
b/python/pyspark/pandas/tests/test_dataframe.py
index e54b783..858863c 100644
--- a/python/pyspark/pandas/tests/test_dataframe.py
+++ b/python/pyspark/pandas/tests/test_dataframe.py
@@ -50,6 +50,11 @@ from pyspark.testing.sqlutils import SQLTestUtils
 from pyspark.pandas.utils import name_like_string
 
 
+# This is used in run-tests.py to discover the slow test. See more in the doc 
of
+# _discover_python_unittests of dev/sparktestsupport/modules.py
+is_slow_test = True
+
+
 class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
     @property
     def pdf(self):
diff --git a/python/pyspark/pandas/tests/test_groupby.py 
b/python/pyspark/pandas/tests/test_groupby.py
index 1bc182d..5ec93ca 100644
--- a/python/pyspark/pandas/tests/test_groupby.py
+++ b/python/pyspark/pandas/tests/test_groupby.py
@@ -34,6 +34,11 @@ from pyspark.pandas.groupby import is_multi_agg_with_relabel
 from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils
 
 
+# This is used in run-tests.py to discover the slow test. See more in the doc 
of
+# _discover_python_unittests of dev/sparktestsupport/modules.py
+is_slow_test = True
+
+
 class GroupByTest(PandasOnSparkTestCase, TestUtils):
     def test_groupby_simple(self):
         pdf = pd.DataFrame(
diff --git a/python/pyspark/pandas/tests/test_indexing.py 
b/python/pyspark/pandas/tests/test_indexing.py
index b74cf90..056d404 100644
--- a/python/pyspark/pandas/tests/test_indexing.py
+++ b/python/pyspark/pandas/tests/test_indexing.py
@@ -27,6 +27,11 @@ from pyspark.pandas.exceptions import 
SparkPandasIndexingError
 from pyspark.testing.pandasutils import ComparisonTestBase, 
PandasOnSparkTestCase, compare_both
 
 
+# This is used in run-tests.py to discover the slow test. See more in the doc 
of
+# _discover_python_unittests of dev/sparktestsupport/modules.py
+is_slow_test = True
+
+
 class BasicIndexingTest(ComparisonTestBase):
     @property
     def pdf(self):
diff --git a/python/pyspark/pandas/tests/test_ops_on_diff_frames.py 
b/python/pyspark/pandas/tests/test_ops_on_diff_frames.py
index 12e87b2..db8beb7 100644
--- a/python/pyspark/pandas/tests/test_ops_on_diff_frames.py
+++ b/python/pyspark/pandas/tests/test_ops_on_diff_frames.py
@@ -35,6 +35,11 @@ from pyspark.pandas.typedef.typehints import (
 )
 
 
+# This is used in run-tests.py to discover the slow test. See more in the doc 
of
+# _discover_python_unittests of dev/sparktestsupport/modules.py
+is_slow_test = True
+
+
 class OpsOnDiffFramesEnabledTest(PandasOnSparkTestCase, SQLTestUtils):
     @classmethod
     def setUpClass(cls):
diff --git a/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py 
b/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py
index 70c3089..97efcf8 100644
--- a/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py
+++ b/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py
@@ -25,6 +25,11 @@ from pyspark.testing.pandasutils import PandasOnSparkTestCase
 from pyspark.testing.sqlutils import SQLTestUtils
 
 
+# This is used in run-tests.py to discover the slow test. See more in the doc 
of
+# _discover_python_unittests of dev/sparktestsupport/modules.py
+is_slow_test = True
+
+
 class OpsOnDiffFramesGroupByTest(PandasOnSparkTestCase, SQLTestUtils):
     @classmethod
     def setUpClass(cls):
diff --git a/python/pyspark/pandas/tests/test_series.py 
b/python/pyspark/pandas/tests/test_series.py
index 3bb06215..9d792b7 100644
--- a/python/pyspark/pandas/tests/test_series.py
+++ b/python/pyspark/pandas/tests/test_series.py
@@ -44,6 +44,11 @@ from pyspark.pandas.typedef.typehints import (
 )
 
 
+# This is used in run-tests.py to discover the slow test. See more in the doc 
of
+# _discover_python_unittests of dev/sparktestsupport/modules.py
+is_slow_test = True
+
+
 class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
     @property
     def pser(self):
diff --git a/python/pyspark/pandas/tests/test_stats.py 
b/python/pyspark/pandas/tests/test_stats.py
index 1a38665..06b35e0 100644
--- a/python/pyspark/pandas/tests/test_stats.py
+++ b/python/pyspark/pandas/tests/test_stats.py
@@ -31,6 +31,11 @@ from pyspark.testing.pandasutils import 
PandasOnSparkTestCase, SPARK_CONF_ARROW_
 from pyspark.testing.sqlutils import SQLTestUtils
 
 
+# This is used in run-tests.py to discover the slow test. See more in the doc 
of
+# _discover_python_unittests of dev/sparktestsupport/modules.py
+is_slow_test = True
+
+
 class StatsTest(PandasOnSparkTestCase, SQLTestUtils):
     def _test_stat_functions(self, pdf_or_pser, psdf_or_psser):
         functions = ["max", "min", "mean", "sum", "count"]

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to