Yikun commented on a change in pull request #33174:
URL: https://github.com/apache/spark/pull/33174#discussion_r662283934



##########
File path: python/run-tests.py
##########
@@ -40,6 +42,99 @@
 from sparktestsupport.shellutils import which, subprocess_check_output  # noqa
 from sparktestsupport.modules import all_modules, pyspark_sql  # noqa
 
+# Make sure logging is configured before any possible logging call
+logging.basicConfig(stream=sys.stdout, format="%(message)s")
+LOGGER = logging.getLogger()
+
+
+def _get_module_from_name(name):
+    """Import the module named `name` and return the module object."""
+    __import__(name)
+    return sys.modules[name]
+
+
+def _discover_python_unittests(paths, discover_slow=False):
+    """Discover the python module which contains unittests under paths.
+
+    Such as:
+    ['pyspark/tests'], it will return the set of module name under the path of 
pyspark/tests, like
+    {'pyspark.tests.test_appsubmit', 'pyspark.tests.test_broadcast', ...}
+
+    Parameters
+    ----------
+    paths : list
+        Paths of modules to be discovered. An entry may also be a tuple such as
+        ("pyspark/pandas/tests", "slow") to force slow-test discovery for that path.
+    discover_slow : bool
+        If True, only discover slow tests.
+        If False, discover all tests except slow tests.
+
+    Returns
+    -------
+    A set of complete test module names discovered under the specified paths.
+
+    >>> sorted([x for x in _discover_python_unittests(['pyspark/tests'])])
+    ... # doctest: +NORMALIZE_WHITESPACE
+    ['pyspark.tests.test_appsubmit', 'pyspark.tests.test_broadcast', 'pyspark.tests.test_conf',
+    'pyspark.tests.test_context', 'pyspark.tests.test_daemon', 'pyspark.tests.test_install_spark',
+    'pyspark.tests.test_join', 'pyspark.tests.test_pin_thread', 'pyspark.tests.test_profiler',
+    'pyspark.tests.test_rdd', 'pyspark.tests.test_rddbarrier', 'pyspark.tests.test_readwrite',
+    'pyspark.tests.test_serializers', 'pyspark.tests.test_shuffle',
+    'pyspark.tests.test_taskcontext', 'pyspark.tests.test_util', 'pyspark.tests.test_worker']
+    >>> sorted([x for x in _discover_python_unittests([("pyspark/pandas/tests", "slow")])])
+    ... # doctest: +NORMALIZE_WHITESPACE
+    ['pyspark.pandas.tests.indexes.test_base', 'pyspark.pandas.tests.indexes.test_datetime',
+    'pyspark.pandas.tests.test_dataframe', 'pyspark.pandas.tests.test_groupby',
+    'pyspark.pandas.tests.test_indexing', 'pyspark.pandas.tests.test_ops_on_diff_frames',
+    'pyspark.pandas.tests.test_ops_on_diff_frames_groupby', 'pyspark.pandas.tests.test_series',
+    'pyspark.pandas.tests.test_stats']
+    """
+
+    def add_test_module(testcases, modules, slow):
+        """Append the testcases module names to modules set"""
+        if isinstance(testcases, Iterable):
+            for test_case in testcases:
+                add_test_module(test_case, modules, slow)
+        else:
+            name = testcases.__module__
+            module = _get_module_from_name(name)
+            # A module is marked slow by defining a module-level
+            # `is_slow_test` attribute; collect the module only when
+            # that marker matches the requested kind.
+            if slow == hasattr(module, 'is_slow_test'):
+                modules.add(name)
+
+    if not paths:
+        return set()
+    modules = set()
+    pyspark_path = os.path.join(SPARK_HOME, "python")
+    for path in paths:
+        # A ("path", "slow") tuple forces slow discovery for this path only;
+        # use a per-path flag so it does not leak into the following paths.
+        slow = discover_slow
+        if isinstance(path, tuple) and len(path) >= 2 and path[1] == "slow":
+            slow = True
+            path = path[0]
+        # Discover the unittest in every path
+        testcases = unittest.defaultTestLoader.discover(
+            os.path.join(pyspark_path, path),
+            top_level_dir=pyspark_path
+        )
+        if not unittest.defaultTestLoader.errors:
+            add_test_module(testcases, modules, slow)
+        else:
+            # unittest discovery needs all dependencies of PySpark to be importable
+            for error in unittest.defaultTestLoader.errors:
+                print(error)
+            raise Exception("Discovering unittests failed. Please make sure you have "
+                            "installed all dependencies of PySpark.")

Review comment:
   In order to reuse the module-discovery traversal when doctest support is added later, it is better to use `pkgutil.walk_packages` and add a `_contain_unittests_class` helper to make sure only unittest modules are loaded.
   
   And when we want to add doctest support in the future, we could add something like `_contain_doctests_class`:
   
   ```python
   pyspark_path = os.path.join(SPARK_HOME, "python")
   real_path = os.path.join(pyspark_path, path)
   prefix = path.replace('/', '.')
   # Walk the modules under real_path
   for importer, module_name, ispkg in pkgutil.walk_packages([real_path], prefix=prefix + '.'):
       if _contain_unittests_class(module_name, slow_only):
           modules.add(module_name)
   ```
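   
   `_contain_unittests_class` and `slow_only` are not defined in this comment; a minimal sketch of such a helper, assuming it reuses the module-level `is_slow_test` convention from the diff above (the body below is illustrative, not from the PR), might be:
   
   ```python
   import inspect
   import sys
   import unittest
   
   
   def _contain_unittests_class(module_name, slow_only=False):
       """Return True if the module defines at least one unittest.TestCase
       subclass and its slow/non-slow marker matches slow_only (sketch only)."""
       __import__(module_name)
       module = sys.modules[module_name]
       # Mismatched slow marker: skip the module entirely.
       if slow_only != hasattr(module, 'is_slow_test'):
           return False
       # Accept the module if any of its attributes is a TestCase subclass.
       return any(
           inspect.isclass(member) and issubclass(member, unittest.TestCase)
           for member in vars(module).values()
       )
   ```
   
   A `_contain_doctests_class` variant could follow the same shape, e.g. checking whether `doctest.DocTestFinder().find(module)` returns anything instead of looking for `TestCase` subclasses.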




