BryanCutler commented on a change in pull request #30309:
URL: https://github.com/apache/spark/pull/30309#discussion_r523380710



##########
File path: python/pyspark/util.py
##########
@@ -75,6 +79,144 @@ def wrapper(*args, **kwargs):
     return wrapper
 
 
+def walk_tb(tb):
+    while tb is not None:
+        yield tb
+        tb = tb.tb_next
+
+
+def try_simplify_traceback(tb):
+    """
+    Simplify the traceback. It removes the tracebacks in the current package, and only
+    shows the traceback that is related to the thirdparty and user-specified codes.
+
+    Returns
+    -------
+    TracebackType or None
+      Simplified traceback instance. It returns None if it fails to simplify.
+
+    Notes
+    -----
+    This keeps the tracebacks once it sees they are from a different file even
+    though the following tracebacks are from the current package.
+
+    Examples
+    --------
+    >>> import importlib
+    >>> import sys
+    >>> import traceback
+    >>> import tempfile
+    >>> with tempfile.TemporaryDirectory() as tmp_dir:
+    ...     with open("%s/dummy_module.py" % tmp_dir, "w") as f:
+    ...         _ = f.write(
+    ...             'def raise_stop_iteration():\\n'
+    ...             '    raise StopIteration()\\n\\n'
+    ...             'def simple_wrapper(f):\\n'
+    ...             '    def wrapper(*a, **k):\\n'
+    ...             '        return f(*a, **k)\\n'
+    ...             '    return wrapper\\n')
+    ...         f.flush()
+    ...         spec = importlib.util.spec_from_file_location(
+    ...             "dummy_module", "%s/dummy_module.py" % tmp_dir)
+    ...         dummy_module = importlib.util.module_from_spec(spec)
+    ...         spec.loader.exec_module(dummy_module)
+    >>> def skip_doctest_traceback(tb):
+    ...     import pyspark
+    ...     root = os.path.dirname(pyspark.__file__)
+    ...     pairs = zip(walk_tb(tb), traceback.extract_tb(tb))
+    ...     for cur_tb, cur_frame in pairs:
+    ...         if cur_frame.filename.startswith(root):
+    ...             return cur_tb
+
+    Regular exceptions should show the file name of the current package as below.
+
+    >>> exc_info = None
+    >>> try:
+    ...    fail_on_stopiteration(dummy_module.raise_stop_iteration)()

Review comment:
       indentation off here?
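
    For reference, the flagged line indents the code by three spaces after the "... " prompt, while the following doctest lines use four. A minimal, self-contained doctest sketch of the four-space convention (not the actual snippet from the diff):

        >>> try:
        ...     raise ValueError("boom")
        ... except ValueError as e:
        ...     print(type(e).__name__)
        ValueError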

##########
File path: python/pyspark/util.py
##########
@@ -20,6 +20,10 @@
 import re
 import sys
 import traceback
+import types
+import os
+import platform
+import itertools

Review comment:
       nit: imports not in alphabetical order - not necessary though
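
    For reference, a sketch of what the combined import block looks like in alphabetical order, limited to the modules visible in this hunk:

        import itertools
        import os
        import platform
        import re
        import sys
        import traceback
        import types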

##########
File path: python/pyspark/util.py
##########
@@ -75,6 +79,144 @@ def wrapper(*args, **kwargs):
     return wrapper
 
 
+def walk_tb(tb):
+    while tb is not None:
+        yield tb
+        tb = tb.tb_next
+
+
+def try_simplify_traceback(tb):
+    """
+    Simplify the traceback. It removes the tracebacks in the current package, and only
+    shows the traceback that is related to the thirdparty and user-specified codes.
+
+    Returns
+    -------
+    TracebackType or None
+      Simplified traceback instance. It returns None if it fails to simplify.
+
+    Notes
+    -----
+    This keeps the tracebacks once it sees they are from a different file even
+    though the following tracebacks are from the current package.
+
+    Examples
+    --------
+    >>> import importlib
+    >>> import sys
+    >>> import traceback
+    >>> import tempfile
+    >>> with tempfile.TemporaryDirectory() as tmp_dir:
+    ...     with open("%s/dummy_module.py" % tmp_dir, "w") as f:
+    ...         _ = f.write(
+    ...             'def raise_stop_iteration():\\n'
+    ...             '    raise StopIteration()\\n\\n'
+    ...             'def simple_wrapper(f):\\n'
+    ...             '    def wrapper(*a, **k):\\n'
+    ...             '        return f(*a, **k)\\n'
+    ...             '    return wrapper\\n')
+    ...         f.flush()
+    ...         spec = importlib.util.spec_from_file_location(
+    ...             "dummy_module", "%s/dummy_module.py" % tmp_dir)
+    ...         dummy_module = importlib.util.module_from_spec(spec)
+    ...         spec.loader.exec_module(dummy_module)
+    >>> def skip_doctest_traceback(tb):
+    ...     import pyspark
+    ...     root = os.path.dirname(pyspark.__file__)
+    ...     pairs = zip(walk_tb(tb), traceback.extract_tb(tb))
+    ...     for cur_tb, cur_frame in pairs:
+    ...         if cur_frame.filename.startswith(root):
+    ...             return cur_tb
+
+    Regular exceptions should show the file name of the current package as below.
+
+    >>> exc_info = None
+    >>> try:
+    ...    fail_on_stopiteration(dummy_module.raise_stop_iteration)()
+    ... except Exception as e:
+    ...     tb = sys.exc_info()[-1]
+    ...     e.__cause__ = None
+    ...     exc_info = "".join(
+    ...         traceback.format_exception(type(e), e, tb))
+    >>> print(exc_info)  # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
+    Traceback (most recent call last):
+      File ...
+        ...
+      File "/.../pyspark/util.py", line ...
+        ...
+    RuntimeError: ...
+    >>> "pyspark/util.py" in exc_info
+    True
+
+    If the the traceback is simplified with this method, it hides the current package file name:

Review comment:
       "the the" -> "the"

##########
File path: sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
##########
@@ -1927,6 +1927,16 @@ object SQLConf {
       .version("3.0.0")
       .fallbackConf(BUFFER_SIZE)
 
+  val PYSPARK_SIMPLIFIEID_EXCEPTION =
+    buildConf("spark.sql.execution.pyspark.udf.simplifiedException.enabled")
+      .doc(
+        s"When true, the exception messages from Python UDFs are simplified. 
It hides " +
+         "the Python worker, (de)serialization, etc from PySpark in 
tracebacks, and only" +

Review comment:
       need a space after 'only'?
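
    For context, string pieces joined with "+" get no implicit separator, so the trailing space has to live in one of the pieces. A tiny Python illustration of the same pitfall (hypothetical placeholder text, not the actual conf doc, and the reviewed file itself is Scala):

        # Hypothetical placeholder text for illustration only; "+" adds no
        # separator, so without a trailing space the words run together.
        joined = "tracebacks, and only" + "<next words>"
        fixed = "tracebacks, and only " + "<next words>"
        print(joined)  # ...and only<next words>
        print(fixed)   # ...and only <next words>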

##########
File path: python/pyspark/util.py
##########
@@ -75,6 +79,144 @@ def wrapper(*args, **kwargs):
     return wrapper
 
 
+def walk_tb(tb):
+    while tb is not None:
+        yield tb
+        tb = tb.tb_next
+
+
+def try_simplify_traceback(tb):
+    """
+    Simplify the traceback. It removes the tracebacks in the current package, and only
+    shows the traceback that is related to the thirdparty and user-specified codes.
+
+    Returns
+    -------
+    TracebackType or None
+      Simplified traceback instance. It returns None if it fails to simplify.
+
+    Notes
+    -----
+    This keeps the tracebacks once it sees they are from a different file even
+    though the following tracebacks are from the current package.
+
+    Examples
+    --------
+    >>> import importlib
+    >>> import sys
+    >>> import traceback
+    >>> import tempfile
+    >>> with tempfile.TemporaryDirectory() as tmp_dir:
+    ...     with open("%s/dummy_module.py" % tmp_dir, "w") as f:
+    ...         _ = f.write(
+    ...             'def raise_stop_iteration():\\n'
+    ...             '    raise StopIteration()\\n\\n'
+    ...             'def simple_wrapper(f):\\n'
+    ...             '    def wrapper(*a, **k):\\n'
+    ...             '        return f(*a, **k)\\n'
+    ...             '    return wrapper\\n')
+    ...         f.flush()
+    ...         spec = importlib.util.spec_from_file_location(
+    ...             "dummy_module", "%s/dummy_module.py" % tmp_dir)
+    ...         dummy_module = importlib.util.module_from_spec(spec)
+    ...         spec.loader.exec_module(dummy_module)
+    >>> def skip_doctest_traceback(tb):
+    ...     import pyspark
+    ...     root = os.path.dirname(pyspark.__file__)
+    ...     pairs = zip(walk_tb(tb), traceback.extract_tb(tb))
+    ...     for cur_tb, cur_frame in pairs:
+    ...         if cur_frame.filename.startswith(root):
+    ...             return cur_tb
+
+    Regular exceptions should show the file name of the current package as below.
+
+    >>> exc_info = None
+    >>> try:
+    ...    fail_on_stopiteration(dummy_module.raise_stop_iteration)()

Review comment:
       also looks like similar spots below.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


