tvalentyn commented on code in PR #35656:
URL: https://github.com/apache/beam/pull/35656#discussion_r2274527514


##########
sdks/python/apache_beam/internal/code_object_pickler.py:
##########
@@ -15,7 +15,462 @@
 # limitations under the License.
 #
 
+"""Customizations to how Python code objects are pickled.
+
+This module provides functions for pickling code objects, especially lambdas,
+in a consistent way. It addresses issues with non-deterministic pickling by
+creating a unique identifier that is invariant to small changes in the source
+code.
+
+The code object identifiers consists of a sequence of the following parts
+separated by periods:
+- Module names - The name of the module the code object is in
+- Class names - The name of a class containing the code object. There can be
+  multiple of these in the same identifier in the case of nested
+  classes.
+- Function names - The name of the function containing the code object.
+  There can be multiple of these in the case of nested functions.
+- __code__ - Attribute indicating that we are entering the code object of a
+  function/method.
+- __co_consts__[<name>] - The name of the local variable containing the
+  code object. In the case of lambdas, the name is created by using the
+  signature of the lambda and hashing the bytecode, as shown below.
+
+Examples:
+- __main__.top_level_function.__code__
+- __main__.ClassWithNestedFunction.process.__code__.co_consts[nested_function]
+- __main__.ClassWithNestedLambda.process.__code__.co_consts[
+    get_lambda_from_dictionary].co_consts[<lambda>, ('x',)]
+- __main__.ClassWithNestedLambda.process.__code__.co_consts[
+    <lambda>, ('x',), 1234567890]
+"""
+
+import collections
+import hashlib
+import inspect
+import re
+import sys
+import types
+from typing import Union
+
 
 def get_normalized_path(path):
   """Returns a normalized path. This function is intended to be overridden."""
   return path
+
+
+def get_code_path(callable: types.FunctionType):

Review Comment:
   add the type hint (str) for the result



##########
sdks/python/apache_beam/internal/code_object_pickler.py:
##########
@@ -15,7 +15,462 @@
 # limitations under the License.
 #
 
+"""Customizations to how Python code objects are pickled.
+
+This module provides functions for pickling code objects, especially lambdas,
+in a consistent way. It addresses issues with non-deterministic pickling by
+creating a unique identifier that is invariant to small changes in the source
+code.
+
+The code object identifiers consists of a sequence of the following parts
+separated by periods:
+- Module names - The name of the module the code object is in
+- Class names - The name of a class containing the code object. There can be
+  multiple of these in the same identifier in the case of nested
+  classes.
+- Function names - The name of the function containing the code object.
+  There can be multiple of these in the case of nested functions.
+- __code__ - Attribute indicating that we are entering the code object of a
+  function/method.
+- __co_consts__[<name>] - The name of the local variable containing the
+  code object. In the case of lambdas, the name is created by using the
+  signature of the lambda and hashing the bytecode, as shown below.
+
+Examples:
+- __main__.top_level_function.__code__
+- __main__.ClassWithNestedFunction.process.__code__.co_consts[nested_function]
+- __main__.ClassWithNestedLambda.process.__code__.co_consts[
+    get_lambda_from_dictionary].co_consts[<lambda>, ('x',)]
+- __main__.ClassWithNestedLambda.process.__code__.co_consts[
+    <lambda>, ('x',), 1234567890]
+"""
+
+import collections
+import hashlib
+import inspect
+import re
+import sys
+import types
+from typing import Union
+
 
 def get_normalized_path(path):
   """Returns a normalized path. This function is intended to be overridden."""
   return path
+
+
+def get_code_path(callable: types.FunctionType):
+  """Returns the stable reference to the code object.
+
+  Will be implemented using cloudpickle in a future version.
+
+  Args:
+    callable: The callable object to search for.
+
+  Returns:
+    The stable reference to the code object.
+      Examples:
+      - __main__.top_level_function.__code__
+      - __main__.ClassWithNestedFunction.process.__code__.co_consts[
+        nested_function]
+      - __main__.ClassWithNestedLambda.process.__code__.co_consts[
+        get_lambda_from_dictionary].co_consts[<lambda>, ('x',)]
+      - __main__.ClassWithNestedLambda.process.__code__.co_consts[
+        <lambda>, ('x',), 1234567890]
+  """
+  if not hasattr(callable, '__module__') or not hasattr(callable,
+                                                        '__qualname__'):
+    return None
+  code_path = _extend_path(
+      callable.__module__,
+      _search(
+          callable,
+          sys.modules[callable.__module__],
+          callable.__qualname__.split('.'),
+      ),
+  )
+  return code_path
+
+
+def _extend_path(prefix: str, suffix: str):
+  """Extends the path to the code object.
+
+  Args:
+    prefix: The prefix of the path.
+    suffix: The rest of the path.
+
+  Returns:
+    The extended path.
+  """
+  if suffix is None:

Review Comment:
   I would rename the args as follows, since it's a bit confusing that we have 
a suffix, a prefix but no middle. seems like this is used to prepend prefixes, 
how about:
   
   def _extend_path(prefix: str, current_path: Optional[str]):
   
   Note: use Optional to indicate the value can be None



##########
sdks/python/apache_beam/internal/code_object_pickler.py:
##########
@@ -95,13 +105,16 @@ def _extend_path(prefix, suffix):
   return prefix + '.' + suffix
 
 
-def _search(callable, node, qual_name_parts):
+def _search(
+    callable: types.FunctionType, node: object, qual_name_parts: list[str]):
   """Searches an object to create a stable reference code path.
 
   Recursively searches the tree of objects starting from node and looking for
   callable and returns a string to uniquely identify the path from node to the
   callable.
 
+  Example of qual_name_parts: ['MyClass', 'process', '<locals>', '<lambda>']

Review Comment:
   As a reader of this code my first initial question was:  why are we passing 
both callable and callable.__qualname__.split('.') as arguments to _search , 
why not to pass only callable, and figure out qual_name_parts within the 
function.
   
   The docstring doesn't answer that. Looks like it play some part in the 
recursion further down in the callstack, where we chip-away pieces of qual name 
starting from the beginning. This can be understood only from reading the 
details of the code. Do you think it would be possible to capture the nature of 
using qual_name_parts here? I also wonder if it is otherwise possible to not 
have this argument.



##########
sdks/python/apache_beam/internal/code_object_pickler.py:
##########
@@ -15,7 +15,421 @@
 # limitations under the License.
 #
 
+"""Customizations to how Python code objects are pickled.
+
+This module provides functions for pickling code objects, especially lambdas,
+in a consistent way. It addresses issues with non-deterministic pickling by
+creating a unique identifier that is invariant to small changes in the source
+code.
+
+The code object identifiers consists of a sequence of the following parts

Review Comment:
   I see, the concept of codeobject identifiers is indeed common to several 
methods. Consider the following docstring:
   
   ```
   This module provides helper functions to improve pickling code objects, 
especially lambdas,
   in a more consistent way by using code object identifiers.  
   
   A code object identifier is a unique identifier for a code object, that 
provides 
   a unique reference to the code object in the context where the code is 
defined 
   and is invariant to small changes in the surrounding source code.
   
   The code object identifier consists of a sequence of the following parts 
   separated by periods:
   - Module names - The name of the module the code object is in
   - Class names - The name of a class containing the code object. There can be
     multiple of these in the same identifier in the case of nested
     classes.
   - Function names - The name of the function containing the code object.
     There can be multiple of these in the case of nested functions.
   - __code__ - Attribute indicating that we are entering the code object of a
     function/method.
   - __co_consts__[<name>] - The name of the local variable containing the
     code object. In the case of lambdas, the name is created by using the
     signature of the lambda and hashing the bytecode, as shown below.
   Examples:
   - __main__.top_level_function.__code__
   - 
__main__.ClassWithNestedFunction.process.__code__.co_consts[nested_function]
   - __main__.ClassWithNestedLambda.process.__code__.co_consts[
       get_lambda_from_dictionary].co_consts[<lambda>, ('x',)]
   - __main__.ClassWithNestedLambda.process.__code__.co_consts[
       <lambda>, ('x',), 1234567890]
   ```
   



##########
sdks/python/apache_beam/internal/code_object_pickler.py:
##########
@@ -15,7 +15,462 @@
 # limitations under the License.
 #
 
+"""Customizations to how Python code objects are pickled.
+
+This module provides functions for pickling code objects, especially lambdas,
+in a consistent way. It addresses issues with non-deterministic pickling by
+creating a unique identifier that is invariant to small changes in the source
+code.
+
+The code object identifiers consists of a sequence of the following parts
+separated by periods:
+- Module names - The name of the module the code object is in
+- Class names - The name of a class containing the code object. There can be
+  multiple of these in the same identifier in the case of nested
+  classes.
+- Function names - The name of the function containing the code object.
+  There can be multiple of these in the case of nested functions.
+- __code__ - Attribute indicating that we are entering the code object of a
+  function/method.
+- __co_consts__[<name>] - The name of the local variable containing the
+  code object. In the case of lambdas, the name is created by using the
+  signature of the lambda and hashing the bytecode, as shown below.
+
+Examples:
+- __main__.top_level_function.__code__
+- __main__.ClassWithNestedFunction.process.__code__.co_consts[nested_function]
+- __main__.ClassWithNestedLambda.process.__code__.co_consts[
+    get_lambda_from_dictionary].co_consts[<lambda>, ('x',)]
+- __main__.ClassWithNestedLambda.process.__code__.co_consts[
+    <lambda>, ('x',), 1234567890]
+"""
+
+import collections
+import hashlib
+import inspect
+import re
+import sys
+import types
+from typing import Union
+
 
 def get_normalized_path(path):
   """Returns a normalized path. This function is intended to be overridden."""
   return path
+
+
+def get_code_path(callable: types.FunctionType):
+  """Returns the stable reference to the code object.

Review Comment:
   I'd try to use  consistent terminology: we currently 'code object 
identifier', 'stable reference' and 'code path' all seemingly meaning the same 
thing:
   
   
    Suggested docstring:
   
   ```
   def get_code_path(callable: types.FunctionType):
     """Returns the code object identifier for a given callable.
   ...
   
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to