gemini-code-assist[bot] commented on code in PR #38473:
URL: https://github.com/apache/beam/pull/38473#discussion_r3238659201
##########
sdks/python/apache_beam/yaml/yaml_mapping.py:
##########
@@ -210,80 +230,54 @@ def py_value_to_js_dict(py_value):
def _expand_javascript_mapping_func(
original_fields, expression=None, callable=None, path=None, name=None):
- # Check for installed js2py package
- if js2py is None:
+ if quickjs is None:
raise ValueError(
- "Javascript mapping functions are not supported on"
- " Python 3.12 or later.")
-
- # import remaining js2py objects
- from js2py import base
- from js2py.constructors import jsdate
- from js2py.internals import simplex
-
- js_array_type = (
- base.PyJsArray,
- base.PyJsArrayBuffer,
- base.PyJsInt8Array,
- base.PyJsUint8Array,
- base.PyJsUint8ClampedArray,
- base.PyJsInt16Array,
- base.PyJsUint16Array,
- base.PyJsInt32Array,
- base.PyJsUint32Array,
- base.PyJsFloat32Array,
- base.PyJsFloat64Array)
-
- def _js_object_to_py_object(obj):
- if isinstance(obj, (base.PyJsNumber, base.PyJsString, base.PyJsBoolean)):
- return base.to_python(obj)
- elif isinstance(obj, js_array_type):
- return [_js_object_to_py_object(value) for value in obj.to_list()]
- elif isinstance(obj, jsdate.PyJsDate):
- return obj.to_utc_dt()
- elif isinstance(obj, (base.PyJsNull, base.PyJsUndefined)):
- return None
- elif isinstance(obj, base.PyJsError):
- raise RuntimeError(obj['message'])
- elif isinstance(obj, base.PyJsObject):
- return {
- key: _js_object_to_py_object(value['value'])
- for (key, value) in obj.own.items()
- }
- elif isinstance(obj, base.JsObjectWrapper):
- return _js_object_to_py_object(obj._obj)
-
- return obj
+ "Javascript mapping functions are not supported because the "
+ "quickjs-ng library is not installed.")
if expression:
- source = '\n'.join(['function(__row__) {'] + [
- f' {name} = __row__.{name}'
- for name in original_fields if name in expression
- ] + [' return (' + expression + ')'] + ['}'])
- js_func = _CustomJsObjectWrapper(js2py.eval_js(source))
+ unpacking_code = '\n'.join([
+ f' var {name} = __row__.{name};' for name in original_fields
+ if name in expression
+ ])
+ source_code = f"""
+ const udf = function(__row__) {{
+ {unpacking_code}
+ return ({expression});
+ }};
+ function wrapper(json_str) {{
+ return udf(JSON.parse(json_str));
+ }}
+ """
+ entrypoint = 'wrapper'
elif callable:
- js_func = _CustomJsObjectWrapper(js2py.eval_js(callable))
+ match = re.search(r'(?:async\s+)?function\s+([a-zA-Z0-9_]+)', callable)
Review Comment:

The regex used to extract the function name is quite restrictive and will
fail for arrow functions (`const f = (x) => ...`), anonymous functions assigned
to variables, or exported functions. It also doesn't handle cases where the
function might be `async` (though the regex tries to match it, the subsequent
wrapper call is synchronous). A more robust approach would be to wrap the
entire `callable` in parentheses and evaluate it to get the function object
directly, or use a more flexible way to define the entry point rather than
relying on regex parsing.
##########
sdks/python/apache_beam/yaml/yaml_mapping.py:
##########
@@ -178,18 +178,38 @@ def _check_mapping_arguments(
raise ValueError(f'{transform_name} cannot specify "name" without "path"')
-# js2py's JsObjectWrapper object has a self-referencing __dict__ property
-# that cannot be pickled without implementing the __getstate__ and
-# __setstate__ methods.
-class _CustomJsObjectWrapper(JsObjectWrapper):
- def __init__(self, js_obj):
- super().__init__(js_obj.__dict__['_obj'])
+_THREAD_LOCAL_JS_CACHE = threading.local()
- def __getstate__(self):
- return self.__dict__.copy()
- def __setstate__(self, state):
- self.__dict__.update(state)
+class _JsWrapper:
+ def __init__(self, source_code, entrypoint_name):
+ self.source_code = source_code
+ self.entrypoint_name = entrypoint_name
+
+ def _get_wrapper_fn(self):
+ cache = _THREAD_LOCAL_JS_CACHE
+ if not hasattr(cache, 'functions'):
+ cache.functions = {}
+
+ cache_key = (self.source_code, self.entrypoint_name)
+ if cache_key not in cache.functions:
+ ctx = quickjs.Context()
+ ctx.eval(self.source_code)
+ cache.functions[cache_key] = ctx.get(self.entrypoint_name)
+
+ return cache.functions[cache_key]
+
+ def __call__(self, row):
+ wrapper_fn = self._get_wrapper_fn()
+ row_as_dict = py_value_to_js_dict(row)
+ try:
+ js_result = wrapper_fn(json.dumps(row_as_dict))
Review Comment:

Using `json.dumps` to pass the row to JavaScript will cause a `TypeError` if
the row contains non-serializable types such as `datetime.datetime`, `bytes`,
or `Decimal`, which are common in Beam pipelines. This is a regression from the
previous `js2py` implementation which handled these types via its internal
mapping. Furthermore, the double JSON serialization/deserialization roundtrip
for every row introduces significant performance overhead. Since `quickjs-ng`
supports direct conversion of Python dictionaries to JavaScript objects, it is
recommended to pass the dictionary directly to the function and avoid the JSON
roundtrip.
##########
sdks/python/apache_beam/yaml/yaml_mapping.py:
##########
@@ -210,80 +230,54 @@ def py_value_to_js_dict(py_value):
def _expand_javascript_mapping_func(
original_fields, expression=None, callable=None, path=None, name=None):
- # Check for installed js2py package
- if js2py is None:
+ if quickjs is None:
raise ValueError(
- "Javascript mapping functions are not supported on"
- " Python 3.12 or later.")
-
- # import remaining js2py objects
- from js2py import base
- from js2py.constructors import jsdate
- from js2py.internals import simplex
-
- js_array_type = (
- base.PyJsArray,
- base.PyJsArrayBuffer,
- base.PyJsInt8Array,
- base.PyJsUint8Array,
- base.PyJsUint8ClampedArray,
- base.PyJsInt16Array,
- base.PyJsUint16Array,
- base.PyJsInt32Array,
- base.PyJsUint32Array,
- base.PyJsFloat32Array,
- base.PyJsFloat64Array)
-
- def _js_object_to_py_object(obj):
- if isinstance(obj, (base.PyJsNumber, base.PyJsString, base.PyJsBoolean)):
- return base.to_python(obj)
- elif isinstance(obj, js_array_type):
- return [_js_object_to_py_object(value) for value in obj.to_list()]
- elif isinstance(obj, jsdate.PyJsDate):
- return obj.to_utc_dt()
- elif isinstance(obj, (base.PyJsNull, base.PyJsUndefined)):
- return None
- elif isinstance(obj, base.PyJsError):
- raise RuntimeError(obj['message'])
- elif isinstance(obj, base.PyJsObject):
- return {
- key: _js_object_to_py_object(value['value'])
- for (key, value) in obj.own.items()
- }
- elif isinstance(obj, base.JsObjectWrapper):
- return _js_object_to_py_object(obj._obj)
-
- return obj
+ "Javascript mapping functions are not supported because the "
+ "quickjs-ng library is not installed.")
if expression:
- source = '\n'.join(['function(__row__) {'] + [
- f' {name} = __row__.{name}'
- for name in original_fields if name in expression
- ] + [' return (' + expression + ')'] + ['}'])
- js_func = _CustomJsObjectWrapper(js2py.eval_js(source))
+ unpacking_code = '\n'.join([
+ f' var {name} = __row__.{name};' for name in original_fields
+ if name in expression
+ ])
Review Comment:

The unpacking logic uses direct property access and variable assignment
which will fail if a field name is not a valid JavaScript identifier (e.g.,
contains spaces, dashes, or is a reserved keyword). Additionally, using `var`
for every field can lead to name collisions. Consider using bracket notation
for property access and only unpacking valid identifiers to prevent syntax
errors in the generated JavaScript.
```suggestion
unpacking_code = '\n'.join([
f" var {name} = __row__['{name}'];" for name in original_fields
if name in expression and name.isidentifier()
])
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]