gemini-code-assist[bot] commented on code in PR #38917: URL: https://github.com/apache/beam/pull/38917#discussion_r3394011179
########## sdks/python/apache_beam/examples/ml_transform/mltransform_image_embedding.py: ########## @@ -0,0 +1,263 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +"""Batch image embedding pipeline using MLTransform. + +The pipeline reads image URIs from a text file, decodes images with Pillow, +generates SentenceTransformers image embeddings through MLTransform, and writes +results to BigQuery using batch file loads. +""" + +import argparse +import hashlib +import io +import logging +import time +from collections.abc import Iterable +from typing import Any + +import apache_beam as beam +from apache_beam.io.filesystems import FileSystems +from apache_beam.ml.transforms.base import MLTransform +from apache_beam.ml.transforms.embeddings.huggingface import SentenceTransformerEmbeddings +from apache_beam.options.pipeline_options import PipelineOptions +from apache_beam.options.pipeline_options import SetupOptions +from apache_beam.options.pipeline_options import StandardOptions +from apache_beam.runners.runner import PipelineResult +from PIL import Image + +IMAGE_COLUMN = 'image' +IMAGE_ID_COLUMN = 'image_id' +IMAGE_URI_COLUMN = 'image_uri' + +DEFAULT_IMAGE_MODEL_NAME = 'clip-ViT-B-32' +DEFAULT_ACCELERATOR = 'type:nvidia-tesla-t4;count:1;install-nvidia-driver' +DEFAULT_EMBEDDING_MIN_RAM = '16GB' + +OUTPUT_TABLE_SCHEMA = { + 'fields': [ + { + 'name': 'image_id', 'type': 'STRING' + }, + { + 'name': 'image_uri', 'type': 'STRING' + }, + { + 'name': 'model_name', 'type': 'STRING' + }, + { + 'name': 'embedding', 'type': 'FLOAT64', 'mode': 'REPEATED' + }, + { + 'name': 'embedding_dim', 'type': 'INT64' + }, + { + 'name': 'infer_ms', 'type': 'INT64' + }, + ] +} + + +def now_millis() -> int: + return int(time.time() * 1000) + + +def sha1_hex(value: str) -> str: + return hashlib.sha1(value.encode('utf-8')).hexdigest() + + +def filter_empty_uri(uri: str) -> Iterable[str]: + uri = uri.strip() + if uri: + yield uri + + +def load_image_from_uri(uri: str) -> bytes: + with FileSystems.open(uri) as file: + return file.read() + + +def decode_pil(image_bytes: bytes) -> Image.Image: + with Image.open(io.BytesIO(image_bytes)) as image: + image = image.convert('RGB') + image.load() + return image + + +class ReadImage(beam.DoFn): + def process(self, uri: str) -> Iterable[dict[str, Any]]: + image_id = sha1_hex(uri) + try: + yield { + IMAGE_ID_COLUMN: image_id, + IMAGE_URI_COLUMN: uri, + IMAGE_COLUMN: decode_pil(load_image_from_uri(uri)), + } + except Exception as exc: + logging.warning( + 'Failed to read or decode image %s (%s): %s', image_id, uri, exc) + + +def _as_dict(row: Any) -> dict[str, Any]: + if hasattr(row, 'as_dict'): + return row.as_dict() + if hasattr(row, '_asdict'): + return row._asdict() + return dict(row) + + +def embedding_to_list(value: Any) -> list[float]: + if hasattr(value, 'tolist'): + value = value.tolist() + return [float(item) for item in value] + + +class FormatImageEmbeddingOutput(beam.DoFn): + def __init__(self, model_name: str): + self.model_name = model_name + + def process(self, row: Any) -> Iterable[dict[str, Any]]: + row = _as_dict(row) + embedding = embedding_to_list(row[IMAGE_COLUMN]) + yield { + IMAGE_ID_COLUMN: row[IMAGE_ID_COLUMN], + IMAGE_URI_COLUMN: row[IMAGE_URI_COLUMN], + 'model_name': self.model_name, + 'embedding': embedding, + 'embedding_dim': len(embedding), + 'infer_ms': now_millis(), + } + + +def _str_to_bool(value: str) -> bool: + if value.lower() == 'true': + return True + if value.lower() == 'false': + return False + raise argparse.ArgumentTypeError( + f'"true" or "false" expected, got "{value}" instead.') + + +def parse_known_args(argv): + parser = argparse.ArgumentParser() + parser.add_argument('--mode', default='batch', choices=['batch']) + parser.add_argument( + '--input', required=True, help='Path to a text file with image URIs.') + parser.add_argument( + '--output_table', required=True, help='BigQuery table for embeddings.') + parser.add_argument( + '--publish_to_big_query', + type=_str_to_bool, + default=True, + help='Whether to write embedding rows to BigQuery.') + parser.add_argument( + '--artifact_location', + required=True, + help='Path where MLTransform artifacts are written.') + parser.add_argument( + '--pretrained_model_name', + default=DEFAULT_IMAGE_MODEL_NAME, + help='SentenceTransformers image model name.') + parser.add_argument( + '--device', + default='CPU', + choices=['CPU', 'GPU'], + help='Device used by SentenceTransformers on the worker.') + parser.add_argument( + '--min_batch_size', + type=int, + default=8, + help='Minimum Beam inference batch size.') + parser.add_argument( + '--max_batch_size', + type=int, + default=64, + help='Maximum Beam inference batch size.') + parser.add_argument( + '--embedding_accelerator', + default=DEFAULT_ACCELERATOR, + help='GPU accelerator resource hint for the MLTransform embedding step.') + parser.add_argument( + '--embedding_min_ram', + default=DEFAULT_EMBEDDING_MIN_RAM, + help='CPU right-fitting min RAM resource hint for the embedding step.') + return parser.parse_known_args(argv) + + +def run( + argv=None, save_main_session=True, test_pipeline=None) -> PipelineResult: + known_args, pipeline_args = parse_known_args(argv) + pipeline_options = PipelineOptions(pipeline_args) + pipeline_options.view_as(SetupOptions).save_main_session = save_main_session + pipeline_options.view_as(StandardOptions).streaming = False + + device = 'cuda' if known_args.device == 'GPU' else 'cpu' + embedding_transform = SentenceTransformerEmbeddings( + model_name=known_args.pretrained_model_name, + columns=[IMAGE_COLUMN], + image_model=True, + min_batch_size=known_args.min_batch_size, + max_batch_size=known_args.max_batch_size, + load_model_args={'device': device}, + inference_args={ + 'convert_to_numpy': True, + 'show_progress_bar': False, + }) + + ml_transform = MLTransform( + write_artifact_location=known_args.artifact_location).with_transform( + embedding_transform) + if known_args.device == 'GPU' and known_args.embedding_accelerator: + ml_transform = ml_transform.with_resource_hints( + accelerator=known_args.embedding_accelerator) + elif known_args.embedding_min_ram: + ml_transform = ml_transform.with_resource_hints( + min_ram=known_args.embedding_min_ram) + + pipeline = test_pipeline or beam.Pipeline(options=pipeline_options) + rows = ( + pipeline + | 'ReadImageURIs' >> beam.io.ReadFromText(known_args.input) + | 'FilterEmptyURIs' >> beam.FlatMap(filter_empty_uri) + | 'ReadImages' >> beam.ParDo(ReadImage()) + | 'ReshuffleBeforeEmbedding' >> beam.Reshuffle()) Review Comment:  Reshuffling the elements *after* decoding the images into `PIL.Image.Image` objects is highly inefficient. Uncompressed image objects are extremely large compared to their URI strings or compressed bytes. Serializing and transferring these uncompressed images across the network during a reshuffle can cause severe performance bottlenecks and OutOfMemory (OOM) errors on large datasets.\n\nInstead, perform the reshuffle on the image URIs *before* reading and decoding them. This achieves the same load balancing and fusion prevention benefits with virtually zero serialization overhead. ```suggestion rows = ( pipeline | 'ReadImageURIs' >> beam.io.ReadFromText(known_args.input) | 'FilterEmptyURIs' >> beam.FlatMap(filter_empty_uri) | 'ReshuffleBeforeEmbedding' >> beam.Reshuffle() | 'ReadImages' >> beam.ParDo(ReadImage())) ``` ########## sdks/python/apache_beam/examples/ml_transform/mltransform_image_embedding.py: ########## @@ -0,0 +1,263 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +"""Batch image embedding pipeline using MLTransform. + +The pipeline reads image URIs from a text file, decodes images with Pillow, +generates SentenceTransformers image embeddings through MLTransform, and writes +results to BigQuery using batch file loads. +""" + +import argparse +import hashlib +import io +import logging +import time +from collections.abc import Iterable +from typing import Any + +import apache_beam as beam +from apache_beam.io.filesystems import FileSystems +from apache_beam.ml.transforms.base import MLTransform +from apache_beam.ml.transforms.embeddings.huggingface import SentenceTransformerEmbeddings +from apache_beam.options.pipeline_options import PipelineOptions +from apache_beam.options.pipeline_options import SetupOptions +from apache_beam.options.pipeline_options import StandardOptions +from apache_beam.runners.runner import PipelineResult +from PIL import Image + +IMAGE_COLUMN = 'image' +IMAGE_ID_COLUMN = 'image_id' +IMAGE_URI_COLUMN = 'image_uri' + +DEFAULT_IMAGE_MODEL_NAME = 'clip-ViT-B-32' +DEFAULT_ACCELERATOR = 'type:nvidia-tesla-t4;count:1;install-nvidia-driver' +DEFAULT_EMBEDDING_MIN_RAM = '16GB' + +OUTPUT_TABLE_SCHEMA = { + 'fields': [ + { + 'name': 'image_id', 'type': 'STRING' + }, + { + 'name': 'image_uri', 'type': 'STRING' + }, + { + 'name': 'model_name', 'type': 'STRING' + }, + { + 'name': 'embedding', 'type': 'FLOAT64', 'mode': 'REPEATED' + }, + { + 'name': 'embedding_dim', 'type': 'INT64' + }, + { + 'name': 'infer_ms', 'type': 'INT64' + }, + ] +} + + +def now_millis() -> int: + return int(time.time() * 1000) + + +def sha1_hex(value: str) -> str: + return hashlib.sha1(value.encode('utf-8')).hexdigest() + + +def filter_empty_uri(uri: str) -> Iterable[str]: + uri = uri.strip() + if uri: + yield uri + + +def load_image_from_uri(uri: str) -> bytes: + with FileSystems.open(uri) as file: + return file.read() + + +def decode_pil(image_bytes: bytes) -> Image.Image: + with Image.open(io.BytesIO(image_bytes)) as image: + image = image.convert('RGB') + image.load() + return image + + +class ReadImage(beam.DoFn): + def process(self, uri: str) -> Iterable[dict[str, Any]]: + image_id = sha1_hex(uri) + try: + yield { + IMAGE_ID_COLUMN: image_id, + IMAGE_URI_COLUMN: uri, + IMAGE_COLUMN: decode_pil(load_image_from_uri(uri)), + } + except Exception as exc: + logging.warning( + 'Failed to read or decode image %s (%s): %s', image_id, uri, exc) + + +def _as_dict(row: Any) -> dict[str, Any]: + if hasattr(row, 'as_dict'): + return row.as_dict() + if hasattr(row, '_asdict'): + return row._asdict() + return dict(row) + + +def embedding_to_list(value: Any) -> list[float]: + if hasattr(value, 'tolist'): + value = value.tolist() + return [float(item) for item in value] Review Comment:  For numpy arrays (which are the common output format for embeddings), calling `.tolist()` already recursively converts all elements to standard Python float/int types. Running a list comprehension `[float(item) for item in value]` over the resulting list adds unnecessary overhead, especially for high-dimensional embedding vectors. Returning early when `tolist` is available avoids this extra pass. ```suggestion def embedding_to_list(value: Any) -> list[float]: if hasattr(value, 'tolist'): return value.tolist() return [float(item) for item in value] ``` ########## sdks/python/apache_beam/ml/transforms/mltransform_embedding_tests_requirements.txt: ########## @@ -0,0 +1,20 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +sentence-transformers~=5.0.0 Review Comment:  The version constraint `sentence-transformers~=5.0.0` is invalid because `sentence-transformers` has not reached version 5.0.0 yet (the current major version is 3.x). This will cause dependency installation to fail when running the benchmarks. Please update this to a valid version constraint, such as `>=3.0.0`. ``` sentence-transformers>=3.0.0 ``` ########## sdks/python/apache_beam/examples/ml_transform/mltransform_image_embedding.py: ########## @@ -0,0 +1,263 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +"""Batch image embedding pipeline using MLTransform. + +The pipeline reads image URIs from a text file, decodes images with Pillow, +generates SentenceTransformers image embeddings through MLTransform, and writes +results to BigQuery using batch file loads. +""" + +import argparse +import hashlib +import io +import logging +import time +from collections.abc import Iterable +from typing import Any + +import apache_beam as beam +from apache_beam.io.filesystems import FileSystems +from apache_beam.ml.transforms.base import MLTransform +from apache_beam.ml.transforms.embeddings.huggingface import SentenceTransformerEmbeddings +from apache_beam.options.pipeline_options import PipelineOptions +from apache_beam.options.pipeline_options import SetupOptions +from apache_beam.options.pipeline_options import StandardOptions +from apache_beam.runners.runner import PipelineResult +from PIL import Image + +IMAGE_COLUMN = 'image' +IMAGE_ID_COLUMN = 'image_id' +IMAGE_URI_COLUMN = 'image_uri' + +DEFAULT_IMAGE_MODEL_NAME = 'clip-ViT-B-32' +DEFAULT_ACCELERATOR = 'type:nvidia-tesla-t4;count:1;install-nvidia-driver' +DEFAULT_EMBEDDING_MIN_RAM = '16GB' + +OUTPUT_TABLE_SCHEMA = { + 'fields': [ + { + 'name': 'image_id', 'type': 'STRING' + }, + { + 'name': 'image_uri', 'type': 'STRING' + }, + { + 'name': 'model_name', 'type': 'STRING' + }, + { + 'name': 'embedding', 'type': 'FLOAT64', 'mode': 'REPEATED' + }, + { + 'name': 'embedding_dim', 'type': 'INT64' + }, + { + 'name': 'infer_ms', 'type': 'INT64' + }, + ] +} + + +def now_millis() -> int: + return int(time.time() * 1000) + + +def sha1_hex(value: str) -> str: + return hashlib.sha1(value.encode('utf-8')).hexdigest() + + +def filter_empty_uri(uri: str) -> Iterable[str]: + uri = uri.strip() + if uri: + yield uri + + +def load_image_from_uri(uri: str) -> bytes: + with FileSystems.open(uri) as file: + return file.read() + + +def decode_pil(image_bytes: bytes) -> Image.Image: + with Image.open(io.BytesIO(image_bytes)) as image: + image = image.convert('RGB') + image.load() + return image + + +class ReadImage(beam.DoFn): + def process(self, uri: str) -> Iterable[dict[str, Any]]: + image_id = sha1_hex(uri) + try: + yield { + IMAGE_ID_COLUMN: image_id, + IMAGE_URI_COLUMN: uri, + IMAGE_COLUMN: decode_pil(load_image_from_uri(uri)), + } + except Exception as exc: + logging.warning( + 'Failed to read or decode image %s (%s): %s', image_id, uri, exc) + + +def _as_dict(row: Any) -> dict[str, Any]: + if hasattr(row, 'as_dict'): + return row.as_dict() + if hasattr(row, '_asdict'): + return row._asdict() + return dict(row) + + +def embedding_to_list(value: Any) -> list[float]: + if hasattr(value, 'tolist'): + value = value.tolist() + return [float(item) for item in value] + + +class FormatImageEmbeddingOutput(beam.DoFn): + def __init__(self, model_name: str): + self.model_name = model_name + + def process(self, row: Any) -> Iterable[dict[str, Any]]: + row = _as_dict(row) + embedding = embedding_to_list(row[IMAGE_COLUMN]) + yield { + IMAGE_ID_COLUMN: row[IMAGE_ID_COLUMN], + IMAGE_URI_COLUMN: row[IMAGE_URI_COLUMN], + 'model_name': self.model_name, + 'embedding': embedding, + 'embedding_dim': len(embedding), + 'infer_ms': now_millis(), Review Comment:  The field name `infer_ms` suggests that it measures the duration of the inference step in milliseconds. However, it is populated with the current epoch timestamp in milliseconds (`now_millis()`). This is highly misleading. Consider renaming this field to `processed_ts` or `timestamp_ms` to accurately reflect that it is a timestamp, or document its meaning clearly. ########## sdks/python/apache_beam/examples/ml_transform/mltransform_text_embedding.py: ########## @@ -0,0 +1,193 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +"""Batch text embedding pipeline using MLTransform. + +The pipeline reads text lines, generates sentence-transformer embeddings through +MLTransform, and writes JSONL output to a sharded text sink. +""" + +import argparse +import hashlib +import json +import logging +from collections.abc import Iterable +from typing import Any + +import apache_beam as beam +from apache_beam.ml.transforms.base import MLTransform +from apache_beam.ml.transforms.embeddings.huggingface import SentenceTransformerEmbeddings +from apache_beam.options.pipeline_options import PipelineOptions +from apache_beam.options.pipeline_options import SetupOptions +from apache_beam.runners.runner import PipelineResult + +ID_COLUMN = 'id' +RAW_TEXT_COLUMN = 'raw_text' +TEXT_COLUMN = 'text' + +DEFAULT_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2' + + +def _str_to_bool(value: str) -> bool: + if value.lower() == 'true': + return True + if value.lower() == 'false': + return False + raise argparse.ArgumentTypeError( + f'"true" or "false" expected, got "{value}" instead.') + + +def parse_known_args(argv): + parser = argparse.ArgumentParser() + parser.add_argument( + '--input', required=True, help='Path to the input text file.') + parser.add_argument( + '--output', + required=True, + help='Output prefix for sharded JSONL embedding results.') + parser.add_argument( + '--artifact_location', + required=True, + help='Path where MLTransform artifacts are written.') + parser.add_argument( + '--model_name', + default=DEFAULT_MODEL_NAME, + help='SentenceTransformers model name.') + parser.add_argument( + '--min_batch_size', + type=int, + default=16, + help='Minimum Beam inference batch size.') + parser.add_argument( + '--max_batch_size', + type=int, + default=128, + help='Maximum Beam inference batch size.') + parser.add_argument( + '--model_batch_size', + type=int, + default=32, + help='Batch size passed to SentenceTransformer.encode.') + parser.add_argument( + '--device', + default='CPU', + choices=['CPU', 'GPU'], + help='Device used by SentenceTransformers on the worker.') + parser.add_argument( + '--large_model', + type=_str_to_bool, + default=False, + help='Whether RunInference should share the model across processes.') + return parser.parse_known_args(argv) + + +def text_to_record(line: str) -> Iterable[dict[str, str]]: + text = line.strip() + if not text: + return + + yield { + ID_COLUMN: hashlib.sha256(text.encode('utf-8')).hexdigest(), + RAW_TEXT_COLUMN: text, + TEXT_COLUMN: text, + } + + +def _as_dict(row: Any) -> dict[str, Any]: + if hasattr(row, 'as_dict'): + return row.as_dict() + if hasattr(row, '_asdict'): + return row._asdict() + return dict(row) + + +def normalize_json_value(value: Any) -> Any: + if hasattr(value, 'tolist'): + value = value.tolist() + elif hasattr(value, 'item'): + value = value.item() + + if isinstance(value, dict): + return {key: normalize_json_value(val) for key, val in value.items()} + if isinstance(value, (list, tuple)): + return [normalize_json_value(item) for item in value] + return value Review Comment:  Calling `normalize_json_value` recursively on every single element of a high-dimensional embedding vector (e.g., 384 or 768 dimensions) introduces significant CPU overhead. Since `numpy.ndarray.tolist()` already recursively converts all nested arrays and numpy scalars to standard Python types, we can return early when `tolist` is present, completely avoiding the element-by-element recursion. ```suggestion def normalize_json_value(value: Any) -> Any: if hasattr(value, 'tolist'): return value.tolist() if hasattr(value, 'item'): return value.item() if isinstance(value, dict): return {key: normalize_json_value(val) for key, val in value.items()} if isinstance(value, (list, tuple)): return [normalize_json_value(item) for item in value] return value ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
