Re: [PR] chore: Add PySpark-based benchmarks, starting with ETL example [datafusion-comet]

via GitHub Mon, 12 Jan 2026 10:36:59 -0800


hsiang-c commented on code in PR #3065:
URL: https://github.com/apache/datafusion-comet/pull/3065#discussion_r2683441442



##########
benchmarks/pyspark/generate_data.py:
##########
@@ -0,0 +1,446 @@
+#!/usr/bin/env python3
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+Generate test data for shuffle size comparison benchmark.
+
+This script generates a parquet dataset with a realistic schema (100 columns
+including deeply nested structs, arrays, and maps) for benchmarking shuffle
+operations across Spark, Comet JVM, and Comet Native shuffle modes.
+"""
+
+import argparse
+from pyspark.sql import SparkSession
+from pyspark.sql import functions as F
+from pyspark.sql.types import (
+    StructType, StructField, IntegerType, LongType, DoubleType,
+    StringType, BooleanType, DateType, TimestampType, ArrayType,
+    MapType, DecimalType
+)
+
+
+def generate_data(output_path: str, num_rows: int, num_partitions: int):
+    """Generate test data with realistic schema and write to parquet."""
+
+    spark = SparkSession.builder \
+        .appName("ShuffleBenchmark-DataGen") \
+        .getOrCreate()
+
+    print(f"Generating {num_rows:,} rows with {num_partitions} partitions")
+    print(f"Output path: {output_path}")
+    print("Schema: 100 columns including deeply nested structs, arrays, and 
maps")
+
+    # Start with a range and build up the columns
+    df = spark.range(0, num_rows, numPartitions=num_partitions)
+
+    # Add columns using selectExpr for better performance
+    df = df.selectExpr(
+        # Key columns for grouping/partitioning (1-3)
+        "cast(id % 1000 as int) as partition_key",
+        "cast(id % 100 as int) as group_key",
+        "id as row_id",
+
+        # Integer columns (4-15)
+        "cast(id % 10000 as int) as category_id",
+        "cast(id % 500 as int) as region_id",
+        "cast(id % 50 as int) as department_id",
+        "cast((id * 7) % 1000000 as int) as customer_id",
+        "cast((id * 13) % 100000 as int) as product_id",
+        "cast(id % 12 + 1 as int) as month",
+        "cast(id % 28 + 1 as int) as day",
+        "cast(2020 + (id % 5) as int) as year",
+        "cast((id * 17) % 256 as int) as priority",
+        "cast((id * 19) % 1000 as int) as rank",
+        "cast((id * 23) % 10000 as int) as score_int",
+        "cast((id * 29) % 500 as int) as level",
+
+        # Long columns (16-22)
+        "id * 1000 as transaction_id",
+        "(id * 17) % 10000000000 as account_number",
+        "(id * 31) % 1000000000 as reference_id",
+        "(id * 37) % 10000000000 as external_id",
+        "(id * 41) % 1000000000 as correlation_id",
+        "(id * 43) % 10000000000 as trace_id",
+        "(id * 47) % 1000000000 as span_id",
+
+        # Double columns (23-35)
+        "cast(id % 10000 as double) / 100.0 as amount",
+        "cast((id * 3) % 10000 as double) / 100.0 as price",
+        "cast(id % 100 as double) / 100.0 as discount",
+        "cast((id * 7) % 500 as double) / 10.0 as weight",
+        "cast((id * 11) % 1000 as double) / 10.0 as height",
+        "cast(id % 360 as double) as latitude",
+        "cast((id * 2) % 360 as double) as longitude",
+        "cast((id * 13) % 10000 as double) / 1000.0 as rate",
+        "cast((id * 17) % 100 as double) / 100.0 as percentage",
+        "cast((id * 19) % 1000 as double) as velocity",
+        "cast((id * 23) % 500 as double) / 10.0 as acceleration",
+        "cast((id * 29) % 10000 as double) / 100.0 as temperature",
+        "cast((id * 31) % 1000 as double) / 10.0 as pressure",
+
+        # String columns (36-50)
+        "concat('user_', cast(id % 100000 as string)) as user_name",
+        "concat('email_', cast(id % 50000 as string), '@example.com') as 
email",
+        "concat('SKU-', lpad(cast(id % 10000 as string), 6, '0')) as sku",
+        "concat('ORD-', cast(id as string)) as order_id",
+        "array('pending', 'processing', 'shipped', 'delivered', 
'cancelled')[cast(id % 5 as int)] as status",
+        "array('USD', 'EUR', 'GBP', 'JPY', 'CAD')[cast(id % 5 as int)] as 
currency",
+        "concat('Description for item ', cast(id % 1000 as string), ' with 
additional details') as description",
+        "concat('REF-', lpad(cast(id % 100000 as string), 8, '0')) as 
reference_code",
+        "concat('TXN-', cast(id as string), '-', cast(id % 1000 as string)) as 
transaction_code",
+        "array('A', 'B', 'C', 'D', 'E')[cast(id % 5 as int)] as grade",
+        "concat('Note: Record ', cast(id as string), ' processed 
successfully') as notes",
+        "concat('Session-', lpad(cast(id % 10000 as string), 6, '0')) as 
session_id",
+        "concat('Device-', cast(id % 1000 as string)) as device_id",
+        "array('chrome', 'firefox', 'safari', 'edge')[cast(id % 4 as int)] as 
browser",
+        "array('windows', 'macos', 'linux', 'ios', 'android')[cast(id % 5 as 
int)] as os",
+
+        # Boolean columns (51-56)
+        "id % 2 = 0 as is_active",
+        "id % 3 = 0 as is_verified",
+        "id % 7 = 0 as is_premium",
+        "id % 5 = 0 as is_deleted",
+        "id % 11 = 0 as is_featured",
+        "id % 13 = 0 as is_archived",
+
+        # Date and timestamp columns (57-60)
+        "date_add(to_date('2020-01-01'), cast(id % 1500 as int)) as 
created_date",
+        "date_add(to_date('2020-01-01'), cast((id + 30) % 1500 as int)) as 
updated_date",
+        "date_add(to_date('2020-01-01'), cast((id + 60) % 1500 as int)) as 
expires_date",
+        "to_timestamp(concat('2020-01-01 ', lpad(cast(id % 24 as string), 2, 
'0'), ':00:00')) as created_at",
+
+        # Simple arrays (61-65)
+        "array(cast(id % 100 as int), cast((id + 1) % 100 as int), cast((id + 
2) % 100 as int), cast((id + 3) % 100 as int), cast((id + 4) % 100 as int)) as 
tag_ids",
+        "array(cast(id % 1000 as double) / 10.0, cast((id * 2) % 1000 as 
double) / 10.0, cast((id * 3) % 1000 as double) / 10.0) as scores",
+        "array(concat('tag_', cast(id % 20 as string)), concat('tag_', 
cast((id + 5) % 20 as string)), concat('tag_', cast((id + 10) % 20 as string))) 
as tags",
+        "array(id % 2 = 0, id % 3 = 0, id % 5 = 0, id % 7 = 0) as flag_array",
+        "array(id * 1000, id * 2000, id * 3000) as long_array",
+
+        # Simple maps (66-68)
+        "map('key1', cast(id % 100 as string), 'key2', cast((id * 2) % 100 as 
string), 'key3', cast((id * 3) % 100 as string)) as str_attributes",
+        "map('score1', cast(id % 100 as double), 'score2', cast((id * 2) % 100 
as double)) as double_attributes",
+        "map(cast(id % 10 as int), concat('val_', cast(id % 100 as string)), 
cast((id + 1) % 10 as int), concat('val_', cast((id + 1) % 100 as string))) as 
int_key_map",
+
+        # Level 2 nested struct: address with nested geo (69)
+        "named_struct("
+        "  'street', concat(cast(id % 9999 as string), ' Main St'),"
+        "  'city', array('New York', 'Los Angeles', 'Chicago', 'Houston', 
'Phoenix')[cast(id % 5 as int)],"
+        "  'state', array('NY', 'CA', 'IL', 'TX', 'AZ')[cast(id % 5 as int)],"
+        "  'zip', lpad(cast(id % 99999 as string), 5, '0'),"
+        "  'country', 'USA',"
+        "  'geo', named_struct("
+        "    'lat', cast(id % 180 as double) - 90.0,"
+        "    'lng', cast(id % 360 as double) - 180.0,"

Review Comment:
   👍 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] chore: Add PySpark-based benchmarks, starting with ETL example [datafusion-comet]

Reply via email to