comphead commented on code in PR #3290:
URL: https://github.com/apache/datafusion-comet/pull/3290#discussion_r2732955677
##########
native/core/benches/row_columnar.rs:
##########
@@ -15,90 +15,822 @@
// specific language governing permissions and limitations
// under the License.
-use arrow::datatypes::DataType as ArrowDataType;
+//! Benchmarks for JVM shuffle row-to-columnar conversion.
+//!
+//! This benchmark measures the performance of converting Spark UnsafeRow
+//! to Arrow arrays via `process_sorted_row_partition()`, which is called
+//! by JVM shuffle (CometColumnarShuffle) when writing shuffle data.
+//!
+//! Covers:
+//! - Primitive types (Int64)
+//! - Struct (flat, nested, deeply nested)
+//! - List
+//! - Map
+
+use arrow::datatypes::{DataType, Field, Fields};
use comet::execution::shuffle::row::{
process_sorted_row_partition, SparkUnsafeObject, SparkUnsafeRow,
};
use comet::execution::shuffle::CompressionCodec;
-use criterion::{criterion_group, criterion_main, Criterion};
+use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
+use std::sync::Arc;
use tempfile::Builder;
-const NUM_ROWS: usize = 10000;
const BATCH_SIZE: usize = 5000;
-const NUM_COLS: usize = 100;
-const ROW_SIZE: usize = SparkUnsafeRow::get_row_bitset_width(NUM_COLS) +
NUM_COLS * 8;
-fn benchmark(c: &mut Criterion) {
- let mut group = c.benchmark_group("row_array_conversion");
+/// Create a struct schema with the given number of int64 fields.
+fn make_struct_schema(num_fields: usize) -> DataType {
+ let fields: Vec<Field> = (0..num_fields)
+ .map(|i| Field::new(format!("f{}", i), DataType::Int64, true))
+ .collect();
+ DataType::Struct(Fields::from(fields))
+}
- group.bench_function("row_to_array", |b| {
- let spark_rows = (0..NUM_ROWS)
- .map(|_| {
- let mut spark_row =
SparkUnsafeRow::new_with_num_fields(NUM_COLS);
- let mut row = Row::new();
+/// Calculate the row size for a struct with the given number of fields.
+/// UnsafeRow layout: [null bits] [fixed-length values]
+/// For struct: the struct value is stored as offset+size (8 bytes) pointing
to nested row
+fn get_row_size(num_struct_fields: usize) -> usize {
+ // Top-level row has 1 column (the struct)
+ let top_level_bitset_width = SparkUnsafeRow::get_row_bitset_width(1);
+ // Struct pointer (offset + size) is 8 bytes
+ let struct_pointer_size = 8;
+ // Nested struct row
+ let nested_bitset_width =
SparkUnsafeRow::get_row_bitset_width(num_struct_fields);
+ let nested_data_size = num_struct_fields * 8; // int64 values
- for i in
SparkUnsafeRow::get_row_bitset_width(NUM_COLS)..ROW_SIZE {
- row.data[i] = i as u8;
- }
+ top_level_bitset_width + struct_pointer_size + nested_bitset_width +
nested_data_size
+}
+
+struct RowData {
+ data: Vec<u8>,
+}
+
+impl RowData {
+ fn new(num_struct_fields: usize) -> Self {
+ let row_size = get_row_size(num_struct_fields);
+ let mut data = vec![0u8; row_size];
+
+ // Top-level row layout:
+ // [null bits for 1 field] [struct pointer (offset, size)]
+ let top_level_bitset_width = SparkUnsafeRow::get_row_bitset_width(1);
+
+ // Nested struct starts after top-level row header + pointer
+ let nested_offset = top_level_bitset_width + 8;
+ let nested_bitset_width =
SparkUnsafeRow::get_row_bitset_width(num_struct_fields);
+ let nested_size = nested_bitset_width + num_struct_fields * 8;
+
+ // Write struct pointer (offset in upper 32 bits, size in lower 32
bits)
+ let offset_and_size = ((nested_offset as i64) << 32) | (nested_size as
i64);
+ data[top_level_bitset_width..top_level_bitset_width + 8]
+ .copy_from_slice(&offset_and_size.to_le_bytes());
+
+ // Fill nested struct with some data
+ for i in 0..num_struct_fields {
+ let value_offset = nested_offset + nested_bitset_width + i * 8;
+ let value = (i as i64) * 100;
Review Comment:
what is 100 here? is aligning?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]