(datafusion) branch main updated: fix partition-by panic (#12297)

dheres Tue, 03 Sep 2024 00:36:05 -0700

This is an automated email from the ASF dual-hosted git repository.

dheres pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git



The following commit(s) were added to refs/heads/main by this push:
     new 4e1b6de554 fix partition-by panic (#12297)
4e1b6de554 is described below

commit 4e1b6de5549cc7ed399766dd22cafd450701db16
Author: iamthinh <[email protected]>
AuthorDate: Tue Sep 3 00:35:56 2024 -0700

    fix partition-by panic (#12297)
---
 datafusion/physical-plan/src/coalesce/mod.rs       | 15 +++++++++++++--
 datafusion/physical-plan/src/repartition/mod.rs    | 12 +++++++++---
 datafusion/sqllogictest/test_files/repartition.slt | 20 ++++++++++++++++++++
 3 files changed, 42 insertions(+), 5 deletions(-)

diff --git a/datafusion/physical-plan/src/coalesce/mod.rs b/datafusion/physical-plan/src/coalesce/mod.rs
index 5befa5ecda..ce5a1e53ab 100644
--- a/datafusion/physical-plan/src/coalesce/mod.rs
+++ b/datafusion/physical-plan/src/coalesce/mod.rs
@@ -18,7 +18,7 @@
 use arrow::compute::concat_batches;
 use arrow_array::builder::StringViewBuilder;
 use arrow_array::cast::AsArray;
-use arrow_array::{Array, ArrayRef, RecordBatch};
+use arrow_array::{Array, ArrayRef, RecordBatch, RecordBatchOptions};
 use arrow_schema::SchemaRef;
 use std::sync::Arc;
 
@@ -265,7 +265,9 @@ fn gc_string_view_batch(batch: &RecordBatch) -> RecordBatch {
             }
         })
         .collect();
-    RecordBatch::try_new(batch.schema(), new_columns)
+    let mut options = RecordBatchOptions::new();
+    options = options.with_row_count(Some(batch.num_rows()));
+    RecordBatch::try_new_with_options(batch.schema(), new_columns, &options)
         .expect("Failed to re-create the gc'ed record batch")
 }
 
@@ -501,6 +503,15 @@ mod tests {
         assert_eq!(array.data_buffers().len(), gc_array.data_buffers().len()); // no compaction
     }
 
+    #[test]
+    fn test_gc_string_view_test_batch_empty() {
+        let schema = Schema::empty();
+        let batch = RecordBatch::new_empty(schema.into());
+        let output_batch = gc_string_view_batch(&batch);
+        assert_eq!(batch.num_columns(), output_batch.num_columns());
+        assert_eq!(batch.num_rows(), output_batch.num_rows());
+    }
+
     #[test]
     fn test_gc_string_view_batch_large_no_compact() {
         // view with large strings (has buffers) but full --> no need to compact
diff --git a/datafusion/physical-plan/src/repartition/mod.rs b/datafusion/physical-plan/src/repartition/mod.rs
index 47e5192c23..093803e3c8 100644
--- a/datafusion/physical-plan/src/repartition/mod.rs
+++ b/datafusion/physical-plan/src/repartition/mod.rs
@@ -41,7 +41,7 @@ use crate::{DisplayFormatType, ExecutionPlan, Partitioning, PlanProperties, Stat
 use arrow::array::ArrayRef;
 use arrow::datatypes::{SchemaRef, UInt64Type};
 use arrow::record_batch::RecordBatch;
-use arrow_array::PrimitiveArray;
+use arrow_array::{PrimitiveArray, RecordBatchOptions};
 use datafusion_common::utils::transpose;
 use datafusion_common::{arrow_datafusion_err, not_impl_err, DataFusionError, Result};
 use datafusion_common_runtime::SpawnedTask;
@@ -309,8 +309,14 @@ impl BatchPartitioner {
                                 })
                                 .collect::<Result<Vec<ArrayRef>>>()?;
 
-                            let batch =
-                                RecordBatch::try_new(batch.schema(), columns).unwrap();
+                            let mut options = RecordBatchOptions::new();
+                            options = options.with_row_count(Some(indices.len()));
+                            let batch = RecordBatch::try_new_with_options(
+                                batch.schema(),
+                                columns,
+                                &options,
+                            )
+                            .unwrap();
 
                             Ok((partition, batch))
                         });
diff --git a/datafusion/sqllogictest/test_files/repartition.slt b/datafusion/sqllogictest/test_files/repartition.slt
index e3c204a4f4..2d59ad2b5b 100644
--- a/datafusion/sqllogictest/test_files/repartition.slt
+++ b/datafusion/sqllogictest/test_files/repartition.slt
@@ -127,3 +127,23 @@ physical_plan
 04)------FilterExec: c3@2 > 0
 05)--------RepartitionExec: partitioning=RoundRobinBatch(3), input_partitions=1
 06)----------StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true
+
+# Start repartition on empty column test.
+# See https://github.com/apache/datafusion/issues/12057
+
+statement ok
+CREATE TABLE t1(v1 int);
+
+statement ok
+INSERT INTO t1 values(42);
+
+query I
+SELECT sum(1) OVER (PARTITION BY false=false) 
+FROM t1 WHERE ((false > (v1 = v1)) IS DISTINCT FROM true);
+----
+1
+
+statement ok
+DROP TABLE t1;
+
+# End repartition on empty columns test
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(datafusion) branch main updated: fix partition-by panic (#12297)

Reply via email to