This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 84ac4f9db0 Provide field and schema metadata missing on distinct
aggregations. (#12691)
84ac4f9db0 is described below
commit 84ac4f9db09a88a9d24c33b26fe968dc5d8e0786
Author: wiedld <[email protected]>
AuthorDate: Tue Oct 1 00:03:30 2024 -1000
Provide field and schema metadata missing on distinct aggregations. (#12691)
* test(12687): reproducer of missing metadata bug
* fix(12687): minimum change needed to fix the missing metadata
---
datafusion/physical-plan/src/aggregates/mod.rs | 25 ++++++++++------
datafusion/physical-plan/src/projection.rs | 2 +-
datafusion/sqllogictest/test_files/metadata.slt | 38 +++++++++++++++++++++++++
3 files changed, 55 insertions(+), 10 deletions(-)
diff --git a/datafusion/physical-plan/src/aggregates/mod.rs
b/datafusion/physical-plan/src/aggregates/mod.rs
index 2bdaed4796..9466ff6dd4 100644
--- a/datafusion/physical-plan/src/aggregates/mod.rs
+++ b/datafusion/physical-plan/src/aggregates/mod.rs
@@ -26,6 +26,7 @@ use crate::aggregates::{
topk_stream::GroupedTopKAggregateStream,
};
use crate::metrics::{ExecutionPlanMetricsSet, MetricsSet};
+use crate::projection::get_field_metadata;
use crate::windows::get_ordered_partition_by_indices;
use crate::{
DisplayFormatType, Distribution, ExecutionPlan, InputOrderMode,
@@ -795,14 +796,17 @@ fn create_schema(
) -> Result<Schema> {
let mut fields = Vec::with_capacity(group_expr.len() + aggr_expr.len());
for (index, (expr, name)) in group_expr.iter().enumerate() {
- fields.push(Field::new(
- name,
- expr.data_type(input_schema)?,
- // In cases where we have multiple grouping sets, we will use NULL
expressions in
- // order to align the grouping sets. So the field must be nullable
even if the underlying
- // schema field is not.
- group_expr_nullable[index] || expr.nullable(input_schema)?,
- ))
+ fields.push(
+ Field::new(
+ name,
+ expr.data_type(input_schema)?,
+ // In cases where we have multiple grouping sets, we will use
NULL expressions in
+ // order to align the grouping sets. So the field must be
nullable even if the underlying
+ // schema field is not.
+ group_expr_nullable[index] || expr.nullable(input_schema)?,
+ )
+ .with_metadata(get_field_metadata(expr,
input_schema).unwrap_or_default()),
+ )
}
match mode {
@@ -823,7 +827,10 @@ fn create_schema(
}
}
- Ok(Schema::new(fields))
+ Ok(Schema::new_with_metadata(
+ fields,
+ input_schema.metadata().clone(),
+ ))
}
fn group_schema(schema: &Schema, group_count: usize) -> SchemaRef {
diff --git a/datafusion/physical-plan/src/projection.rs
b/datafusion/physical-plan/src/projection.rs
index f1b9cdaf72..4c889d1fc8 100644
--- a/datafusion/physical-plan/src/projection.rs
+++ b/datafusion/physical-plan/src/projection.rs
@@ -237,7 +237,7 @@ impl ExecutionPlan for ProjectionExec {
/// If e is a direct column reference, returns the field level
/// metadata for that field, if any. Otherwise returns None
-fn get_field_metadata(
+pub(crate) fn get_field_metadata(
e: &Arc<dyn PhysicalExpr>,
input_schema: &Schema,
) -> Option<HashMap<String, String>> {
diff --git a/datafusion/sqllogictest/test_files/metadata.slt
b/datafusion/sqllogictest/test_files/metadata.slt
index 3b2b219244..f38281abc5 100644
--- a/datafusion/sqllogictest/test_files/metadata.slt
+++ b/datafusion/sqllogictest/test_files/metadata.slt
@@ -58,5 +58,43 @@ WHERE "data"."id" = "samples"."id";
1
3
+
+
+# Regression test: prevent field metadata loss per
https://github.com/apache/datafusion/issues/12687
+query I
+select count(distinct name) from table_with_metadata;
+----
+2
+
+# Regression test: prevent field metadata loss per
https://github.com/apache/datafusion/issues/12687
+query I
+select approx_median(distinct id) from table_with_metadata;
+----
+2
+
+# Regression test: prevent field metadata loss per
https://github.com/apache/datafusion/issues/12687
+statement ok
+select array_agg(distinct id) from table_with_metadata;
+
+query I
+select distinct id from table_with_metadata order by id;
+----
+1
+3
+NULL
+
+query I
+select count(id) from table_with_metadata;
+----
+2
+
+query I
+select count(id) cnt from table_with_metadata group by name order by cnt;
+----
+0
+1
+1
+
+
statement ok
drop table table_with_metadata;
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]