buraksenn commented on code in PR #20926:
URL: https://github.com/apache/datafusion/pull/20926#discussion_r2962448495


##########
datafusion/physical-plan/src/aggregates/mod.rs:
##########
@@ -4333,6 +4362,157 @@ mod tests {
         Ok(())
     }
 
+    #[test]
+    fn test_aggregate_stats_ndv_zero_column() -> Result<()> {
+        use crate::test::exec::StatisticsExec;
+        use datafusion_common::ColumnStatistics;
+
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Int32, true),
+            Field::new("b", DataType::Int32, true),
+        ]));
+
+        let input_stats = Statistics {
+            num_rows: Precision::Exact(1_000),
+            total_byte_size: Precision::Inexact(1_000),
+            column_statistics: vec![
+                ColumnStatistics {
+                    distinct_count: Precision::Exact(0),
+                    null_count: Precision::Exact(1_000),
+                    ..ColumnStatistics::new_unknown()
+                },
+                ColumnStatistics {
+                    distinct_count: Precision::Exact(50),
+                    ..ColumnStatistics::new_unknown()
+                },
+            ],
+        };
+
+        let input = Arc::new(StatisticsExec::new(input_stats, 
(*schema).clone()))
+            as Arc<dyn ExecutionPlan>;
+
+        let agg = AggregateExec::try_new(
+            AggregateMode::Final,
+            PhysicalGroupBy::new_single(vec![
+                (col("a", &schema)? as Arc<dyn PhysicalExpr>, "a".to_string()),
+                (col("b", &schema)? as Arc<dyn PhysicalExpr>, "b".to_string()),
+            ]),
+            vec![Arc::new(
+                AggregateExprBuilder::new(count_udaf(), vec![col("a", 
&schema)?])
+                    .schema(Arc::clone(&schema))
+                    .alias("COUNT(a)")
+                    .build()?,
+            )],
+            vec![None],
+            input,
+            Arc::clone(&schema),
+        )?;
+
+        let stats = agg.partition_statistics(None)?;
+        // NDV(a)=0 with nulls => max(0+1, 1)=1, NDV(b)=50 => 1*50=50
+        assert_eq!(
+            stats.num_rows,
+            Precision::Inexact(50),
+            "all-null column should contribute 1 to the product, not 0"
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_aggregate_stats_absent_num_rows_with_ndv() -> Result<()> {
+        use crate::test::exec::StatisticsExec;
+        use datafusion_common::ColumnStatistics;
+
+        let schema = Arc::new(Schema::new(vec![Field::new("a", 
DataType::Int32, true)]));
+
+        let input_stats = Statistics {
+            num_rows: Precision::Absent,
+            total_byte_size: Precision::Absent,
+            column_statistics: vec![ColumnStatistics {
+                distinct_count: Precision::Exact(100),
+                ..ColumnStatistics::new_unknown()
+            }],
+        };
+
+        let input = Arc::new(StatisticsExec::new(input_stats, 
(*schema).clone()))
+            as Arc<dyn ExecutionPlan>;
+
+        let agg = AggregateExec::try_new(
+            AggregateMode::Final,
+            PhysicalGroupBy::new_single(vec![(
+                col("a", &schema)? as Arc<dyn PhysicalExpr>,
+                "a".to_string(),
+            )]),
+            vec![Arc::new(
+                AggregateExprBuilder::new(count_udaf(), vec![col("a", 
&schema)?])
+                    .schema(Arc::clone(&schema))
+                    .alias("COUNT(a)")
+                    .build()?,
+            )],
+            vec![None],
+            input,
+            Arc::clone(&schema),
+        )?;
+
+        let stats = agg.partition_statistics(None)?;
+        assert_eq!(
+            stats.num_rows,
+            Precision::Inexact(100),
+            "absent num_rows should fall back to NDV estimate"
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_aggregate_stats_absent_num_rows_with_ndv_and_limit() -> Result<()> 
{
+        use crate::test::exec::StatisticsExec;
+        use datafusion_common::ColumnStatistics;
+
+        let schema = Arc::new(Schema::new(vec![Field::new("a", 
DataType::Int32, true)]));
+
+        let input_stats = Statistics {
+            num_rows: Precision::Absent,
+            total_byte_size: Precision::Absent,
+            column_statistics: vec![ColumnStatistics {
+                distinct_count: Precision::Exact(100),
+                ..ColumnStatistics::new_unknown()
+            }],
+        };
+
+        let input = Arc::new(StatisticsExec::new(input_stats, 
(*schema).clone()))
+            as Arc<dyn ExecutionPlan>;
+
+        let mut agg = AggregateExec::try_new(
+            AggregateMode::Final,
+            PhysicalGroupBy::new_single(vec![(
+                col("a", &schema)? as Arc<dyn PhysicalExpr>,
+                "a".to_string(),
+            )]),
+            vec![Arc::new(
+                AggregateExprBuilder::new(count_udaf(), vec![col("a", 
&schema)?])
+                    .schema(Arc::clone(&schema))
+                    .alias("COUNT(a)")
+                    .build()?,
+            )],
+            vec![None],
+            input,
+            Arc::clone(&schema),
+        )?;

Review Comment:
   Thanks, I've applied these changes. Yes, it consolidated the cases and some
of the duplicate code.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to