This is an automated email from the ASF dual-hosted git repository.
yjhjstz pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/cloudberry.git
The following commit(s) were added to refs/heads/main by this push:
new 3ead998707d Fix: Prevent excessive sampling on QEs by restricting ComputeExtStatisticsRows to QD
3ead998707d is described below
commit 3ead998707df0a3ef5e44f15693d789ddaed44e1
Author: Jianghua Yang <[email protected]>
AuthorDate: Thu Aug 7 00:01:26 2025 +0000
Fix: Prevent excessive sampling on QEs by restricting ComputeExtStatisticsRows to QD
In `do_analyze_rel`, the function `ComputeExtStatisticsRows` calculates the
minimum number of sample rows needed for extended statistics (e.g.,
dependencies, ndistinct).
This calculation is only meaningful and required on the Query Dispatcher (QD),
since only the QD is responsible for coordinating the final extended
statistics generation.
Previously, all segments (including Query Executors, QEs) executed this logic,
resulting in excessive sampling. For large tables, this caused the QD to
receive more rows than it could handle, leading to the error:
    ERROR: too many sample rows received from gp_acquire_sample_rows
---
src/backend/commands/analyze.c | 6 ++++--
src/test/regress/expected/stats_ext.out | 13 +++++++++++++
src/test/regress/expected/stats_ext_optimizer.out | 13 +++++++++++++
src/test/regress/sql/stats_ext.sql | 12 ++++++++++++
4 files changed, 42 insertions(+), 2 deletions(-)
diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index e06dbea2870..32c74f6cd14 100644
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -717,8 +717,10 @@ do_analyze_rel(Relation onerel, VacuumParams *params,
* statistics target. So we may need to sample more rows and then build
* the statistics with enough detail.
*/
- minrows = ComputeExtStatisticsRows(onerel, attr_cnt, vacattrstats);
-
+ if (IS_QD_OR_SINGLENODE())
+ minrows = ComputeExtStatisticsRows(onerel, attr_cnt, vacattrstats);
+ else
+ minrows = 0;
if (targrows < minrows)
targrows = minrows;
diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out
index 9b8580f59c2..3fc90553026 100644
--- a/src/test/regress/expected/stats_ext.out
+++ b/src/test/regress/expected/stats_ext.out
@@ -3240,3 +3240,16 @@ NOTICE: drop cascades to 2 other objects
DETAIL: drop cascades to table tststats.priv_test_tbl
drop cascades to view tststats.priv_test_view
DROP USER regress_stats_user1;
+-- test analyze with extended statistics
+CREATE TABLE tbl_issue1293 (col1 int, col2 int);
+NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'col1' as the Apache Cloudberry data distribution key for this table.
+HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+INSERT INTO tbl_issue1293
+SELECT i / 10000, i / 100000
+FROM generate_series(1, 1000000) s(i);
+ANALYZE tbl_issue1293;
+-- Create extended statistics on col1, col2
+CREATE STATISTICS s1 (dependencies) ON col1, col2 FROM tbl_issue1293;
+-- Trigger extended stats collection
+ANALYZE tbl_issue1293;
+DROP TABLE tbl_issue1293;
diff --git a/src/test/regress/expected/stats_ext_optimizer.out b/src/test/regress/expected/stats_ext_optimizer.out
index f62d399b4ca..d19caa775d1 100644
--- a/src/test/regress/expected/stats_ext_optimizer.out
+++ b/src/test/regress/expected/stats_ext_optimizer.out
@@ -3275,3 +3275,16 @@ NOTICE: drop cascades to 2 other objects
DETAIL: drop cascades to table tststats.priv_test_tbl
drop cascades to view tststats.priv_test_view
DROP USER regress_stats_user1;
+-- test analyze with extended statistics
+CREATE TABLE tbl_issue1293 (col1 int, col2 int);
+NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'col1' as the Apache Cloudberry data distribution key for this table.
+HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+INSERT INTO tbl_issue1293
+SELECT i / 10000, i / 100000
+FROM generate_series(1, 1000000) s(i);
+ANALYZE tbl_issue1293;
+-- Create extended statistics on col1, col2
+CREATE STATISTICS s1 (dependencies) ON col1, col2 FROM tbl_issue1293;
+-- Trigger extended stats collection
+ANALYZE tbl_issue1293;
+DROP TABLE tbl_issue1293;
diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql
index f005f2c2957..91edd3a5bba 100644
--- a/src/test/regress/sql/stats_ext.sql
+++ b/src/test/regress/sql/stats_ext.sql
@@ -1651,3 +1651,15 @@ DROP FUNCTION op_leak(int, int);
RESET SESSION AUTHORIZATION;
DROP SCHEMA tststats CASCADE;
DROP USER regress_stats_user1;
+
+-- test analyze with extended statistics
+CREATE TABLE tbl_issue1293 (col1 int, col2 int);
+INSERT INTO tbl_issue1293
+SELECT i / 10000, i / 100000
+FROM generate_series(1, 1000000) s(i);
+ANALYZE tbl_issue1293;
+-- Create extended statistics on col1, col2
+CREATE STATISTICS s1 (dependencies) ON col1, col2 FROM tbl_issue1293;
+-- Trigger extended stats collection
+ANALYZE tbl_issue1293;
+DROP TABLE tbl_issue1293;
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]