This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit fa525dfdf72f6f612821a14e683cd7f16d2c423a Author: Abhishek Rawat <ara...@cloudera.com> AuthorDate: Wed Nov 11 07:58:11 2020 -0800 IMPALA-7876: COMPUTE STATS TABLESAMPLE is not updating number of estimated rows 'COMPUTE STATS TABLESAMPLE' uses a child query with following function 'ROUND(COUNT(*) / <effective_sample_perc>)' for computing the row count. The 'ROUND()' fn returns the row count as a DECIMAL type. The 'CatalogOpExecutor' (CatalogOpExecutor::SetTableStats) expects the row count as a BIGINT type. Due to this data type mismatch the table stats (Extrap #Rows) doesn't get set. Adding an explicit CAST to BIGINT for the ROUND function results in the table stats (Extrap #Rows) getting set properly. Fixed both 'custom_cluster/test_stats_extrapolation.py' and 'metadata/test_stats_extrapolation.py' so that they can catch issues like this, where table stats are not set when using 'COMPUTE STATS TABLESAMPLE'. Testing: - Ran core tests. Change-Id: I88a0a777c2be9cc18b3ff293cf1c06fb499ca052 Reviewed-on: http://gerrit.cloudera.org:8080/16712 Reviewed-by: Tim Armstrong <tarmstr...@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com> --- .../org/apache/impala/analysis/ComputeStatsStmt.java | 8 +++++--- tests/common/impala_test_suite.py | 2 +- tests/custom_cluster/test_stats_extrapolation.py | 9 +++++++++ tests/metadata/test_stats_extrapolation.py | 17 ++++++++++++----- 4 files changed, 27 insertions(+), 9 deletions(-) diff --git a/fe/src/main/java/org/apache/impala/analysis/ComputeStatsStmt.java b/fe/src/main/java/org/apache/impala/analysis/ComputeStatsStmt.java index 6bb2b17..906a972 100644 --- a/fe/src/main/java/org/apache/impala/analysis/ComputeStatsStmt.java +++ b/fe/src/main/java/org/apache/impala/analysis/ComputeStatsStmt.java @@ -322,7 +322,7 @@ public class ComputeStatsStmt extends StatementBase { * * 2. COMPUTE STATS with TABLESAMPLE * 2.1 Row counts: - * SELECT ROUND(COUNT(*) / <effective_sample_perc>) + * SELECT CAST(ROUND(COUNT(*) / <effective_sample_perc>) AS BIGINT) * FROM tbl TABLESAMPLE SYSTEM(<sample_perc>) REPEATABLE (<random_seed>) * * 2.1 Column stats: @@ -544,8 +544,10 @@ public class ComputeStatsStmt extends StatementBase { StringBuilder tableStatsQueryBuilder = new StringBuilder("SELECT "); String countSql = "COUNT(*)"; if (isSampling()) { - // Extrapolate the count based on the effective sampling rate. - countSql = String.format("ROUND(COUNT(*) / %.10f)", effectiveSamplePerc_); + // Extrapolate the count based on the effective sampling rate. Add an explicit CAST + // to BIGINT, which is the expected data type for row count. + countSql = String.format("CAST(ROUND(COUNT(*) / %.10f) AS BIGINT)", + effectiveSamplePerc_); } List<String> tableStatsSelectList = Lists.newArrayList(countSql); // Add group by columns for incremental stats or with extrapolation disabled. diff --git a/tests/common/impala_test_suite.py b/tests/common/impala_test_suite.py index 5d1a1fa..e5fcc70 100644 --- a/tests/common/impala_test_suite.py +++ b/tests/common/impala_test_suite.py @@ -930,7 +930,7 @@ class ImpalaTestSuite(BaseTestSuite): """Returns True if 'a' and 'b' are within 'diff_perc' percent of each other, False otherwise. 'diff_perc' must be a float in [0,1].""" if a == b: return True # Avoid division by 0 - assert abs(a - b) / float(max(a,b)) <= diff_perc + assert abs(a - b) / float(max(abs(a), abs(b))) <= diff_perc def _get_table_location(self, table_name, vector): """ Returns the HDFS location of the table """ diff --git a/tests/custom_cluster/test_stats_extrapolation.py b/tests/custom_cluster/test_stats_extrapolation.py index cd1accd..9b21921 100644 --- a/tests/custom_cluster/test_stats_extrapolation.py +++ b/tests/custom_cluster/test_stats_extrapolation.py @@ -48,8 +48,17 @@ class TestStatsExtrapolation(CustomClusterTestSuite): # Test COMPUTE STATS TABLESAMPLE part_test_tbl = unique_database + ".alltypes" self.clone_table("functional.alltypes", part_test_tbl, True, vector) + # Since our test tables are small, set the minimum sample size to 0 to make sure + # we exercise the sampling code paths. + self.client.execute("set COMPUTE_STATS_MIN_SAMPLE_SIZE=0") self.client.execute( "compute stats {0} tablesample system (13)".format(part_test_tbl)) + # Check that table stats were set. + table_stats = self.client.execute("show table stats {0}".format(part_test_tbl)) + col_names = [fs.name.upper() for fs in table_stats.schema.fieldSchemas] + extrap_rows_idx = col_names.index("EXTRAP #ROWS") + for row in table_stats.data: + assert int(row.split("\t")[extrap_rows_idx]) >= 0 # Check that column stats were set. col_stats = self.client.execute("show column stats {0}".format(part_test_tbl)) col_names = [fs.name.upper() for fs in col_stats.schema.fieldSchemas] diff --git a/tests/metadata/test_stats_extrapolation.py b/tests/metadata/test_stats_extrapolation.py index c6bd8c4..4dc14ff 100644 --- a/tests/metadata/test_stats_extrapolation.py +++ b/tests/metadata/test_stats_extrapolation.py @@ -130,7 +130,7 @@ class TestStatsExtrapolation(ImpalaTestSuite): self.client.execute("compute stats {0}{1} tablesample system ({2}) repeatable ({3})"\ .format(tbl, cols, perc, seed)) self.__check_table_stats(tbl, expected_tbl) - self.__check_column_stats(tbl, expected_tbl) + self.__check_column_stats(cols, tbl, expected_tbl) def __check_table_stats(self, tbl, expected_tbl): """Checks that the row counts reported in SHOW TABLE STATS on 'tbl' are within 2x @@ -147,18 +147,20 @@ class TestStatsExtrapolation(ImpalaTestSuite): act_cols = actual.data[i].split("\t") exp_cols = expected.data[i].split("\t") assert int(exp_cols[rows_col_idx]) >= 0 + # The expected_tbl is expected to have valid extrapolated #rows for every partition. + assert int(act_cols[extrap_rows_col_idx]) >= 0 self.appx_equals(\ - int(act_cols[extrap_rows_col_idx]), int(exp_cols[rows_col_idx]), 2) + int(act_cols[extrap_rows_col_idx]), int(exp_cols[rows_col_idx]), 1.0) # Only the table-level row count is stored. The partition row counts # are extrapolated. if act_cols[0] == "Total": self.appx_equals( - int(act_cols[rows_col_idx]), int(exp_cols[rows_col_idx]), 2) + int(act_cols[rows_col_idx]), int(exp_cols[rows_col_idx]), 1.0) elif len(actual.data) > 1: # Partition row count is expected to not be set. assert int(act_cols[rows_col_idx]) == -1 - def __check_column_stats(self, tbl, expected_tbl): + def __check_column_stats(self, cols, tbl, expected_tbl): """Checks that the NDVs in SHOW COLUMNS STATS on 'tbl' are within 2x of those reported for 'expected_tbl'. Assumes that COMPUTE STATS was previously run on 'expected_table' and that COMPUTE STATS TABLESAMPLE was run on 'tbl'.""" @@ -172,4 +174,9 @@ class TestStatsExtrapolation(ImpalaTestSuite): act_cols = actual.data[i].split("\t") exp_cols = expected.data[i].split("\t") assert int(exp_cols[ndv_col_idx]) >= 0 - self.appx_equals(int(act_cols[ndv_col_idx]), int(exp_cols[ndv_col_idx]), 2) + # Only compare the NDVs for columns which were included in the COMPUTE STATS. + # The other non partitioning columns are expected to have NDV not set, since the + # caller drops the stats before calling COMPUTE STATS. + if cols == "" or act_cols[0] in cols: + assert int(act_cols[ndv_col_idx]) >= 0 + self.appx_equals(int(act_cols[ndv_col_idx]), int(exp_cols[ndv_col_idx]), 1.0)