[impala] 01/03: IMPALA-7876: COMPUTE STATS TABLESAMPLE is not updating number of estimated rows

tarmstrong Tue, 17 Nov 2020 12:54:24 -0800

This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


commit fa525dfdf72f6f612821a14e683cd7f16d2c423a
Author: Abhishek Rawat <ara...@cloudera.com>
AuthorDate: Wed Nov 11 07:58:11 2020 -0800

    IMPALA-7876: COMPUTE STATS TABLESAMPLE is not updating number of estimated 
rows
    
    'COMPUTE STATS TABLESAMPLE' uses a child query with following function
    'ROUND(COUNT(*) / <effective_sample_perc>)' for computing the row count.
    The 'ROUND()' fn returns the row count as a DECIMAL type. The
    'CatalogOpExecutor' (CatalogOpExecutor::SetTableStats) expects the row
    count as a BIGINT type. Due to this data type mismatch the table stats
    (Extrap #Rows) doesn't get set.
    
    Adding an explicit CAST to BIGINT for the ROUND function results in the
    table stats (Extrap #Rows) getting set properly.
    
    Fixed both 'custom_cluster/test_stats_extrapolation.py' and
    'metadata/test_stats_extrapolation.py' so that they can catch issues
    like this, where table stats are not set when using
    'COMPUTE STATS TABLESAMPLE'.
    
    Testing:
    - Ran core tests.
    
    Change-Id: I88a0a777c2be9cc18b3ff293cf1c06fb499ca052
    Reviewed-on: http://gerrit.cloudera.org:8080/16712
    Reviewed-by: Tim Armstrong <tarmstr...@cloudera.com>
    Tested-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com>
---
 .../org/apache/impala/analysis/ComputeStatsStmt.java    |  8 +++++---
 tests/common/impala_test_suite.py                       |  2 +-
 tests/custom_cluster/test_stats_extrapolation.py        |  9 +++++++++
 tests/metadata/test_stats_extrapolation.py              | 17 ++++++++++++-----
 4 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/fe/src/main/java/org/apache/impala/analysis/ComputeStatsStmt.java 
b/fe/src/main/java/org/apache/impala/analysis/ComputeStatsStmt.java
index 6bb2b17..906a972 100644
--- a/fe/src/main/java/org/apache/impala/analysis/ComputeStatsStmt.java
+++ b/fe/src/main/java/org/apache/impala/analysis/ComputeStatsStmt.java
@@ -322,7 +322,7 @@ public class ComputeStatsStmt extends StatementBase {
    *
    * 2. COMPUTE STATS with TABLESAMPLE
    * 2.1 Row counts:
-   * SELECT ROUND(COUNT(*) / <effective_sample_perc>)
+   * SELECT CAST(ROUND(COUNT(*) / <effective_sample_perc>) AS BIGINT)
    * FROM tbl TABLESAMPLE SYSTEM(<sample_perc>) REPEATABLE (<random_seed>)
    *
    * 2.1 Column stats:
@@ -544,8 +544,10 @@ public class ComputeStatsStmt extends StatementBase {
     StringBuilder tableStatsQueryBuilder = new StringBuilder("SELECT ");
     String countSql = "COUNT(*)";
     if (isSampling()) {
-      // Extrapolate the count based on the effective sampling rate.
-      countSql = String.format("ROUND(COUNT(*) / %.10f)", 
effectiveSamplePerc_);
+      // Extrapolate the count based on the effective sampling rate. Add an 
explicit CAST
+      // to BIGINT, which is the expected data type for row count.
+      countSql = String.format("CAST(ROUND(COUNT(*) / %.10f) AS BIGINT)",
+        effectiveSamplePerc_);
     }
     List<String> tableStatsSelectList = Lists.newArrayList(countSql);
     // Add group by columns for incremental stats or with extrapolation 
disabled.
diff --git a/tests/common/impala_test_suite.py 
b/tests/common/impala_test_suite.py
index 5d1a1fa..e5fcc70 100644
--- a/tests/common/impala_test_suite.py
+++ b/tests/common/impala_test_suite.py
@@ -930,7 +930,7 @@ class ImpalaTestSuite(BaseTestSuite):
     """Returns True if 'a' and 'b' are within 'diff_perc' percent of each 
other,
     False otherwise. 'diff_perc' must be a float in [0,1]."""
     if a == b: return True # Avoid division by 0
-    assert abs(a - b) / float(max(a,b)) <= diff_perc
+    assert abs(a - b) / float(max(abs(a), abs(b))) <= diff_perc
 
   def _get_table_location(self, table_name, vector):
     """ Returns the HDFS location of the table """
diff --git a/tests/custom_cluster/test_stats_extrapolation.py 
b/tests/custom_cluster/test_stats_extrapolation.py
index cd1accd..9b21921 100644
--- a/tests/custom_cluster/test_stats_extrapolation.py
+++ b/tests/custom_cluster/test_stats_extrapolation.py
@@ -48,8 +48,17 @@ class TestStatsExtrapolation(CustomClusterTestSuite):
     # Test COMPUTE STATS TABLESAMPLE
     part_test_tbl = unique_database + ".alltypes"
     self.clone_table("functional.alltypes", part_test_tbl, True, vector)
+    # Since our test tables are small, set the minimum sample size to 0 to 
make sure
+    # we exercise the sampling code paths.
+    self.client.execute("set COMPUTE_STATS_MIN_SAMPLE_SIZE=0")
     self.client.execute(
         "compute stats {0} tablesample system (13)".format(part_test_tbl))
+    # Check that table stats were set.
+    table_stats = self.client.execute("show table stats 
{0}".format(part_test_tbl))
+    col_names = [fs.name.upper() for fs in table_stats.schema.fieldSchemas]
+    extrap_rows_idx = col_names.index("EXTRAP #ROWS")
+    for row in table_stats.data:
+      assert int(row.split("\t")[extrap_rows_idx]) >= 0
     # Check that column stats were set.
     col_stats = self.client.execute("show column stats 
{0}".format(part_test_tbl))
     col_names = [fs.name.upper() for fs in col_stats.schema.fieldSchemas]
diff --git a/tests/metadata/test_stats_extrapolation.py 
b/tests/metadata/test_stats_extrapolation.py
index c6bd8c4..4dc14ff 100644
--- a/tests/metadata/test_stats_extrapolation.py
+++ b/tests/metadata/test_stats_extrapolation.py
@@ -130,7 +130,7 @@ class TestStatsExtrapolation(ImpalaTestSuite):
     self.client.execute("compute stats {0}{1} tablesample system ({2}) 
repeatable ({3})"\
       .format(tbl, cols, perc, seed))
     self.__check_table_stats(tbl, expected_tbl)
-    self.__check_column_stats(tbl, expected_tbl)
+    self.__check_column_stats(cols, tbl, expected_tbl)
 
   def __check_table_stats(self, tbl, expected_tbl):
     """Checks that the row counts reported in SHOW TABLE STATS on 'tbl' are 
within 2x
@@ -147,18 +147,20 @@ class TestStatsExtrapolation(ImpalaTestSuite):
       act_cols = actual.data[i].split("\t")
       exp_cols = expected.data[i].split("\t")
       assert int(exp_cols[rows_col_idx]) >= 0
+      # The expected_tbl is expected to have valid extrapolated #rows for 
every partition.
+      assert int(act_cols[extrap_rows_col_idx]) >= 0
       self.appx_equals(\
-        int(act_cols[extrap_rows_col_idx]), int(exp_cols[rows_col_idx]), 2)
+        int(act_cols[extrap_rows_col_idx]), int(exp_cols[rows_col_idx]), 1.0)
       # Only the table-level row count is stored. The partition row counts
       # are extrapolated.
       if act_cols[0] == "Total":
         self.appx_equals(
-          int(act_cols[rows_col_idx]), int(exp_cols[rows_col_idx]), 2)
+          int(act_cols[rows_col_idx]), int(exp_cols[rows_col_idx]), 1.0)
       elif len(actual.data) > 1:
         # Partition row count is expected to not be set.
         assert int(act_cols[rows_col_idx]) == -1
 
-  def __check_column_stats(self, tbl, expected_tbl):
+  def __check_column_stats(self, cols, tbl, expected_tbl):
     """Checks that the NDVs in SHOW COLUMNS STATS on 'tbl' are within 2x of 
those
     reported for 'expected_tbl'. Assumes that COMPUTE STATS was previously run
     on 'expected_table' and that COMPUTE STATS TABLESAMPLE was run on 'tbl'."""
@@ -172,4 +174,9 @@ class TestStatsExtrapolation(ImpalaTestSuite):
       act_cols = actual.data[i].split("\t")
       exp_cols = expected.data[i].split("\t")
       assert int(exp_cols[ndv_col_idx]) >= 0
-      self.appx_equals(int(act_cols[ndv_col_idx]), int(exp_cols[ndv_col_idx]), 
2)
+      # Only compare the NDVs for columns which were included in the COMPUTE 
STATS.
+      # The other non partitioning columns are expected to have NDV not set, 
since the
+      # caller drops the stats before calling COMPUTE STATS.
+      if cols == "" or act_cols[0] in cols:
+        assert int(act_cols[ndv_col_idx]) >= 0
+        self.appx_equals(int(act_cols[ndv_col_idx]), 
int(exp_cols[ndv_col_idx]), 1.0)

[impala] 01/03: IMPALA-7876: COMPUTE STATS TABLESAMPLE is not updating number of estimated rows

Reply via email to