This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


The following commit(s) were added to refs/heads/master by this push:
     new 96decf5  IMPALA-10345: Impala hits DCHECK in parquet-column-stats.inline.h
96decf5 is described below

commit 96decf535ba9ae7a6b295012d2be85c3a1ae892c
Author: Zoltan Borok-Nagy <borokna...@cloudera.com>
AuthorDate: Mon Nov 23 13:30:50 2020 +0100

    IMPALA-10345: Impala hits DCHECK in parquet-column-stats.inline.h
    
    During Parquet file writing, a DCHECK verifies that the row group stats
    have copied the min/max string values into their internal buffers. This
    check runs when each page is finalized, but the string values were only
    copied at the end of each row batch.
    
    Thus, if a row batch spans multiple pages, the min/max string values
    have not yet been copied when a page is finalized. Since the memory is
    attached to the row batch, this isn't really an error.
    
    As a workaround this commit also copies the min/max string values
    at the end of the page if they haven't been copied yet.
    
    Testing
     * Added e2e test
    
    Change-Id: I4289bd743e951cc4c607d5a5ea75d27825a1c12b
    Reviewed-on: http://gerrit.cloudera.org:8080/16771
    Reviewed-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com>
    Tested-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com>
---
 be/src/exec/parquet/hdfs-parquet-table-writer.cc    |  1 +
 .../queries/QueryTest/parquet-page-index.test       | 21 +++++++++++++++++++++
 tests/query_test/test_parquet_stats.py              |  2 +-
 3 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/be/src/exec/parquet/hdfs-parquet-table-writer.cc b/be/src/exec/parquet/hdfs-parquet-table-writer.cc
index d33578d..5203c3f 100644
--- a/be/src/exec/parquet/hdfs-parquet-table-writer.cc
+++ b/be/src/exec/parquet/hdfs-parquet-table-writer.cc
@@ -936,6 +936,7 @@ Status HdfsParquetTableWriter::BaseColumnWriter::FinalizeCurrentPage() {
 
   // Update row group statistics from page statistics.
   DCHECK(row_group_stats_base_ != nullptr);
+  RETURN_IF_ERROR(row_group_stats_base_->MaterializeStringValuesToInternalBuffers());
   row_group_stats_base_->Merge(*page_stats_base_);
 
   // Add the size of the data page header
diff --git a/testdata/workloads/functional-query/queries/QueryTest/parquet-page-index.test b/testdata/workloads/functional-query/queries/QueryTest/parquet-page-index.test
index d3cbd9f..f37a064 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/parquet-page-index.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/parquet-page-index.test
@@ -316,3 +316,24 @@ STRING, BIGINT
 aggregation(SUM, NumPages): 30
 aggregation(SUM, NumStatsFilteredPages): 27
 ====
+---- QUERY
+# IMPALA-10345: Impala hits DCHECK in parquet-column-stats.inline.h
+# Impala could hit a DCHECK when the row batch spanned over multiple pages and
+# they had the same min or max string values.
+set parquet_page_row_count_limit=5;
+create table lineitem_comment (s string) stored as parquet;
+insert into lineitem_comment select l_comment from tpch_parquet.lineitem
+                             order by l_comment
+                             limit 100;
+insert into lineitem_comment select * from lineitem_comment;
+select count(*) from lineitem_comment;
+---- RESULTS
+200
+---- TYPES
+BIGINT
+====
+---- QUERY
+drop table lineitem_comment;
+---- RESULTS
+'Table has been dropped.'
+====
\ No newline at end of file
diff --git a/tests/query_test/test_parquet_stats.py b/tests/query_test/test_parquet_stats.py
index 319dd1a..28a2288 100644
--- a/tests/query_test/test_parquet_stats.py
+++ b/tests/query_test/test_parquet_stats.py
@@ -91,7 +91,7 @@ class TestParquetStats(ImpalaTestSuite):
     create_table_from_parquet(self.client, unique_database,
                               'customer_multiblock_page_index')
 
-    for batch_size in [1]:
+    for batch_size in [0, 1]:
       new_vector.get_value('exec_option')['batch_size'] = batch_size
       self.run_test_case('QueryTest/parquet-page-index', new_vector, unique_database)
       self.run_test_case('QueryTest/nested-types-parquet-page-index', new_vector,

Reply via email to