[ 
https://issues.apache.org/jira/browse/IMPALA-11665?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17620584#comment-17620584
 ] 

Qifan Chen commented on IMPALA-11665:
-------------------------------------

Set up a table with nulls and empty strings in a STRING column. The data was 
loaded in two configurations: 1 page per column and 3 pages per column. 

Ran the query in the DML section below and observed the following when the 
fast code path was taken:
1. Nulls are not part of the page min/max stats or the min/max filter stats at 
all, which is good.
2. The runtime filtering works as designed. 
DDL


{code:java}
create table null_pq (
id string, 
null_str string,
null_int int
) 
sort by (null_str) 
stored as parquet
;
{code}


data loading:


{code:java}
set PARQUET_PAGE_ROW_COUNT_LIMIT=12;
insert into null_pq values
('a', null, 1),
('b', null, 2),
('c',null,3),
('aa', 'a', 1),
('ab', 'b', 2),
('ac','c',3),
('ad', '', 4),
('ae', '', 5),
('ac','',6);


{code}

1 page case (set PARQUET_PAGE_ROW_COUNT_LIMIT=12)



{code:java}
[14:11:06 qchen@qifan-10229: src] pqtools dump 
hdfs://localhost:20500/test-warehouse/null_pq/9341bc3df646c530-9701c2fc00000000_162963959_data.0.parq
22/10/17 14:23:15 INFO compress.CodecPool: Got brand-new decompressor [.snappy]
row group 0 
--------------------------------------------------------------------------------
id:        BINARY SNAPPY DO:4 FPO:56 SZ:85/89/1.05 VC:9 ENC:RLE,PLAIN_DICTIONARY
null_str:  BINARY SNAPPY DO:146 FPO:180 SZ:64/60/0.94 VC:9 ENC:RLE,PLA [more]...
null_int:  INT32 SNAPPY DO:273 FPO:312 SZ:72/68/0.94 VC:9 ENC:RLE,PLAI [more]...

    id TV=9 RL=0 DL=1 DS:       8 DE:PLAIN_DICTIONARY
    ----------------------------------------------------------------------------
    page 0:                      DLE:RLE RLE:RLE VLE:PLAIN_DICTIONARY  
[more]... VC:9

    null_str TV=9 RL=0 DL=1 DS: 4 DE:PLAIN_DICTIONARY
    ----------------------------------------------------------------------------
    page 0:                      DLE:RLE RLE:RLE VLE:PLAIN_DICTIONARY  
[more]... VC:9

    null_int TV=9 RL=0 DL=1 DS: 6 DE:PLAIN_DICTIONARY
    ----------------------------------------------------------------------------
    page 0:                      DLE:RLE RLE:RLE VLE:PLAIN_DICTIONARY  
[more]... VC:9

BINARY id 
--------------------------------------------------------------------------------
*** row group 1 of 1, values 1 to 9 *** 
value 1: R:0 D:1 V:ad
value 2: R:0 D:1 V:ae
value 3: R:0 D:1 V:ac
value 4: R:0 D:1 V:aa
value 5: R:0 D:1 V:ab
value 6: R:0 D:1 V:ac
value 7: R:0 D:1 V:a
value 8: R:0 D:1 V:b
value 9: R:0 D:1 V:c

BINARY null_str 
--------------------------------------------------------------------------------
*** row group 1 of 1, values 1 to 9 *** 
value 1: R:0 D:1 V:
value 2: R:0 D:1 V:
value 3: R:0 D:1 V:
value 4: R:0 D:1 V:a
value 5: R:0 D:1 V:b
value 6: R:0 D:1 V:c
value 7: R:0 D:0 V:<null>
value 8: R:0 D:0 V:<null>
value 9: R:0 D:0 V:<null>

INT32 null_int 
--------------------------------------------------------------------------------
*** row group 1 of 1, values 1 to 9 *** 
value 1: R:0 D:1 V:4
value 2: R:0 D:1 V:5
value 3: R:0 D:1 V:6
value 4: R:0 D:1 V:1
value 5: R:0 D:1 V:2
value 6: R:0 D:1 V:3
value 7: R:0 D:1 V:1
value 8: R:0 D:1 V:2
value 9: R:0 D:1 V:3
[14:23:16 qchen@qifan-10229: src] 
{code}




3 pages case (set PARQUET_PAGE_ROW_COUNT_LIMIT=4)


{code:java}
pqtools dump 
hdfs://localhost:20500/test-warehouse/null_pq/aa449f944bb9d005-7df200e300000000_811956887_data.0.parq

[13:50:22 qchen@qifan-10229: cluster] pqtools dump 
hdfs://localhost:20500/test-warehouse/null_pq/aa449f944bb9d005-7df200e300000000_811956887_data.0.parq
22/10/17 13:51:02 INFO compress.CodecPool: Got brand-new decompressor [.snappy]
row group 0 
--------------------------------------------------------------------------------
id:        BINARY SNAPPY DO:4 FPO:56 SZ:139/139/1.00 VC:9 ENC:RLE,PLAI [more]...
null_str:  BINARY SNAPPY DO:200 FPO:234 SZ:116/108/0.93 VC:9 ENC:RLE,P [more]...
null_int:  INT32 SNAPPY DO:388 FPO:427 SZ:126/118/0.94 VC:9 ENC:RLE,PL [more]...

    id TV=9 RL=0 DL=1 DS:       8 DE:PLAIN_DICTIONARY
    ----------------------------------------------------------------------------
    page 0:                      DLE:RLE RLE:RLE VLE:PLAIN_DICTIONARY  
[more]... VC:4
    page 1:                      DLE:RLE RLE:RLE VLE:PLAIN_DICTIONARY  
[more]... VC:4
    page 2:                      DLE:RLE RLE:RLE VLE:PLAIN_DICTIONARY  
[more]... VC:1

    null_str TV=9 RL=0 DL=1 DS: 4 DE:PLAIN_DICTIONARY
    ----------------------------------------------------------------------------
    page 0:                      DLE:RLE RLE:RLE VLE:PLAIN_DICTIONARY  
[more]... VC:4
    page 1:                      DLE:RLE RLE:RLE VLE:PLAIN_DICTIONARY  
[more]... VC:4
    page 2:                      DLE:RLE RLE:RLE VLE:PLAIN ST:[no stat 
[more]... VC:1

    null_int TV=9 RL=0 DL=1 DS: 6 DE:PLAIN_DICTIONARY
    ----------------------------------------------------------------------------
    page 0:                      DLE:RLE RLE:RLE VLE:PLAIN_DICTIONARY  
[more]... VC:4
    page 1:                      DLE:RLE RLE:RLE VLE:PLAIN_DICTIONARY  
[more]... VC:4
    page 2:                      DLE:RLE RLE:RLE VLE:PLAIN_DICTIONARY  
[more]... VC:1

BINARY id 
--------------------------------------------------------------------------------
*** row group 1 of 1, values 1 to 9 *** 
value 1: R:0 D:1 V:ad
value 2: R:0 D:1 V:ae
value 3: R:0 D:1 V:ac
value 4: R:0 D:1 V:aa
value 5: R:0 D:1 V:ab
value 6: R:0 D:1 V:ac
value 7: R:0 D:1 V:a
value 8: R:0 D:1 V:b
value 9: R:0 D:1 V:c

BINARY null_str 
--------------------------------------------------------------------------------
*** row group 1 of 1, values 1 to 9 *** 
value 1: R:0 D:1 V:
value 2: R:0 D:1 V:
value 3: R:0 D:1 V:
value 4: R:0 D:1 V:a
value 5: R:0 D:1 V:b
value 6: R:0 D:1 V:c
value 7: R:0 D:0 V:<null>
value 8: R:0 D:0 V:<null>
value 9: R:0 D:0 V:<null>

INT32 null_int 
--------------------------------------------------------------------------------
*** row group 1 of 1, values 1 to 9 *** 
value 1: R:0 D:1 V:4
value 2: R:0 D:1 V:5
value 3: R:0 D:1 V:6
value 4: R:0 D:1 V:1
value 5: R:0 D:1 V:2
value 6: R:0 D:1 V:3
value 7: R:0 D:1 V:1
value 8: R:0 D:1 V:2
value 9: R:0 D:1 V:3

{code}




> Min/Max filter could crash in fast code path for string data type
> -----------------------------------------------------------------
>
>                 Key: IMPALA-11665
>                 URL: https://issues.apache.org/jira/browse/IMPALA-11665
>             Project: IMPALA
>          Issue Type: Bug
>            Reporter: Abhishek Rawat
>            Assignee: Qifan Chen
>            Priority: Critical
>
> The impalad logs show that the process crashed with a segfault inside memcmp:
> {code:java}
> #
> # A fatal error has been detected by the Java Runtime Environment:
> #
> #  SIGSEGV (0xb) at pc=0x00007f0396c3ff22, pid=1, tid=0x00007f023f365700
> #
> # JRE version: OpenJDK Runtime Environment (8.0_332-b09) (build 1.8.0_332-b09)
> # Java VM: OpenJDK 64-Bit Server VM (25.332-b09 mixed mode linux-amd64 
> compressed oops)
> # Problematic frame:
> # C  [libc.so.6+0x16af22]  __memcmp_sse4_1+0xd42 {code}
> Resolved Stack Trace for the crashed thread:
> {code:java}
> Thread 530 (crashed)
>  0  libc-2.17.so + 0x16af22
>     rax = 0x00007f61567715f0   rdx = 0x000000000000000a
>     rcx = 0x00007f62ae04cf22   rbx = 0x0000000000000000
>     rsi = 0x000000005d1e900a   rdi = 0x000000000000000a
>     rbp = 0x00007f6156771560   rsp = 0x00007f6156771548
>      r8 = 0x00000000034d40f0    r9 = 0x00007f62ae022e90
>     r10 = 0x000000000498ff6c   r11 = 0x00007f62ae06f590
>     r12 = 0x000000000000000a   r13 = 0x000000001a9678e8
>     r14 = 0x00007f6156771730   r15 = 0x0000000001b1f380
>     rip = 0x00007f62ae04cf22
>     Found by: given as instruction pointer in context
>  1  
> impalad!impala::HdfsParquetScanner::CollectSkippedPageRangesForSortedColumn(impala::MinMaxFilter
>  const*, impala::ColumnType const&, 
> std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, 
> std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, 
> std::char_traits<char>, std::allocator<char> > > > const&, 
> std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, 
> std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, 
> std::char_traits<char>, std::allocator<char> > > > const&, int, int, 
> std::vector<impala::PageRange, std::allocator<impala::PageRange> >*) 
> [hdfs-parquet-scanner.cc : 1388 + 0x3]
>     rbp = 0x00007f6156771650   rsp = 0x00007f6156771570
>     rip = 0x0000000001b10305
>     Found by: previous frame's frame pointer
>  2  impalad!impala::HdfsParquetScanner::SkipPagesBatch(parquet::RowGroup&, 
> impala::ColumnStatsReader const&, parquet::ColumnIndex const&, int, int, 
> impala::ColumnType const&, int, parquet::ColumnChunk const&, 
> impala::MinMaxFilter const*, std::vector<impala::RowRange, 
> std::allocator<impala::RowRange> >*, int*) [hdfs-parquet-scanner.cc : 1230 + 
> 0x34]
>     rbx = 0x00007f61567716f0   rbp = 0x00007f61567717e0
>     rsp = 0x00007f6156771660   r12 = 0x00007f6156771710
>     r13 = 0x00007f6156771950   r14 = 0x000000001a9678e8
>     r15 = 0x00007f6156771920   rip = 0x0000000001b14838
>     Found by: call frame info
>  3  
> impalad!impala::HdfsParquetScanner::FindSkipRangesForPagesWithMinMaxFilters(std::vector<impala::RowRange,
>  std::allocator<impala::RowRange> >*) [hdfs-parquet-scanner.cc : 1528 + 0x57]
>     rbx = 0x000000000000004a   rbp = 0x00007f6156771b10
>     rsp = 0x00007f61567717f0   r12 = 0x000000002c195800
>     r13 = 0x000000002aa115d0   r14 = 0x0000000000000001
>     r15 = 0x0000000000000049   rip = 0x0000000001b1cf1a
>     Found by: call frame info
>  4  impalad!impala::HdfsParquetScanner::EvaluatePageIndex() 
> [hdfs-parquet-scanner.cc : 1600 + 0x19]
>     rbx = 0x00007f6156771c30   rbp = 0x00007f6156771cf0
>     rsp = 0x00007f6156771b20   r12 = 0x000000002c195800
>     r13 = 0x00007f6156771de8   r14 = 0x00000000104528a0
>     r15 = 0x00007f6156771df0   rip = 0x0000000001b1d9dd
>     Found by: call frame info
>  5  impalad!impala::HdfsParquetScanner::ProcessPageIndex() 
> [hdfs-parquet-scanner.cc : 1318 + 0xb]
>     rbx = 0x000000002c195800   rbp = 0x00007f6156771d70
>     rsp = 0x00007f6156771d00   r12 = 0x00007f6156771d10
>     r13 = 0x00007f6156771de8   r14 = 0x00000000104528a0
>     r15 = 0x00007f6156771df0   rip = 0x0000000001b1dd0b
>     Found by: call frame info
>  6  impalad!impala::HdfsParquetScanner::NextRowGroup() 
> [hdfs-parquet-scanner.cc : 934 + 0xf]
>     rbx = 0x00000000318ce040   rbp = 0x00007f6156771e40
>     rsp = 0x00007f6156771d80   r12 = 0x000000002c195800
>     r13 = 0x00007f6156771de8   r14 = 0x00000000104528a0
>     r15 = 0x00007f6156771df0   rip = 0x0000000001b1e1b4
>     Found by: call frame info
>  7  impalad!impala::HdfsParquetScanner::GetNextInternal(impala::RowBatch*) 
> [hdfs-parquet-scanner.cc : 504 + 0xb]
>     rbx = 0x000000002c195800   rbp = 0x00007f6156771ec0
>     rsp = 0x00007f6156771e50   r12 = 0x00000000c1ca4d00
>     r13 = 0x00007f6156771e78   r14 = 0x00007f6156771e80
>     r15 = 0xaaaaaaaaaaaaaaab   rip = 0x0000000001b1ed5b
>     Found by: call frame info
>  8  impalad!impala::HdfsScanNodeMt::GetNext(impala::RuntimeState*, 
> impala::RowBatch*, bool*) [hdfs-scanner.h : 138 + 0x1d]
>     rbx = 0x0000000012272a00   rbp = 0x00007f6156772070
>     rsp = 0x00007f6156771ed0   r12 = 0x000000002c195800
>     r13 = 0x0000000000000000   r14 = 0x00007f6156771f70
>     r15 = 0x00007f6156771fd0   rip = 0x00000000017d6235
>     Found by: call frame info
>  9  impalad!impala::BlockingJoinNode::GetFirstProbeRow(impala::RuntimeState*) 
> [blocking-join-node.cc : 316 + 0x6]
>     rbx = 0x000000000adba000   rbp = 0x00007f61567720c0
>     rsp = 0x00007f6156772080   r12 = 0x00007f6156772088
>     r13 = 0x000000000adba209   r14 = 0x00000000496b9680
>     r15 = 0x00007f61567720e0   rip = 0x00000000018c2069
>     Found by: call frame info
> 10  impalad!impala::PartitionedHashJoinNode::Open(impala::RuntimeState*) 
> [partitioned-hash-join-node.cc : 215 + 0xe]
>     rbx = 0x000000000adba000   rbp = 0x00007f6156772170
>     rsp = 0x00007f61567720d0   r12 = 0x00007f61567720e0
>     r13 = 0x00000000496b9680   r14 = 0x00007f6156772290
>     r15 = 0x0000000042c22030   rip = 0x000000000186c68d
>     Found by: call frame info
> 11  
> impalad!impala::BlockingJoinNode::ProcessBuildInputAndOpenProbe(impala::RuntimeState*,
>  impala::JoinBuilder*) [blocking-join-node.cc : 242 + 0x6]
>     rbx = 0x000000000adbb400   rbp = 0x00007f6156772300
>     rsp = 0x00007f6156772180   r12 = 0x00007f6156772290
>     r13 = 0x00007f6156772320   r14 = 0x00000000496b9680
>     r15 = 0x0000000010f1cf00   rip = 0x00000000018c33b7
>     Found by: call frame info
> 12  impalad!impala::PartitionedHashJoinNode::Open(impala::RuntimeState*) 
> [partitioned-hash-join-node.cc : 209 + 0x15]
>     rbx = 0x000000000adbb400   rbp = 0x00007f61567723b0
>     rsp = 0x00007f6156772310   r12 = 0x00007f6156772320
>     r13 = 0x00000000496b9680   r14 = 0x00007f6156772440
>     r15 = 0x0000000042c47660   rip = 0x000000000186c62d
>     Found by: call frame info
> 13  impalad!impala::SortNode::Open(impala::RuntimeState*) [sort-node.cc : 123 
> + 0x6]
>     rbx = 0x00000000496b9b00   rbp = 0x00007f61567724e0
>     rsp = 0x00007f61567723c0   r12 = 0x00007f6156772440
>     r13 = 0x0000000042c46e90   r14 = 0x00007f6156772420
>     r15 = 0x00000000496b9680   rip = 0x0000000001892002
>     Found by: call frame info
> 14  impalad!impala::FragmentInstanceState::Open() [fragment-instance-state.cc 
> : 426 + 0x11]
>     rbx = 0x000000000b0863c0   rbp = 0x00007f61567726a0
>     rsp = 0x00007f61567724f0   r12 = 0x00007f61567725b0
>     r13 = 0x0000000010f1d2c0   r14 = 0x00007f6156772510
>     r15 = 0x000000001196be00   rip = 0x000000000129bbe3
>     Found by: call frame info
> 15  impalad!impala::FragmentInstanceState::Exec() [fragment-instance-state.cc 
> : 95 + 0xf]
>     rbx = 0x000000000b0863c0   rbp = 0x00007f6156772760
>     rsp = 0x00007f61567726b0   r12 = 0x00007f61567727a8
>     r13 = 0x000000001aabd330   r14 = 0x00007f61567726f0
>     r15 = 0x000000001196be00   rip = 0x000000000129dabd
>     Found by: call frame info
> 16  impalad!impala::QueryState::ExecFInstance(impala::FragmentInstanceState*) 
> [query-state.cc : 955 + 0x19]
>     rbx = 0x00007f61567727d0   rbp = 0x00007f6156772830
>     rsp = 0x00007f6156772770   r12 = 0x000000000b0863c0
>     r13 = 0x00007f61567727b0   r14 = 0x0000000004950770
>     r15 = 0x000000001196be00   rip = 0x0000000001223f01
>     Found by: call frame info
> 17  impalad!impala::Thread::SuperviseThread(std::__cxx11::basic_string<char, 
> std::char_traits<char>, std::allocator<char> > const&, 
> std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> 
> > const&, boost::function<void ()>, impala::ThreadDebugInfo const*, 
> impala::Promise<long, (impala::PromiseMode)0>*) [function_template.hpp : 763 
> + 0x7]
>     rbx = 0x00007f61567728c0   rbp = 0x00007f6156772b30
>     rsp = 0x00007f6156772840   r12 = 0x00007f61567728a0
>     r13 = 0x0000000006dfe300   r14 = 0x00007f62b159c7d0
>     r15 = 0x00007f615a377dd8   rip = 0x000000000171aeb2
>     Found by: call frame info
> 18  impalad!boost::detail::thread_data<boost::_bi::bind_t<void, void 
> (*)(std::__cxx11::basic_string<char, std::char_traits<char>, 
> std::allocator<char> > const&, std::__cxx11::basic_string<char, 
> std::char_traits<char>, std::allocator<char> > const&, boost::function<void 
> ()>, impala::ThreadDebugInfo const*, impala::Promise<long, 
> (impala::PromiseMode)0>*), 
> boost::_bi::list5<boost::_bi::value<std::__cxx11::basic_string<char, 
> std::char_traits<char>, std::allocator<char> > >, 
> boost::_bi::value<std::__cxx11::basic_string<char, std::char_traits<char>, 
> std::allocator<char> > >, boost::_bi::value<boost::function<void ()> >, 
> boost::_bi::value<impala::ThreadDebugInfo*>, 
> boost::_bi::value<impala::Promise<long, (impala::PromiseMode)0>*> > > 
> >::run() [bind.hpp : 531 + 0xc]
>     rbx = 0x0000000063492300   rbp = 0x00007f6156772b90
>     rsp = 0x00007f6156772b40   r12 = 0x00007f6156772b40
>     r13 = 0x000000000171abb0   r14 = 0x00007f615a3788c0
>     r15 = 0x00007f615a377da0   rip = 0x000000000171c3ab
>     Found by: call frame info
> 19  impalad!thread_proxy + 0xa1
>     rbx = 0x0000000000000000   rbp = 0x0000000063492300
>     rsp = 0x00007f6156772ba0   r12 = 0x0000000000000000
>     r13 = 0x0000000018a976c0   r14 = 0x0000000000000000
>     r15 = 0x00007f6156773700   rip = 0x0000000001fac9d1
>     Found by: call frame info
> 20  libpthread-2.17.so + 0x7ea5
>     rbx = 0x0000000000000000   rbp = 0x0000000000000000
>     rsp = 0x00007f6156772be0   r12 = 0x0000000000000000
>     r13 = 0x0000000000a01000   r14 = 0x0000000000000000
>     r15 = 0x00007f6156773700   rip = 0x00007f62b1597ea5
>     Found by: call frame info
> 21  libc-2.17.so + 0xfeb0d
>     rsp = 0x00007f6156772c80   rip = 0x00007f62adfe0b0d
>     Found by: stack scanning
>  {code}



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-all-unsubscr...@impala.apache.org
For additional commands, e-mail: issues-all-h...@impala.apache.org

Reply via email to