[
https://issues.apache.org/jira/browse/DRILL-5669?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16084877#comment-16084877
]
Dechang Gu commented on DRILL-5669:
-----------------------------------
Here is the plan for the failed query:
{code}
00-00 Screen : rowType = RecordType(ANY p_brand, ANY p_type, ANY p_size,
BIGINT supplier_cnt): rowcount = 2000.0, cumulative cost = {4.16474202E8 rows,
3.783967028483349E9 cpu, 0.0 io, 1.9509256192E11 network, 9.903480000000001E8
memory}, id = 41734
00-01 Project(p_brand=[$0], p_type=[$1], p_size=[$2], supplier_cnt=[$3]) :
rowType = RecordType(ANY p_brand, ANY p_type, ANY p_size, BIGINT supplier_cnt):
rowcount = 2000.0, cumulative cost = {4.16474002E8 rows, 3.783966828483349E9
cpu, 0.0 io, 1.9509256192E11 network, 9.903480000000001E8 memory}, id = 41733
00-02 SelectionVectorRemover : rowType = RecordType(ANY p_brand, ANY
p_type, ANY p_size, BIGINT supplier_cnt): rowcount = 2000.0, cumulative cost =
{4.16474002E8 rows, 3.783966828483349E9 cpu, 0.0 io, 1.9509256192E11 network,
9.903480000000001E8 memory}, id = 41732
00-03 Sort(sort0=[$3], sort1=[$0], sort2=[$1], sort3=[$2],
dir0=[DESC], dir1=[ASC], dir2=[ASC], dir3=[ASC]) : rowType = RecordType(ANY
p_brand, ANY p_type, ANY p_size, BIGINT supplier_cnt): rowcount = 2000.0,
cumulative cost = {4.16472002E8 rows, 3.783964828483349E9 cpu, 0.0 io,
1.9509256192E11 network, 9.903480000000001E8 memory}, id = 41731
00-04 StreamAgg(group=[{0, 1, 2}], supplier_cnt=[$SUM0($3)]) :
rowType = RecordType(ANY p_brand, ANY p_type, ANY p_size, BIGINT supplier_cnt):
rowcount = 2000.0, cumulative cost = {4.16470002E8 rows, 3.7836139233862395E9
cpu, 0.0 io, 1.9509256192E11 network, 9.902840000000001E8 memory}, id = 41730
00-05 HashToMergeExchange : rowType = RecordType(ANY p_brand, ANY
p_type, ANY p_size, BIGINT supplier_cnt): rowcount = 20000.0, cumulative cost =
{4.16450002E8 rows, 3.7831339233862395E9 cpu, 0.0 io, 1.9509256192E11 network,
9.902840000000001E8 memory}, id = 41729
01-01 StreamAgg(group=[{0, 1, 2}], supplier_cnt=[COUNT($3)]) :
rowType = RecordType(ANY p_brand, ANY p_type, ANY p_size, BIGINT supplier_cnt):
rowcount = 20000.0, cumulative cost = {4.16430002E8 rows, 3.7822281691386485E9
cpu, 0.0 io, 1.9476488192E11 network, 9.902840000000001E8 memory}, id = 41728
01-02 Sort(sort0=[$0], sort1=[$1], sort2=[$2], dir0=[ASC],
dir1=[ASC], dir2=[ASC]) : rowType = RecordType(ANY p_brand, ANY p_type, ANY
p_size, ANY ps_suppkey): rowcount = 200000.0, cumulative cost = {4.16230002E8
rows, 3.7774281691386485E9 cpu, 0.0 io, 1.9476488192E11 network,
9.902840000000001E8 memory}, id = 41727
01-03 HashAgg(group=[{0, 1, 2, 3}]) : rowType =
RecordType(ANY p_brand, ANY p_type, ANY p_size, ANY ps_suppkey): rowcount =
200000.0, cumulative cost = {4.16030002E8 rows, 3.735165032E9 cpu, 0.0 io,
1.9476488192E11 network, 9.838840000000001E8 memory}, id = 41726
01-04 Project(p_brand=[$0], p_type=[$1], p_size=[$2],
ps_suppkey=[$3]) : rowType = RecordType(ANY p_brand, ANY p_type, ANY p_size,
ANY ps_suppkey): rowcount = 2000000.0, cumulative cost = {4.14030002E8 rows,
3.671165032E9 cpu, 0.0 io, 1.9476488192E11 network, 8.958840000000001E8
memory}, id = 41725
01-05 HashToRandomExchange(dist0=[[$0]], dist1=[[$1]],
dist2=[[$2]], dist3=[[$3]]) : rowType = RecordType(ANY p_brand, ANY p_type, ANY
p_size, ANY ps_suppkey, ANY E_X_P_R_H_A_S_H_F_I_E_L_D): rowcount = 2000000.0,
cumulative cost = {4.14030002E8 rows, 3.671165032E9 cpu, 0.0 io,
1.9476488192E11 network, 8.958840000000001E8 memory}, id = 41724
02-01 UnorderedMuxExchange : rowType = RecordType(ANY
p_brand, ANY p_type, ANY p_size, ANY ps_suppkey, ANY
E_X_P_R_H_A_S_H_F_I_E_L_D): rowcount = 2000000.0, cumulative cost =
{4.12030002E8 rows, 3.651165032E9 cpu, 0.0 io, 1.5380488192E11 network,
8.958840000000001E8 memory}, id = 41723
03-01 Project(p_brand=[$0], p_type=[$1],
p_size=[$2], ps_suppkey=[$3], E_X_P_R_H_A_S_H_F_I_E_L_D=[hash32AsDouble($3,
hash32AsDouble($2, hash32AsDouble($1, hash32AsDouble($0, 1301011))))]) :
rowType = RecordType(ANY p_brand, ANY p_type, ANY p_size, ANY ps_suppkey, ANY
E_X_P_R_H_A_S_H_F_I_E_L_D): rowcount = 2000000.0, cumulative cost =
{4.10030002E8 rows, 3.649165032E9 cpu, 0.0 io, 1.5380488192E11 network,
8.958840000000001E8 memory}, id = 41722
03-02 HashAgg(group=[{0, 1, 2, 3}]) : rowType =
RecordType(ANY p_brand, ANY p_type, ANY p_size, ANY ps_suppkey): rowcount =
2000000.0, cumulative cost = {4.08030002E8 rows, 3.641165032E9 cpu, 0.0 io,
1.5380488192E11 network, 8.958840000000001E8 memory}, id = 41721
03-03 Project(p_brand=[$2], p_type=[$3],
p_size=[$4], ps_suppkey=[$1]) : rowType = RecordType(ANY p_brand, ANY p_type,
ANY p_size, ANY ps_suppkey): rowcount = 2.0E7, cumulative cost = {3.88030002E8
rows, 3.001165032E9 cpu, 0.0 io, 1.5380488192E11 network, 1.5884E7 memory}, id
= 41720
03-04 SelectionVectorRemover : rowType =
RecordType(ANY ps_partkey, ANY ps_suppkey, ANY p_brand, ANY p_type, ANY p_size,
ANY p_partkey, BIGINT $f0, BIGINT $f1, ANY f9, ANY $f00, BOOLEAN $f10):
rowcount = 2.0E7, cumulative cost = {3.88030002E8 rows, 3.001165032E9 cpu, 0.0
io, 1.5380488192E11 network, 1.5884E7 memory}, id = 41719
03-05 Filter(condition=[CASE(=($6, 0),
CAST(true):BOOLEAN, IS NOT NULL($10), CAST(false):BOOLEAN, IS NULL($8), null,
<($7, $6), null, CAST(true):BOOLEAN)]) : rowType = RecordType(ANY ps_partkey,
ANY ps_suppkey, ANY p_brand, ANY p_type, ANY p_size, ANY p_partkey, BIGINT $f0,
BIGINT $f1, ANY f9, ANY $f00, BOOLEAN $f10): rowcount = 2.0E7, cumulative cost
= {3.68030002E8 rows, 2.981165032E9 cpu, 0.0 io, 1.5380488192E11 network,
1.5884E7 memory}, id = 41718
03-06 HashJoin(condition=[=($8, $9)],
joinType=[left]) : rowType = RecordType(ANY ps_partkey, ANY ps_suppkey, ANY
p_brand, ANY p_type, ANY p_size, ANY p_partkey, BIGINT $f0, BIGINT $f1, ANY f9,
ANY $f00, BOOLEAN $f10): rowcount = 8.0E7, cumulative cost = {2.88030002E8
rows, 2.581165032E9 cpu, 0.0 io, 1.5380488192E11 network, 1.5884E7 memory}, id
= 41717
03-08 Project(ps_partkey=[$0],
ps_suppkey=[$1], p_brand=[$2], p_type=[$3], p_size=[$4], p_partkey=[$5],
$f0=[$6], $f1=[$7], f9=[$1]) : rowType = RecordType(ANY ps_partkey, ANY
ps_suppkey, ANY p_brand, ANY p_type, ANY p_size, ANY p_partkey, BIGINT $f0,
BIGINT $f1, ANY f9): rowcount = 8.0E7, cumulative cost = {2.05250002E8 rows,
1.607375032E9 cpu, 0.0 io, 1.5360008192E11 network, 1.1E7 memory}, id = 41708
03-10 HashJoin(condition=[=($0, $5)],
joinType=[inner]) : rowType = RecordType(ANY ps_partkey, ANY ps_suppkey, ANY
p_brand, ANY p_type, ANY p_size, ANY p_partkey, BIGINT $f0, BIGINT $f1):
rowcount = 8.0E7, cumulative cost = {2.05250002E8 rows, 1.607375032E9 cpu, 0.0
io, 1.5360008192E11 network, 1.1E7 memory}, id = 41707
03-12
Scan(groupscan=[ParquetGroupScan [entries=[ReadEntryWithPath
[path=maprfs:///tpchParquet/SF100/partsupp]],
selectionRoot=maprfs:/tpchParquet/SF100/partsupp, numFiles=1,
usedMetadataFile=false, columns=[`ps_partkey`, `ps_suppkey`]]]) : rowType =
RecordType(ANY ps_partkey, ANY ps_suppkey): rowcount = 8.0E7, cumulative cost =
{8.0E7 rows, 1.6E8 cpu, 0.0 io, 0.0 network, 0.0 memory}, id = 41694
03-11 BroadcastExchange : rowType =
RecordType(ANY p_brand, ANY p_type, ANY p_size, ANY p_partkey, BIGINT $f0,
BIGINT $f1): rowcount = 625000.0, cumulative cost = {4.4625002E7 rows,
4.82375032E8 cpu, 0.0 io, 1.5360008192E11 network, 0.0 memory}, id = 41706
04-01
NestedLoopJoin(condition=[true], joinType=[inner]) : rowType = RecordType(ANY
p_brand, ANY p_type, ANY p_size, ANY p_partkey, BIGINT $f0, BIGINT $f1):
rowcount = 625000.0, cumulative cost = {4.4000002E7 rows, 4.77375032E8 cpu, 0.0
io, 81920.0 network, 0.0 memory}, id = 41705
04-03 SelectionVectorRemover :
rowType = RecordType(ANY p_brand, ANY p_type, ANY p_size, ANY p_partkey):
rowcount = 625000.0, cumulative cost = {4.0625E7 rows, 2.13125E8 cpu, 0.0 io,
0.0 network, 0.0 memory}, id = 41697
04-04
Filter(condition=[AND(<>($0, 'Brand#21'), NOT(LIKE($1, 'MEDIUM PLATED%')),
OR(=($2, 38), =($2, 2), =($2, 8), =($2, 31), =($2, 44), =($2, 5), =($2, 14),
=($2, 24)))]) : rowType = RecordType(ANY p_brand, ANY p_type, ANY p_size, ANY
p_partkey): rowcount = 625000.0, cumulative cost = {4.0E7 rows, 2.125E8 cpu,
0.0 io, 0.0 network, 0.0 memory}, id = 41696
04-05
Scan(groupscan=[ParquetGroupScan [entries=[ReadEntryWithPath
[path=maprfs:///tpchParquet/SF100/part]],
selectionRoot=maprfs:/tpchParquet/SF100/part, numFiles=1,
usedMetadataFile=false, columns=[`p_brand`, `p_type`, `p_size`, `p_partkey`]]])
: rowType = RecordType(ANY p_brand, ANY p_type, ANY p_size, ANY p_partkey):
rowcount = 2.0E7, cumulative cost = {2.0E7 rows, 8.0E7 cpu, 0.0 io, 0.0
network, 0.0 memory}, id = 41695
04-02 BroadcastExchange :
rowType = RecordType(BIGINT $f0, BIGINT $f1): rowcount = 1.0, cumulative cost =
{2750002.0 rows, 1.4250032E7 cpu, 0.0 io, 81920.0 network, 0.0 memory}, id =
41704
06-01 StreamAgg(group=[{}],
agg#0=[$SUM0($0)], agg#1=[$SUM0($1)]) : rowType = RecordType(BIGINT $f0, BIGINT
$f1): rowcount = 1.0, cumulative cost = {2750001.0 rows, 1.4250024E7 cpu, 0.0
io, 0.0 network, 0.0 memory}, id = 41703
06-02 StreamAgg(group=[{}],
agg#0=[COUNT()], agg#1=[COUNT($0)]) : rowType = RecordType(BIGINT $f0, BIGINT
$f1): rowcount = 1.0, cumulative cost = {2750000.0 rows, 1.425E7 cpu, 0.0 io,
0.0 network, 0.0 memory}, id = 41702
06-03 Project($f0=[$1],
$f1=[true]) : rowType = RecordType(ANY $f0, BOOLEAN $f1): rowcount = 250000.0,
cumulative cost = {2500000.0 rows, 8250000.0 cpu, 0.0 io, 0.0 network, 0.0
memory}, id = 41701
06-04
SelectionVectorRemover : rowType = RecordType(ANY s_comment, ANY s_suppkey):
rowcount = 250000.0, cumulative cost = {2250000.0 rows, 7250000.0 cpu, 0.0 io,
0.0 network, 0.0 memory}, id = 41700
06-05
Filter(condition=[LIKE($0, '%Customer%Complaints%')]) : rowType =
RecordType(ANY s_comment, ANY s_suppkey): rowcount = 250000.0, cumulative cost
= {2000000.0 rows, 7000000.0 cpu, 0.0 io, 0.0 network, 0.0 memory}, id = 41699
06-06
Scan(groupscan=[ParquetGroupScan [entries=[ReadEntryWithPath
[path=maprfs:///tpchParquet/SF100/supplier]],
selectionRoot=maprfs:/tpchParquet/SF100/supplier, numFiles=1,
usedMetadataFile=false, columns=[`s_comment`, `s_suppkey`]]]) : rowType =
RecordType(ANY s_comment, ANY s_suppkey): rowcount = 1000000.0, cumulative cost
= {1000000.0 rows, 2000000.0 cpu, 0.0 io, 0.0 network, 0.0 memory}, id = 41698
03-07 Project($f00=[$0], $f10=[$1]) :
rowType = RecordType(ANY $f00, BOOLEAN $f10): rowcount = 2500.0, cumulative
cost = {2777500.0 rows, 1.377E7 cpu, 0.0 io, 2.048E8 network, 4840000.0
memory}, id = 41716
03-09 BroadcastExchange : rowType =
RecordType(ANY $f0, BOOLEAN $f1): rowcount = 2500.0, cumulative cost =
{2777500.0 rows, 1.377E7 cpu, 0.0 io, 2.048E8 network, 4840000.0 memory}, id =
41715
05-01 HashAgg(group=[{0}],
agg#0=[MIN($1)]) : rowType = RecordType(ANY $f0, BOOLEAN $f1): rowcount =
2500.0, cumulative cost = {2775000.0 rows, 1.375E7 cpu, 0.0 io, 0.0 network,
4840000.0 memory}, id = 41714
05-02 HashAgg(group=[{0}],
agg#0=[MIN($1)]) : rowType = RecordType(ANY $f0, BOOLEAN $f1): rowcount =
25000.0, cumulative cost = {2750000.0 rows, 1.325E7 cpu, 0.0 io, 0.0 network,
4400000.0 memory}, id = 41713
05-03 Project($f0=[$1],
$f1=[true]) : rowType = RecordType(ANY $f0, BOOLEAN $f1): rowcount = 250000.0,
cumulative cost = {2500000.0 rows, 8250000.0 cpu, 0.0 io, 0.0 network, 0.0
memory}, id = 41712
05-04 SelectionVectorRemover :
rowType = RecordType(ANY s_comment, ANY s_suppkey): rowcount = 250000.0,
cumulative cost = {2250000.0 rows, 7250000.0 cpu, 0.0 io, 0.0 network, 0.0
memory}, id = 41711
05-05
Filter(condition=[LIKE($0, '%Customer%Complaints%')]) : rowType =
RecordType(ANY s_comment, ANY s_suppkey): rowcount = 250000.0, cumulative cost
= {2000000.0 rows, 7000000.0 cpu, 0.0 io, 0.0 network, 0.0 memory}, id = 41710
05-06
Scan(groupscan=[ParquetGroupScan [entries=[ReadEntryWithPath
[path=maprfs:///tpchParquet/SF100/supplier]],
selectionRoot=maprfs:/tpchParquet/SF100/supplier, numFiles=1,
usedMetadataFile=false, columns=[`s_comment`, `s_suppkey`]]]) : rowType =
RecordType(ANY s_comment, ANY s_suppkey): rowcount = 1000000.0, cumulative cost
= {1000000.0 rows, 2000000.0 cpu, 0.0 io, 0.0 network, 0.0 memory}, id = 41709
{code}
> Multiple TPCH queries failed due to OOM
> ---------------------------------------
>
> Key: DRILL-5669
> URL: https://issues.apache.org/jira/browse/DRILL-5669
> Project: Apache Drill
> Issue Type: Bug
> Components: Functions - Drill
> Environment: RHEL 6.4 2.6.32-358.el6.x86_64, 10+1 nodes cluster
> Reporter: Dechang Gu
> Assignee: Boaz Ben-Zvi
> Fix For: 1.11.0
>
>
> Running TPCH SF100 Parquet (and CSV) tests, multiple queries failed due to
> OOM. For example, Q16 hit the following error:
> {code}
> java.sql.SQLException: RESOURCE ERROR: One or more nodes ran out of memory
> while executing the query.
> Unable to allocate sv2 for 65536 records, and not enough batchGroups to spill.
> batchGroups.size 1
> spilledBatchGroups.size 0
> allocated memory 23500416
> allocator limit 20000000
> Fragment 1:11
> [Error Id: e58161a6-2383-48b1-a350-50db1b5408c6 on ucs-node10.perf.lab:31010]
> at
> org.apache.drill.jdbc.impl.DrillCursor.nextRowInternally(DrillCursor.java:489)
> at org.apache.drill.jdbc.impl.DrillCursor.next(DrillCursor.java:593)
> at
> org.apache.calcite.avatica.AvaticaResultSet.next(AvaticaResultSet.java:215)
> at
> org.apache.drill.jdbc.impl.DrillResultSetImpl.next(DrillResultSetImpl.java:140)
> at PipSQueak.fetchRows(PipSQueak.java:420)
> at PipSQueak.runTest(PipSQueak.java:116)
> at PipSQueak.main(PipSQueak.java:556)
> Caused by: org.apache.drill.common.exceptions.UserRemoteException: RESOURCE
> ERROR: One or more nodes ran out of memory while executing the query.
> Unable to allocate sv2 for 65536 records, and not enough batchGroups to spill.
> batchGroups.size 1
> spilledBatchGroups.size 0
> allocated memory 23500416
> allocator limit 20000000
> Fragment 1:11
> {code}
> And in drillbit.log:
> {code}
> 2017-07-12 11:34:11,670 ucs-node10.perf.lab
> [26999476-174e-98fd-e21e-fd53f79284c7:frag:1:11] INFO
> o.a.d.e.p.i.xsort.ExternalSortBatch - User Error Occurred: One or more nodes
> ran out of memory while executing the query.
> org.apache.drill.common.exceptions.UserException: RESOURCE ERROR: One or more
> nodes ran out of memory while executing the query.
> Unable to allocate sv2 for 65536 records, and not enough batchGroups to spill.
> batchGroups.size 1
> spilledBatchGroups.size 0
> allocated memory 23500416
> allocator limit 20000000
> [Error Id: e58161a6-2383-48b1-a350-50db1b5408c6 ]
> at
> org.apache.drill.common.exceptions.UserException$Builder.build(UserException.java:550)
> ~[drill-common-1.11.0-SNAPSHOT.jar:1.11.0-SNAPSHOT]
> at
> org.apache.drill.exec.physical.impl.xsort.ExternalSortBatch.newSV2(ExternalSortBatch.java:639)
> [drill-java-exec-1.11.0-SNAPSHOT.jar:1.11.0-SNAPSHOT]
> at
> org.apache.drill.exec.physical.impl.xsort.ExternalSortBatch.innerNext(ExternalSortBatch.java:381)
> [drill-java-exec-1.11.0-SNAPSHOT.jar:1.11.0-SNAPSHOT]
> at
> org.apache.drill.exec.record.AbstractRecordBatch.next(AbstractRecordBatch.java:162)
> [drill-java-exec-1.11.0-SNAPSHOT.jar:1.11.0-SNAPSHOT]
> at
> org.apache.drill.exec.record.AbstractRecordBatch.next(AbstractRecordBatch.java:119)
> [drill-java-exec-1.11.0-SNAPSHOT.jar:1.11.0-SNAPSHOT]
> at
> org.apache.drill.exec.record.AbstractRecordBatch.next(AbstractRecordBatch.java:109)
> [drill-java-exec-1.11.0-SNAPSHOT.jar:1.11.0-SNAPSHOT]
> at
> org.apache.drill.exec.physical.impl.aggregate.StreamingAggBatch.innerNext(StreamingAggBatch.java:140)
> [drill-java-exec-1.11.0-SNAPSHOT.jar:1.11.0-SNAPSHOT]
> at
> org.apache.drill.exec.record.AbstractRecordBatch.next(AbstractRecordBatch.java:162)
> [drill-java-exec-1.11.0-SNAPSHOT.jar:1.11.0-SNAPSHOT]
> at
> org.apache.drill.exec.physical.impl.BaseRootExec.next(BaseRootExec.java:105)
> [drill-java-exec-1.11.0-SNAPSHOT.jar:1.11.0-SNAPSHOT]
> at
> org.apache.drill.exec.physical.impl.partitionsender.PartitionSenderRootExec.innerNext(PartitionSenderRootExec.java:144)
> [drill-java-exec-1.11.0-SNAPSHOT.jar:1.11.0-SNAPSHOT]
> at
> org.apache.drill.exec.physical.impl.BaseRootExec.next(BaseRootExec.java:95)
> [drill-java-exec-1.11.0-SNAPSHOT.jar:1.11.0-SNAPSHOT]
> at
> org.apache.drill.exec.work.fragment.FragmentExecutor$1.run(FragmentExecutor.java:234)
> [drill-java-exec-1.11.0-SNAPSHOT.jar:1.11.0-SNAPSHOT]
> at
> org.apache.drill.exec.work.fragment.FragmentExecutor$1.run(FragmentExecutor.java:227)
> [drill-java-exec-1.11.0-SNAPSHOT.jar:1.11.0-SNAPSHOT]
> at java.security.AccessController.doPrivileged(Native Method)
> [na:1.7.0_65]
> at javax.security.auth.Subject.doAs(Subject.java:415) [na:1.7.0_65]
> at
> org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1595)
> [hadoop-common-2.7.0-mapr-1607.jar:na]
> at
> org.apache.drill.exec.work.fragment.FragmentExecutor.run(FragmentExecutor.java:227)
> [drill-java-exec-1.11.0-SNAPSHOT.jar:1.11.0-SNAPSHOT]
> at
> org.apache.drill.common.SelfCleaningRunnable.run(SelfCleaningRunnable.java:38)
> [drill-common-1.11.0-SNAPSHOT.jar:1.11.0-SNAPSHOT]
> at
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
> [na:1.7.0_65]
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
> [na:1.7.0_65]
> at java.lang.Thread.run(Thread.java:745) [na:1.7.0_65]
> {code}
--
This message was sent by Atlassian JIRA
(v6.4.14#64029)