all, please help me in giving suggestions on what areas i can look into why the query planning time is taking so long for files which are local to the drill machines. I have the same directory structure copied on all the 5 nodes of the cluster. I am accessing the source files using out of the box dfs storage plugin.
Query planning time is approx 30 secs Query execution time is apprx 1.5 secs Regards, Projjwal ---------- Forwarded message ---------- From: PROJJWAL SAHA <proj.s...@gmail.com> Date: Fri, Mar 3, 2017 at 5:06 PM Subject: Minimise query plan time for dfs plugin for local file system on tsv file To: user@drill.apache.org Hello all, I am quering select * from dfs.xxx where yyy (filter condition) I am using dfs storage plugin that comes out of the box from drill on a 1GB file, local to the drill cluster. The 1GB file is split into 10 files of 100 MB each. As expected I see 11 minor and 2 major fagments. The drill cluster is 5 nodes cluster with 4 cores, 32 GB each. One observation is that the query plan time is more than 30 seconds. I ran the explain plan query to validate this. The query execution time is 2 secs. total time taken is 32secs I wanted to understand how can i minimise the query plan time. Suggestions ? Is the time taken described above expected ? Attached is result from explain plan query Regards, Projjwal
+------+------+ | text | json | +------+------+ | 00-00 Screen 00-01 Project(*=[$0]) 00-02 UnionExchange 01-01 Project(T2¦¦*=[$0]) 01-02 SelectionVectorRemover 01-03 Filter(condition=[AND(=($1, '41'), =($2, '568'))]) 01-04 Project(T2¦¦*=[$0], ORDER_ID=[$1], CUSTOMER_ID=[$2]) 01-05 Scan(groupscan=[EasyGroupScan [selectionRoot=file:/scratch/localdisk/drill/testdata/Cust_1G_tsv, numFiles=10, columns=[`*`], files=[file:/scratch/localdisk/drill/testdata/Cust_1G_tsv/4.tsv, file:/scratch/localdisk/drill/testdata/Cust_1G_tsv/5.tsv, file:/scratch/localdisk/drill/testdata/Cust_1G_tsv/10.tsv, file:/scratch/localdisk/drill/testdata/Cust_1G_tsv/2.tsv, file:/scratch/localdisk/drill/testdata/Cust_1G_tsv/3.tsv, file:/scratch/localdisk/drill/testdata/Cust_1G_tsv/1.tsv, file:/scratch/localdisk/drill/testdata/Cust_1G_tsv/7.tsv, file:/scratch/localdisk/drill/testdata/Cust_1G_tsv/6.tsv, file:/scratch/localdisk/drill/testdata/Cust_1G_tsv/8.tsv, file:/scratch/localdisk/drill/testdata/Cust_1G_tsv/9.tsv]]]) | { "head" : { "version" : 1, "generator" : { "type" : "ExplainHandler", "info" : "" }, "type" : "APACHE_DRILL_PHYSICAL", "options" : [ ], "queue" : 0, "resultMode" : "EXEC" }, "graph" : [ { "pop" : "fs-scan", "@id" : 65541, "userName" : "optitest", "files" : [ "file:/scratch/localdisk/drill/testdata/Cust_1G_tsv/4.tsv", "file:/scratch/localdisk/drill/testdata/Cust_1G_tsv/5.tsv", "file:/scratch/localdisk/drill/testdata/Cust_1G_tsv/10.tsv", "file:/scratch/localdisk/drill/testdata/Cust_1G_tsv/2.tsv", "file:/scratch/localdisk/drill/testdata/Cust_1G_tsv/3.tsv", "file:/scratch/localdisk/drill/testdata/Cust_1G_tsv/1.tsv", "file:/scratch/localdisk/drill/testdata/Cust_1G_tsv/7.tsv", "file:/scratch/localdisk/drill/testdata/Cust_1G_tsv/6.tsv", "file:/scratch/localdisk/drill/testdata/Cust_1G_tsv/8.tsv", "file:/scratch/localdisk/drill/testdata/Cust_1G_tsv/9.tsv" ], "storage" : { "type" : "file", "enabled" : true, "connection" : "file:///", "config" : null, "workspaces" : { "root" : { "location" : "/", "writable" : true, "defaultInputFormat" : null }, "tpch9m" : { "location" : "/user/hive/warehouse/tpch9m.db", "writable" : true, "defaultInputFormat" : null }, "taxi1m" : { "location" : "/user/hive/warehouse/taxi.db/taxi_enriched_sukhdeep_1m", "writable" : true, "defaultInputFormat" : null }, "tmp" : { "location" : "/tmp", "writable" : true, "defaultInputFormat" : null } }, "formats" : { "psv" : { "type" : "text", "extensions" : [ "tbl" ], "delimiter" : "|" }, "csv" : { "type" : "text", "extensions" : [ "csv" ], "delimiter" : "," }, "tsv" : { "type" : "text", "extensions" : [ "tsv" ], "extractHeader" : true, "delimiter" : "\t" }, "parquet" : { "type" : "parquet" }, "json" : { "type" : "json", "extensions" : [ "json" ] }, "avro" : { "type" : "avro" }, "sequencefile" : { "type" : "sequencefile", "extensions" : [ "seq" ] }, "csvh" : { "type" : "text", "extensions" : [ "csvh" ], "extractHeader" : true, "delimiter" : "," } } }, "format" : { "type" : "text", "extensions" : [ "tsv" ], "extractHeader" : true, "delimiter" : "\t" }, "columns" : [ "`*`" ], "selectionRoot" : "file:/scratch/localdisk/drill/testdata/Cust_1G_tsv", "cost" : 1.0704467E7 }, { "pop" : "project", "@id" : 65540, "exprs" : [ { "ref" : "`T2¦¦*`", "expr" : "`*`" }, { "ref" : "`ORDER_ID`", "expr" : "`ORDER_ID`" }, { "ref" : "`CUSTOMER_ID`", "expr" : "`CUSTOMER_ID`" } ], "child" : 65541, "initialAllocation" : 1000000, "maxAllocation" : 10000000000, "cost" : 1.0704467E7 }, { "pop" : "filter", "@id" : 65539, "child" : 65540, "expr" : "booleanAnd(equal(`ORDER_ID`, '41') , equal(`CUSTOMER_ID`, '568') ) ", "initialAllocation" : 1000000, "maxAllocation" : 10000000000, "cost" : 240850.50749999998 }, { "pop" : "selection-vector-remover", "@id" : 65538, "child" : 65539, "initialAllocation" : 1000000, "maxAllocation" : 10000000000, "cost" : 240850.50749999998 }, { "pop" : "project", "@id" : 65537, "exprs" : [ { "ref" : "`T2¦¦*`", "expr" : "`T2¦¦*`" } ], "child" : 65538, "initialAllocation" : 1000000, "maxAllocation" : 10000000000, "cost" : 240850.50749999998 }, { "pop" : "union-exchange", "@id" : 2, "child" : 65537, "initialAllocation" : 1000000, "maxAllocation" : 10000000000, "cost" : 240850.50749999998 }, { "pop" : "project", "@id" : 1, "exprs" : [ { "ref" : "`*`", "expr" : "`T2¦¦*`" } ], "child" : 2, "initialAllocation" : 1000000, "maxAllocation" : 10000000000, "cost" : 240850.50749999998 }, { "pop" : "screen", "@id" : 0, "child" : 1, "initialAllocation" : 1000000, "maxAllocation" : 10000000000, "cost" : 240850.50749999998 } ] } | +------+------+ 1 row selected (30.126 seconds)