Please find the query plans for both queries below. FYI: I am not seeing any planning difference between these two queries except cost.
/******************************** Query on GZ ****************************************/ 0: jdbc:drill:> explain plan for select channelid, count(serverTime) from dfs.`/tmp/stest-gz/kafka_3_25-Jul-2016-12a.json.gz` group by channelid ; +------+------+ | text | json | +------+------+ | 00-00 Screen 00-01 Project(channelid=[$0], EXPR$1=[$1]) 00-02 UnionExchange 01-01 HashAgg(group=[{0}], EXPR$1=[$SUM0($1)]) 01-02 Project(channelid=[$0], EXPR$1=[$1]) 01-03 HashToRandomExchange(dist0=[[$0]]) 02-01 UnorderedMuxExchange 03-01 Project(channelid=[$0], EXPR$1=[$1], E_X_P_R_H_A_S_H_F_I_E_L_D=[hash32AsDouble($0)]) 03-02 HashAgg(group=[{0}], EXPR$1=[COUNT($1)]) 03-03 Scan(groupscan=[EasyGroupScan [selectionRoot=hdfs://namenode:9000/tmp/stest-gz/kafka_3_25-Jul-2016-12a.json.gz, numFiles=1, columns=[`channelid`, `serverTime`], files=[hdfs://namenode:9000/tmp/stest-gz/kafka_3_25-Jul-2016-12a.json.gz]]]) | { "head" : { "version" : 1, "generator" : { "type" : "ExplainHandler", "info" : "" }, "type" : "APACHE_DRILL_PHYSICAL", "options" : [ ], "queue" : 0, "resultMode" : "EXEC" }, "graph" : [ { "pop" : "fs-scan", "@id" : 196611, "userName" : "hadoop", "files" : [ "hdfs://namenode:9000/tmp/stest-gz/kafka_3_25-Jul-2016-12a.json.gz" ], "storage" : { "type" : "file", "enabled" : true, "connection" : "hdfs://namenode:9000", "config" : null, "workspaces" : { "root" : { "location" : "/tmp/", "writable" : true, "defaultInputFormat" : null }, "tmp" : { "location" : "/tmp", "writable" : true, "defaultInputFormat" : null } }, "formats" : { "psv" : { "type" : "text", "extensions" : [ "tbl" ], "delimiter" : "|" }, "csv" : { "type" : "text", "extensions" : [ "csv" ], "delimiter" : "," }, "tsv" : { "type" : "text", "extensions" : [ "tsv" ], "delimiter" : "\t" }, "parquet" : { "type" : "parquet" }, "json" : { "type" : "json", "extensions" : [ "json" ] }, "avro" : { "type" : "avro" } } }, "format" : { "type" : "json", "extensions" : [ "json" ] }, "columns" : [ "`channelid`", "`serverTime`" ], 
"selectionRoot" : "hdfs://namenode:9000/tmp/stest-gz/kafka_3_25-Jul-2016-12a.json.gz", "cost" : 1800981.0 }, { "pop" : "hash-aggregate", "@id" : 196610, "child" : 196611, "cardinality" : 1.0, "initialAllocation" : 1000000, "maxAllocation" : 10000000000, "groupByExprs" : [ { "ref" : "`channelid`", "expr" : "`channelid`" } ], "aggrExprs" : [ { "ref" : "`EXPR$1`", "expr" : "count(`serverTime`) " } ], "cost" : 900490.5 }, { "pop" : "project", "@id" : 196609, "exprs" : [ { "ref" : "`channelid`", "expr" : "`channelid`" }, { "ref" : "`EXPR$1`", "expr" : "`EXPR$1`" }, { "ref" : "`E_X_P_R_H_A_S_H_F_I_E_L_D`", "expr" : "hash32asdouble(`channelid`) " } ], "child" : 196610, "initialAllocation" : 1000000, "maxAllocation" : 10000000000, "cost" : 180098.1 }, { "pop" : "unordered-mux-exchange", "@id" : 131073, "child" : 196609, "initialAllocation" : 1000000, "maxAllocation" : 10000000000, "cost" : 180098.1 }, { "pop" : "hash-to-random-exchange", "@id" : 65539, "child" : 131073, "expr" : "`E_X_P_R_H_A_S_H_F_I_E_L_D`", "initialAllocation" : 1000000, "maxAllocation" : 10000000000, "cost" : 180098.1 }, { "pop" : "project", "@id" : 65538, "exprs" : [ { "ref" : "`channelid`", "expr" : "`channelid`" }, { "ref" : "`EXPR$1`", "expr" : "`EXPR$1`" } ], "child" : 65539, "initialAllocation" : 1000000, "maxAllocation" : 10000000000, "cost" : 180098.1 }, { "pop" : "hash-aggregate", "@id" : 65537, "child" : 65538, "cardinality" : 1.0, "initialAllocation" : 1000000, "maxAllocation" : 10000000000, "groupByExprs" : [ { "ref" : "`channelid`", "expr" : "`channelid`" } ], "aggrExprs" : [ { "ref" : "`EXPR$1`", "expr" : "$sum0(`EXPR$1`) " } ], "cost" : 90049.05 }, { "pop" : "union-exchange", "@id" : 2, "child" : 65537, "initialAllocation" : 1000000, "maxAllocation" : 10000000000, "cost" : 18009.81 }, { "pop" : "project", "@id" : 1, "exprs" : [ { "ref" : "`channelid`", "expr" : "`channelid`" }, { "ref" : "`EXPR$1`", "expr" : "`EXPR$1`" } ], "child" : 2, "initialAllocation" : 1000000, "maxAllocation" : 
10000000000, "cost" : 18009.81 }, { "pop" : "screen", "@id" : 0, "child" : 1, "initialAllocation" : 1000000, "maxAllocation" : 10000000000, "cost" : 18009.81 } ] } | +------+------+ 1 row selected (0.729 seconds) 0: jdbc:drill:> /******************************** Query on BZ ****************************************/ 0: jdbc:drill:> explain plan for select channelid, count(serverTime) from dfs.`/tmp/stest-bz2/kafka_3_25-Jul-2016-12a.json.bz2` group by channelid ; +------+------+ | text | json | +------+------+ | 00-00 Screen 00-01 Project(channelid=[$0], EXPR$1=[$1]) 00-02 UnionExchange 01-01 HashAgg(group=[{0}], EXPR$1=[$SUM0($1)]) 01-02 Project(channelid=[$0], EXPR$1=[$1]) 01-03 HashToRandomExchange(dist0=[[$0]]) 02-01 UnorderedMuxExchange 03-01 Project(channelid=[$0], EXPR$1=[$1], E_X_P_R_H_A_S_H_F_I_E_L_D=[hash32AsDouble($0)]) 03-02 HashAgg(group=[{0}], EXPR$1=[COUNT($1)]) 03-03 Scan(groupscan=[EasyGroupScan [selectionRoot=hdfs://namenode:9000/tmp/stest-bz2/kafka_3_25-Jul-2016-12a.json.bz2, numFiles=1, columns=[`channelid`, `serverTime`], files=[hdfs://namenode:9000/tmp/stest-bz2/kafka_3_25-Jul-2016-12a.json.bz2]]]) | { "head" : { "version" : 1, "generator" : { "type" : "ExplainHandler", "info" : "" }, "type" : "APACHE_DRILL_PHYSICAL", "options" : [ ], "queue" : 0, "resultMode" : "EXEC" }, "graph" : [ { "pop" : "fs-scan", "@id" : 196611, "userName" : "hadoop", "files" : [ "hdfs://namenode:9000/tmp/stest-bz2/kafka_3_25-Jul-2016-12a.json.bz2" ], "storage" : { "type" : "file", "enabled" : true, "connection" : "hdfs://namenode:9000", "config" : null, "workspaces" : { "root" : { "location" : "/tmp/", "writable" : true, "defaultInputFormat" : null }, "tmp" : { "location" : "/tmp", "writable" : true, "defaultInputFormat" : null } }, "formats" : { "psv" : { "type" : "text", "extensions" : [ "tbl" ], "delimiter" : "|" }, "csv" : { "type" : "text", "extensions" : [ "csv" ], "delimiter" : "," }, "tsv" : { "type" : "text", "extensions" : [ "tsv" ], "delimiter" : "\t" }, 
"parquet" : { "type" : "parquet" }, "json" : { "type" : "json", "extensions" : [ "json" ] }, "avro" : { "type" : "avro" } } }, "format" : { "type" : "json", "extensions" : [ "json" ] }, "columns" : [ "`channelid`", "`serverTime`" ], "selectionRoot" : "hdfs://namenode:9000/tmp/stest-bz2/kafka_3_25-Jul-2016-12a.json.bz2", "cost" : 1148224.0 }, { "pop" : "hash-aggregate", "@id" : 196610, "child" : 196611, "cardinality" : 1.0, "initialAllocation" : 1000000, "maxAllocation" : 10000000000, "groupByExprs" : [ { "ref" : "`channelid`", "expr" : "`channelid`" } ], "aggrExprs" : [ { "ref" : "`EXPR$1`", "expr" : "count(`serverTime`) " } ], "cost" : 574112.0 }, { "pop" : "project", "@id" : 196609, "exprs" : [ { "ref" : "`channelid`", "expr" : "`channelid`" }, { "ref" : "`EXPR$1`", "expr" : "`EXPR$1`" }, { "ref" : "`E_X_P_R_H_A_S_H_F_I_E_L_D`", "expr" : "hash32asdouble(`channelid`) " } ], "child" : 196610, "initialAllocation" : 1000000, "maxAllocation" : 10000000000, "cost" : 114822.4 }, { "pop" : "unordered-mux-exchange", "@id" : 131073, "child" : 196609, "initialAllocation" : 1000000, "maxAllocation" : 10000000000, "cost" : 114822.4 }, { "pop" : "hash-to-random-exchange", "@id" : 65539, "child" : 131073, "expr" : "`E_X_P_R_H_A_S_H_F_I_E_L_D`", "initialAllocation" : 1000000, "maxAllocation" : 10000000000, "cost" : 114822.4 }, { "pop" : "project", "@id" : 65538, "exprs" : [ { "ref" : "`channelid`", "expr" : "`channelid`" }, { "ref" : "`EXPR$1`", "expr" : "`EXPR$1`" } ], "child" : 65539, "initialAllocation" : 1000000, "maxAllocation" : 10000000000, "cost" : 114822.4 }, { "pop" : "hash-aggregate", "@id" : 65537, "child" : 65538, "cardinality" : 1.0, "initialAllocation" : 1000000, "maxAllocation" : 10000000000, "groupByExprs" : [ { "ref" : "`channelid`", "expr" : "`channelid`" } ], "aggrExprs" : [ { "ref" : "`EXPR$1`", "expr" : "$sum0(`EXPR$1`) " } ], "cost" : 57411.2 }, { "pop" : "union-exchange", "@id" : 2, "child" : 65537, "initialAllocation" : 1000000, "maxAllocation" : 
10000000000, "cost" : 11482.24 }, { "pop" : "project", "@id" : 1, "exprs" : [ { "ref" : "`channelid`", "expr" : "`channelid`" }, { "ref" : "`EXPR$1`", "expr" : "`EXPR$1`" } ], "child" : 2, "initialAllocation" : 1000000, "maxAllocation" : 10000000000, "cost" : 11482.24 }, { "pop" : "screen", "@id" : 0, "child" : 1, "initialAllocation" : 1000000, "maxAllocation" : 10000000000, "cost" : 11482.24 } ] } | +------+------+ 1 row selected (0.381 seconds) 0: jdbc:drill:> On Thu, Aug 4, 2016 at 3:07 PM, Khurram Faraaz <kfar...@maprtech.com> wrote: > Can you please do an explain plan over the two aggregate queries. That way > we can know where most of the time is being spent, is it in the query > planning phase or is it query execution that is taking longer. Please share > the query plans and the time taken for those explain plan statements. > > On Mon, Aug 1, 2016 at 3:46 PM, Shankar Mane <shankar.m...@games24x7.com> > wrote: > > > It is plain json (1 json per line). > > Each json message size = ~4kb > > no. of json messages = ~5 Millions. > > > > store.parquet.compression = snappy ( i don't think, this parameter get > > used. As I am querying select only.) > > > > > > On Mon, Aug 1, 2016 at 3:27 PM, Khurram Faraaz <kfar...@maprtech.com> > > wrote: > > > > > What is the data format within those .gz and .bz2 files ? It is parquet > > or > > > JSON or plain text (CSV) ? > > > Also, what was this config parameter `store.parquet.compression` set > to, > > > when ypu ran your test ? > > > > > > - Khurram > > > > > > On Sun, Jul 31, 2016 at 11:17 PM, Shankar Mane < > > shankar.m...@games24x7.com > > > > > > > wrote: > > > > > > > Awaiting for response.. > > > > > > > > On 30-Jul-2016 3:20 PM, "Shankar Mane" <shankar.m...@games24x7.com> > > > wrote: > > > > > > > > > > > > > > > > > > I am Comparing Querying speed between GZ and BZ2. 
> > > > > > > > > > Below are the 2 files and their sizes (This 2 files have same > data): > > > > > kafka_3_25-Jul-2016-12a.json.gz = 1.8G > > > > > kafka_3_25-Jul-2016-12a.json.bz2= 1.1G > > > > > > > > > > > > > > > > > > > > Results: > > > > > > > > > > 0: jdbc:drill:> select channelid, count(serverTime) from > > > > dfs.`/tmp/stest-gz/kafka_3_25-Jul-2016-12a.json.gz` group by > channelid > > ; > > > > > +------------+----------+ > > > > > | channelid | EXPR$1 | > > > > > +------------+----------+ > > > > > | 3 | 977134 | > > > > > | 0 | 836850 | > > > > > | 2 | 3202854 | > > > > > +------------+----------+ > > > > > 3 rows selected (86.034 seconds) > > > > > > > > > > > > > > > > > > > > 0: jdbc:drill:> select channelid, count(serverTime) from > > > > dfs.`/tmp/stest-bz2/kafka_3_25-Jul-2016-12a.json.bz2` group by > > channelid > > > ; > > > > > +------------+----------+ > > > > > | channelid | EXPR$1 | > > > > > +------------+----------+ > > > > > | 3 | 977134 | > > > > > | 0 | 836850 | > > > > > | 2 | 3202854 | > > > > > +------------+----------+ > > > > > 3 rows selected (459.079 seconds) > > > > > > > > > > > > > > > > > > > > Questions: > > > > > 1. As per above Test: Gz is 6x fast than Bz2. why is that ? > > > > > 2. How can we speed to up Bz2. Are there any configuration to do ? > > > > > 3. As bz2 is splittable format, How drill using it ? > > > > > > > > > > > > > > > regards, > > > > > shankar > > > > > > > > > >