Chun Chang created DRILL-1434:
---------------------------------
Summary: count() on null value gives zero
Key: DRILL-1434
URL: https://issues.apache.org/jira/browse/DRILL-1434
Project: Apache Drill
Issue Type: Bug
Components: Functions - Drill
Affects Versions: 0.6.0
Reporter: Chun Chang
Code base:
#Fri Sep 12 14:08:02 PDT 2014
git.commit.id.abbrev=9e16466
I have a parquet file (tpcds data) which contains null values in one column. The
total count of the column is:
0: jdbc:drill:schema=dfs> select count(ss_quantity) from
`tpcds/p1/store_sales.parquet`;
+------------+
| EXPR$0 |
+------------+
| 2880404 |
+------------+
The count excluding null values is:
0: jdbc:drill:schema=dfs> select count(ss_quantity) from
`tpcds/p1/store_sales.parquet` where ss_quantity is not null;
+------------+
| EXPR$0 |
+------------+
| 2750408 |
+------------+
But the count for null values is zero:
0: jdbc:drill:schema=dfs> select count(ss_quantity) from
`tpcds/p1/store_sales.parquet` where ss_quantity is null;
+------------+
| EXPR$0 |
+------------+
| 0 |
+------------+
Here is what the physical plan looks like for this query:
0: jdbc:drill:schema=dfs> explain plan for select count(ss_quantity) from
`tpcds/p1/store_sales.parquet` where ss_quantity is null;
+------------+------------+
| text | json |
+------------+------------+
| 00-00 Screen
00-01 StreamAgg(group=[{}], EXPR$0=[COUNT($0)])
00-02 Filter(condition=[IS NULL($0)])
00-03 ProducerConsumer
00-04 Scan(groupscan=[ParquetGroupScan [entries=[ReadEntryWithPath
[path=maprfs:/user/root/mondrian/tpcds/p1/store_sales.parquet]],
selectionRoot=/user/root/mondrian/tpcds/p1/store_sales.parquet,
columns=[SchemaPath [`ss_quantity`]]]])
| {
"head" : {
"version" : 1,
"generator" : {
"type" : "ExplainHandler",
"info" : ""
},
"type" : "APACHE_DRILL_PHYSICAL",
"options" : [ ],
"queue" : 0,
"resultMode" : "EXEC"
},
"graph" : [ {
"pop" : "parquet-scan",
"@id" : 4,
"entries" : [ {
"path" : "maprfs:/user/root/mondrian/tpcds/p1/store_sales.parquet"
} ],
"storage" : {
"type" : "file",
"enabled" : true,
"connection" : "maprfs:///",
"workspaces" : {
"default" : {
"location" : "/user/root/mondrian/",
"writable" : true,
"storageformat" : null
},
"home" : {
"location" : "/",
"writable" : false,
"storageformat" : null
},
"root" : {
"location" : "/",
"writable" : false,
"storageformat" : null
},
"abhi" : {
"location" : "/tables",
"writable" : true,
"storageformat" : "csv"
},
"chun" : {
"location" : "/drill/testdata/chun/",
"writable" : false,
"storageformat" : null
},
"tmp" : {
"location" : "/tmp",
"writable" : true,
"storageformat" : "csv"
}
},
"formats" : {
"psv" : {
"type" : "text",
"extensions" : [ "tbl" ],
"delimiter" : "|"
},
"csv" : {
"type" : "text",
"extensions" : [ "csv" ],
"delimiter" : ","
},
"tsv" : {
"type" : "text",
"extensions" : [ "tsv" ],
"delimiter" : "\t"
},
"parquet" : {
"type" : "parquet"
},
"json" : {
"type" : "json"
}
}
},
"format" : {
"type" : "parquet"
},
"columns" : [ "`ss_quantity`" ],
"selectionRoot" : "/user/root/mondrian/tpcds/p1/store_sales.parquet",
"cost" : 2880404.0
}, {
"pop" : "producer-consumer",
"@id" : 3,
"child" : 4,
"size" : 10,
"initialAllocation" : 1000000,
"maxAllocation" : 10000000000,
"cost" : 2880404.0
}, {
"pop" : "filter",
"@id" : 2,
"child" : 3,
"expr" : "isnull(`ss_quantity`) ",
"initialAllocation" : 1000000,
"maxAllocation" : 10000000000,
"cost" : 720101.0
}, {
"pop" : "streaming-aggregate",
"@id" : 1,
"child" : 2,
"keys" : [ ],
"exprs" : [ {
"ref" : "`EXPR$0`",
"expr" : "count(`ss_quantity`) "
} ],
"initialAllocation" : 1000000,
"maxAllocation" : 10000000000,
"cost" : 1.0
}, {
"pop" : "screen",
"@id" : 0,
"child" : 1,
"initialAllocation" : 1000000,
"maxAllocation" : 10000000000,
"cost" : 72010.1
} ]
} |
+------------+------------+
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)