[
https://issues.apache.org/jira/browse/HIVE-7982?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Mostafa Mokhtar updated HIVE-7982:
----------------------------------
Description:
Now explain for Q17 is back in the 12 second range, I checked the queries
issues to MySQL and they are very different than before
on August 15 explain was completing in under 5 seconds and we issued the
following queries :
{code}
select "COLUMN_NAME", "COLUMN_TYPE", min("LONG_LOW_VALUE"),
max("LONG_HIGH_VALUE"), min("DOUBLE_LOW_VALUE"), max("DOUBLE_HIGH_VALUE"),
min("BIG_DECIMAL_LOW_VALUE"), max("BIG_DECIMAL_HIGH_VALUE"), sum("NUM_NULLS"),
max("NUM_DISTINCTS"), max("AVG_COL_LEN"), max("MAX_COL_LEN"), sum("NUM_TRUES"),
sum("NUM_FALSES") from "PART_COL_STATS" where "DB_NAME" =
'tpcds_bin_partitioned_orc_30000' and "TABLE_NAME" = 'store_returns' and
"COLUMN_NAME" in ('sr_item_sk','sr_customer_sk','sr_ticket_number') AND
"PARTITION_NAME" in
('sr_returned_date=1998-01-06','sr_returned_date=1998-01-07',..'sr_returned_date=2003-07-01')
group by "COLUMN_NAME", "COLUMN_TYPE";
select "COLUMN_NAME", "COLUMN_TYPE", min("LONG_LOW_VALUE"),
max("LONG_HIGH_VALUE"), min("DOUBLE_LOW_VALUE"), max("DOUBLE_HIGH_VALUE"),
min("BIG_DECIMAL_LOW_VALUE"), max("BIG_DECIMAL_HIGH_VALUE"), sum("NUM_NULLS"),
max("NUM_DISTINCTS"), max("AVG_COL_LEN"), max("MAX_COL_LEN"), sum("NUM_TRUES"),
sum("NUM_FALSES") from "PART_COL_STATS" where "DB_NAME" =
'tpcds_bin_partitioned_orc_30000' and "TABLE_NAME" = 'store_returns' and
"COLUMN_NAME" in ('sr_returned_date_sk') AND "PARTITION_NAME" in
('sr_returned_date=1998-01-06'..'sr_returned_date=2003-07-01') group by
"COLUMN_NAME", "COLUMN_TYPE"
{code}
Currently explain Q17 takes 11 seconds and the queries sent to MySQL are very
inefficient because
1) They no longer do the aggregation on MySQL and get a row per partition
2) There is a query per stats K,V pair so the number of queries is up by 9x
{code}
select COLUMN_NAME, COLUMN_TYPE, count(PARTITION_NAME) from
PART_COL_STATS where DB_NAME = 'tpcds_bin_partitioned_orc_200' and TABLE_NAME =
'store_returns' and COLUMN_NAME in
('sr_item_sk','sr_customer_sk','sr_ticket_number') and PARTITION_NAME in
('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') group by
COLUMN_NAME, COLUMN_TYPE
select COLUMN_NAME, sum(NUM_NULLS), sum(NUM_TRUES),
sum(NUM_FALSES) from PART_COL_STATS where DB_NAME =
'tpcds_bin_partitioned_orc_200' and TABLE_NAME = 'store_returns' and
COLUMN_NAME in ('sr_customer_sk','sr_item_sk','sr_ticket_number') and
PARTITION_NAME in ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01')
group by COLUMN_NAME
select LONG_LOW_VALUE,PARTITION_NAME from PART_COL_STATS where
DB_NAME = 'tpcds_bin_partitioned_orc_200' and TABLE_NAME = 'store_returns' and
COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME in
('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by
'LONG_LOW_VALUE'
select LONG_HIGH_VALUE,PARTITION_NAME from PART_COL_STATS where
DB_NAME = 'tpcds_bin_partitioned_orc_200' and TABLE_NAME = 'store_returns' and
COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME in
('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by
'LONG_HIGH_VALUE'
select DOUBLE_LOW_VALUE,PARTITION_NAME from PART_COL_STATS
where DB_NAME = 'tpcds_bin_partitioned_orc_200' and TABLE_NAME =
'store_returns' and COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME in
('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by
'DOUBLE_LOW_VALUE'
select DOUBLE_HIGH_VALUE,PARTITION_NAME from PART_COL_STATS
where DB_NAME = 'tpcds_bin_partitioned_orc_200' and TABLE_NAME =
'store_returns' and COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME in
('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by
'DOUBLE_HIGH_VALUE'
select BIG_DECIMAL_LOW_VALUE,PARTITION_NAME from PART_COL_STATS
where DB_NAME = 'tpcds_bin_partitioned_orc_200' and TABLE_NAME =
'store_returns' and COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME in
('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by
'BIG_DECIMAL_LOW_VALUE'
select BIG_DECIMAL_HIGH_VALUE,PARTITION_NAME from
PART_COL_STATS where DB_NAME = 'tpcds_bin_partitioned_orc_200' and TABLE_NAME =
'store_returns' and COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME in
('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by
'BIG_DECIMAL_HIGH_VALUE'
select NUM_DISTINCTS,PARTITION_NAME from PART_COL_STATS where
DB_NAME = 'tpcds_bin_partitioned_orc_200' and TABLE_NAME = 'store_returns' and
COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME in
('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by
'NUM_DISTINCTS'
select AVG_COL_LEN,PARTITION_NAME from PART_COL_STATS where
DB_NAME = 'tpcds_bin_partitioned_orc_200' and TABLE_NAME = 'store_returns' and
COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME in
('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by
'AVG_COL_LEN'
select MAX_COL_LEN,PARTITION_NAME from PART_COL_STATS where
DB_NAME = 'tpcds_bin_partitioned_orc_200' and TABLE_NAME = 'store_returns' and
COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME in
('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by
'MAX_COL_LEN'
select LONG_LOW_VALUE,PARTITION_NAME from PART_COL_STATS where
DB_NAME = 'tpcds_bin_partitioned_orc_200' and TABLE_NAME = 'store_returns' and
COLUMN_NAME in ('sr_item_sk') and PARTITION_NAME in
('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by
'LONG_LOW_VALUE'
select LONG_HIGH_VALUE,PARTITION_NAME from PART_COL_STATS where
DB_NAME = 'tpcds_bin_partitioned_orc_200' and TABLE_NAME = 'store_returns' and
COLUMN_NAME in ('sr_item_sk') and PARTITION_NAME in
('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by
'LONG_HIGH_VALUE'
{code}
was:
With CBO we need the correct set of indexes to provide an efficient Read/Write
access.
These indexes improve performance of Explain plan and Analyzed table by 60% and
300%.
{code}
MySQL
CREATE INDEX PART_COL_STATS_N50 ON PART_COL_STATS
(DB_NAME,TABLE_NAME,COLUMN_NAME) USING BTREE;
MsSQL
CREATE INDEX PART_COL_STATS_N50 ON PART_COL_STATS
(DB_NAME,TABLE_NAME,COLUMN_NAME);
Oracle
CREATE INDEX PART_COL_STATS_N50 ON PART_COL_STATS
(DB_NAME,TABLE_NAME,COLUMN_NAME);
Postgres
CREATE INDEX "PART_COL_STATS_N50" ON "PART_COL_STATS" USING btree
("DB_NAME","TABLE_NAME","COLUMN_NAME");
{code}
> Regression in explain with CBO enabled due to issuing query per K,V for the
> stats
> ---------------------------------------------------------------------------------
>
> Key: HIVE-7982
> URL: https://issues.apache.org/jira/browse/HIVE-7982
> Project: Hive
> Issue Type: Bug
> Components: CBO
> Affects Versions: 0.14.0
> Reporter: Mostafa Mokhtar
> Assignee: Ashutosh Chauhan
> Fix For: 0.14.0
>
>
> Now explain for Q17 is back in the 12 second range, I checked the queries
> issues to MySQL and they are very different than before
> on August 15 explain was completing in under 5 seconds and we issued the
> following queries :
> {code}
> select "COLUMN_NAME", "COLUMN_TYPE", min("LONG_LOW_VALUE"),
> max("LONG_HIGH_VALUE"), min("DOUBLE_LOW_VALUE"), max("DOUBLE_HIGH_VALUE"),
> min("BIG_DECIMAL_LOW_VALUE"), max("BIG_DECIMAL_HIGH_VALUE"),
> sum("NUM_NULLS"), max("NUM_DISTINCTS"), max("AVG_COL_LEN"),
> max("MAX_COL_LEN"), sum("NUM_TRUES"), sum("NUM_FALSES") from "PART_COL_STATS"
> where "DB_NAME" = 'tpcds_bin_partitioned_orc_30000' and "TABLE_NAME" =
> 'store_returns' and "COLUMN_NAME" in
> ('sr_item_sk','sr_customer_sk','sr_ticket_number') AND "PARTITION_NAME" in
> ('sr_returned_date=1998-01-06','sr_returned_date=1998-01-07',..'sr_returned_date=2003-07-01')
> group by "COLUMN_NAME", "COLUMN_TYPE";
> select "COLUMN_NAME", "COLUMN_TYPE", min("LONG_LOW_VALUE"),
> max("LONG_HIGH_VALUE"), min("DOUBLE_LOW_VALUE"), max("DOUBLE_HIGH_VALUE"),
> min("BIG_DECIMAL_LOW_VALUE"), max("BIG_DECIMAL_HIGH_VALUE"),
> sum("NUM_NULLS"), max("NUM_DISTINCTS"), max("AVG_COL_LEN"),
> max("MAX_COL_LEN"), sum("NUM_TRUES"), sum("NUM_FALSES") from "PART_COL_STATS"
> where "DB_NAME" = 'tpcds_bin_partitioned_orc_30000' and "TABLE_NAME" =
> 'store_returns' and "COLUMN_NAME" in ('sr_returned_date_sk') AND
> "PARTITION_NAME" in
> ('sr_returned_date=1998-01-06'..'sr_returned_date=2003-07-01') group by
> "COLUMN_NAME", "COLUMN_TYPE"
> {code}
> Currently explain Q17 takes 11 seconds and the queries sent to MySQL are very
> inefficient because
> 1) They no longer do the aggregation on MySQL and get a row per partition
> 2) There is a query per stats K,V pair so the number of queries is up by 9x
> {code}
> select COLUMN_NAME, COLUMN_TYPE, count(PARTITION_NAME) from
> PART_COL_STATS where DB_NAME = 'tpcds_bin_partitioned_orc_200' and TABLE_NAME
> = 'store_returns' and COLUMN_NAME in
> ('sr_item_sk','sr_customer_sk','sr_ticket_number') and PARTITION_NAME in
> ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') group by
> COLUMN_NAME, COLUMN_TYPE
> select COLUMN_NAME, sum(NUM_NULLS), sum(NUM_TRUES),
> sum(NUM_FALSES) from PART_COL_STATS where DB_NAME =
> 'tpcds_bin_partitioned_orc_200' and TABLE_NAME = 'store_returns' and
> COLUMN_NAME in ('sr_customer_sk','sr_item_sk','sr_ticket_number') and
> PARTITION_NAME in
> ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') group by
> COLUMN_NAME
> select LONG_LOW_VALUE,PARTITION_NAME from PART_COL_STATS where
> DB_NAME = 'tpcds_bin_partitioned_orc_200' and TABLE_NAME = 'store_returns'
> and COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME in
> ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by
> 'LONG_LOW_VALUE'
> select LONG_HIGH_VALUE,PARTITION_NAME from PART_COL_STATS where
> DB_NAME = 'tpcds_bin_partitioned_orc_200' and TABLE_NAME = 'store_returns'
> and COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME in
> ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by
> 'LONG_HIGH_VALUE'
> select DOUBLE_LOW_VALUE,PARTITION_NAME from PART_COL_STATS
> where DB_NAME = 'tpcds_bin_partitioned_orc_200' and TABLE_NAME =
> 'store_returns' and COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME in
> ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by
> 'DOUBLE_LOW_VALUE'
> select DOUBLE_HIGH_VALUE,PARTITION_NAME from PART_COL_STATS
> where DB_NAME = 'tpcds_bin_partitioned_orc_200' and TABLE_NAME =
> 'store_returns' and COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME in
> ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by
> 'DOUBLE_HIGH_VALUE'
> select BIG_DECIMAL_LOW_VALUE,PARTITION_NAME from PART_COL_STATS
> where DB_NAME = 'tpcds_bin_partitioned_orc_200' and TABLE_NAME =
> 'store_returns' and COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME in
> ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by
> 'BIG_DECIMAL_LOW_VALUE'
> select BIG_DECIMAL_HIGH_VALUE,PARTITION_NAME from
> PART_COL_STATS where DB_NAME = 'tpcds_bin_partitioned_orc_200' and TABLE_NAME
> = 'store_returns' and COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME in
> ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by
> 'BIG_DECIMAL_HIGH_VALUE'
> select NUM_DISTINCTS,PARTITION_NAME from PART_COL_STATS where
> DB_NAME = 'tpcds_bin_partitioned_orc_200' and TABLE_NAME = 'store_returns'
> and COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME in
> ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by
> 'NUM_DISTINCTS'
> select AVG_COL_LEN,PARTITION_NAME from PART_COL_STATS where
> DB_NAME = 'tpcds_bin_partitioned_orc_200' and TABLE_NAME = 'store_returns'
> and COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME in
> ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by
> 'AVG_COL_LEN'
> select MAX_COL_LEN,PARTITION_NAME from PART_COL_STATS where
> DB_NAME = 'tpcds_bin_partitioned_orc_200' and TABLE_NAME = 'store_returns'
> and COLUMN_NAME in ('sr_customer_sk') and PARTITION_NAME in
> ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by
> 'MAX_COL_LEN'
> select LONG_LOW_VALUE,PARTITION_NAME from PART_COL_STATS where
> DB_NAME = 'tpcds_bin_partitioned_orc_200' and TABLE_NAME = 'store_returns'
> and COLUMN_NAME in ('sr_item_sk') and PARTITION_NAME in
> ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by
> 'LONG_LOW_VALUE'
> select LONG_HIGH_VALUE,PARTITION_NAME from PART_COL_STATS where
> DB_NAME = 'tpcds_bin_partitioned_orc_200' and TABLE_NAME = 'store_returns'
> and COLUMN_NAME in ('sr_item_sk') and PARTITION_NAME in
> ('sr_returned_date=1998-01-06','sr_returned_date=2003-07-01') order by
> 'LONG_HIGH_VALUE'
> {code}
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)