[jira] [Created] (HIVE-11460) CTAS : Allows user to create table without explicit column names which fails stats creation on new table

2015-08-04 Thread Mostafa Mokhtar (JIRA)
Mostafa Mokhtar created HIVE-11460:
--

 Summary: CTAS : Allows user to create table without explicit 
column names which fails stats creation on new table
 Key: HIVE-11460
 URL: https://issues.apache.org/jira/browse/HIVE-11460
 Project: Hive
  Issue Type: Bug
Reporter: Mostafa Mokhtar
Assignee: Laljo John Pullokkaran


{code}
create table customer_reload 
stored as orc as 
select  c.c_custkey,
c.c_name, 
c.c_acctbal, 
cast (c.c_custkey as bigint) 
from customer c;
{code}

Create table schema 
{code}
# col_name  data_type   comment

c_custkey   int
c_name  string
c_acctbal   double
_c3 bigint
{code}

Analyze table command 
{code}
analyze table customer_reload compute statistics for columns;
{code}

Exception 
{code}
hive> analyze table customer_reload compute statistics for columns;
NoViableAltException(13@[323:1: atomExpression : ( ( KW_NULL )=> KW_NULL -> 
TOK_NULL | ( constant )=> constant | castExpression | caseExpression | 
whenExpression | ( functionName LPAREN )=> function | tableOrColumn | LPAREN ! 
expression RPAREN !);])
at 
org.apache.hadoop.hive.ql.parse.HiveParser_IdentifiersParser$DFA32.specialStateTransition(HiveParser_IdentifiersParser.java)
at org.antlr.runtime.DFA.predict(DFA.java:80)
at 
org.apache.hadoop.hive.ql.parse.HiveParser_IdentifiersParser.atomExpression(HiveParser_IdentifiersParser.java:6177)
at 
org.apache.hadoop.hive.ql.parse.HiveParser_IdentifiersParser.precedenceFieldExpression(HiveParser_IdentifiersParser.java:6383)
at 
org.apache.hadoop.hive.ql.parse.HiveParser_IdentifiersParser.precedenceUnaryPrefixExpression(HiveParser_IdentifiersParser.java:6768)
at 
org.apache.hadoop.hive.ql.parse.HiveParser_IdentifiersParser.precedenceUnarySuffixExpression(HiveParser_IdentifiersParser.java:6828)
at 
org.apache.hadoop.hive.ql.parse.HiveParser_IdentifiersParser.precedenceBitwiseXorExpression(HiveParser_IdentifiersParser.java:7012)
at 
org.apache.hadoop.hive.ql.parse.HiveParser_IdentifiersParser.precedenceStarExpression(HiveParser_IdentifiersParser.java:7172)
at 
org.apache.hadoop.hive.ql.parse.HiveParser_IdentifiersParser.precedencePlusExpression(HiveParser_IdentifiersParser.java:7332)
at 
org.apache.hadoop.hive.ql.parse.HiveParser_IdentifiersParser.precedenceAmpersandExpression(HiveParser_IdentifiersParser.java:7483)
at 
org.apache.hadoop.hive.ql.parse.HiveParser_IdentifiersParser.precedenceBitwiseOrExpression(HiveParser_IdentifiersParser.java:7634)
at 
org.apache.hadoop.hive.ql.parse.HiveParser_IdentifiersParser.precedenceEqualExpression(HiveParser_IdentifiersParser.java:8164)
at 
org.apache.hadoop.hive.ql.parse.HiveParser_IdentifiersParser.precedenceNotExpression(HiveParser_IdentifiersParser.java:9177)
at 
org.apache.hadoop.hive.ql.parse.HiveParser_IdentifiersParser.precedenceAndExpression(HiveParser_IdentifiersParser.java:9296)
at 
org.apache.hadoop.hive.ql.parse.HiveParser_IdentifiersParser.precedenceOrExpression(HiveParser_IdentifiersParser.java:9455)
at 
org.apache.hadoop.hive.ql.parse.HiveParser_IdentifiersParser.expression(HiveParser_IdentifiersParser.java:6105)
at 
org.apache.hadoop.hive.ql.parse.HiveParser.expression(HiveParser.java:45896)
at 
org.apache.hadoop.hive.ql.parse.HiveParser_SelectClauseParser.selectExpression(HiveParser_SelectClauseParser.java:3755)
at 
org.apache.hadoop.hive.ql.parse.HiveParser.selectExpression(HiveParser.java:45914)
at 
org.apache.hadoop.hive.ql.parse.HiveParser_IdentifiersParser.function(HiveParser_IdentifiersParser.java:3850)
at 
org.apache.hadoop.hive.ql.parse.HiveParser_IdentifiersParser.atomExpression(HiveParser_IdentifiersParser.java:6280)
at 
org.apache.hadoop.hive.ql.parse.HiveParser_IdentifiersParser.precedenceFieldExpression(HiveParser_IdentifiersParser.java:6383)
at 
org.apache.hadoop.hive.ql.parse.HiveParser_IdentifiersParser.precedenceUnaryPrefixExpression(HiveParser_IdentifiersParser.java:6768)
at 
org.apache.hadoop.hive.ql.parse.HiveParser_IdentifiersParser.precedenceUnarySuffixExpression(HiveParser_IdentifiersParser.java:6828)
at 
org.apache.hadoop.hive.ql.parse.HiveParser_IdentifiersParser.precedenceBitwiseXorExpression(HiveParser_IdentifiersParser.java:7012)
at 
org.apache.hadoop.hive.ql.parse.HiveParser_IdentifiersParser.precedenceStarExpression(HiveParser_IdentifiersParser.java:7172)
at 
org.apache.hadoop.hive.ql.parse.HiveParser_IdentifiersParser.precedencePlusExpression(HiveParser_IdentifiersParser.java:7332)
at 
org.apache.hadoop.hive.ql.parse.HiveParser_IdentifiersParser.precedenceAmpersandExpression(HiveParser_IdentifiersParser.java:7483)

[jira] [Created] (HIVE-11341) Avoid resizing of ASTNode tree

2015-07-22 Thread Mostafa Mokhtar (JIRA)
Mostafa Mokhtar created HIVE-11341:
--

 Summary: Avoid resizing of ASTNode tree 
 Key: HIVE-11341
 URL: https://issues.apache.org/jira/browse/HIVE-11341
 Project: Hive
  Issue Type: Bug
  Components: Hive, Physical Optimizer
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Hari Sankar Sivarama Subramaniyan


{code}
Stack Trace                                                                                                   Sample Count  Percentage(%)
org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.analyze(ASTNode, Context)                                       1,605        90.474
 org.apache.hadoop.hive.ql.parse.CalcitePlanner.analyzeInternal(ASTNode)                                             1,605        90.474
  org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.analyzeInternal(ASTNode, SemanticAnalyzer$PlannerContext)         1,605        90.474
   org.apache.hadoop.hive.ql.parse.CalcitePlanner.genOPTree(ASTNode, SemanticAnalyzer$PlannerContext)                1,604        90.417
    org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genOPTree(ASTNode, SemanticAnalyzer$PlannerContext)             1,604        90.417
     org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genPlan(QB)                                                    1,604        90.417
      org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genPlan(QB, boolean)                                          1,604        90.417
       org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genBodyPlan(QB, Operator, Map)                               1,604        90.417
        org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genFilterPlan(ASTNode, QB, Operator, Map, boolean)          1,603        90.361
         org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genFilterPlan(QB, ASTNode, Operator, boolean)              1,603        90.361
          org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genExprNodeDesc(ASTNode, RowResolver, boolean)            1,603        90.361
           org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genExprNodeDesc(ASTNode, RowResolver, TypeCheckCtx)      1,603        90.361
            org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genAllExprNodeDesc(ASTNode, RowResolver, TypeCheckCtx)  1,603        90.361
             org.apache.hadoop.hive.ql.parse.TypeCheckProcFactory.genExprNode(ASTNode, TypeCheckCtx)                 1,603        90.361
              org.apache.hadoop.hive.ql.parse.TypeCheckProcFactory.genExprNode(ASTNode, TypeCheckCtx, TypeCheckProcFactory)  1,603  90.361
               org.apache.hadoop.hive.ql.lib.DefaultGraphWalker.startWalking(Collection, HashMap)                    1,579        89.008
                org.apache.hadoop.hive.ql.lib.DefaultGraphWalker.walk(Node)                                          1,571        88.557
                 java.util.ArrayList.removeAll(Collection)                                                           1,433        80.778
                  java.util.ArrayList.batchRemove(Collection, boolean)                                               1,433        80.778
                   java.util.ArrayList.contains(Object)                                                              1,228        69.222
                    java.util.ArrayList.indexOf(Object)                                                              1,228        69.222
{code}
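
Nearly all of the walk time in the profile above is spent in ArrayList.removeAll/ArrayList.contains inside DefaultGraphWalker.walk, which is quadratic in the size of the work list. A minimal sketch of the usual remedy, using a hypothetical Node stand-in rather than the real DefaultGraphWalker code: track already-dispatched nodes in a HashSet so the membership check is O(1) instead of a linear scan.

{code}
import java.util.ArrayDeque;
import java.util.Deque;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

// Hedged sketch only; Node is a stand-in for org.apache.hadoop.hive.ql.lib.Node
// and this is not the actual DefaultGraphWalker implementation.
interface Node {
  List<? extends Node> getChildren();
}

final class WalkerSketch {
  // ArrayList.removeAll(dispatched) rescans the list for every element of
  // 'dispatched', which is exactly the ArrayList.contains/indexOf hotspot in
  // the profile above.  A HashSet of visited nodes avoids the O(n^2) behavior.
  static void walk(Node root) {
    Deque<Node> toWalk = new ArrayDeque<>();
    Set<Node> visited = new HashSet<>();
    toWalk.add(root);
    while (!toWalk.isEmpty()) {
      Node next = toWalk.poll();
      if (!visited.add(next)) {
        continue;                       // already dispatched, O(1) check
      }
      List<? extends Node> children = next.getChildren();
      if (children != null) {
        toWalk.addAll(children);
      }
    }
  }
}
{code}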



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)


[jira] [Created] (HIVE-11330) Add early termination for recursion in StatsRulesProcFactory$FilterStatsRule.evaluateExpression

2015-07-21 Thread Mostafa Mokhtar (JIRA)
Mostafa Mokhtar created HIVE-11330:
--

 Summary: Add early termination for recursion in 
StatsRulesProcFactory$FilterStatsRule.evaluateExpression
 Key: HIVE-11330
 URL: https://issues.apache.org/jira/browse/HIVE-11330
 Project: Hive
  Issue Type: Bug
  Components: Hive, Physical Optimizer
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Prasanth Jayachandran


Queries with heavily nested filters can cause a StackOverflowError

{code}
Exception in thread "main" java.lang.StackOverflowError
at 
org.apache.hadoop.hive.ql.optimizer.stats.annotation.StatsRulesProcFactory$FilterStatsRule.evaluateExpression(StatsRulesProcFactory.java:301)
at 
org.apache.hadoop.hive.ql.optimizer.stats.annotation.StatsRulesProcFactory$FilterStatsRule.evaluateChildExpr(StatsRulesProcFactory.java:525)
at 
org.apache.hadoop.hive.ql.optimizer.stats.annotation.StatsRulesProcFactory$FilterStatsRule.evaluateExpression(StatsRulesProcFactory.java:326)
at 
org.apache.hadoop.hive.ql.optimizer.stats.annotation.StatsRulesProcFactory$FilterStatsRule.evaluateChildExpr(StatsRulesProcFactory.java:525)
at 
org.apache.hadoop.hive.ql.optimizer.stats.annotation.StatsRulesProcFactory$FilterStatsRule.evaluateExpression(StatsRulesProcFactory.java:326)
at 
org.apache.hadoop.hive.ql.optimizer.stats.annotation.StatsRulesProcFactory$FilterStatsRule.evaluateChildExpr(StatsRulesProcFactory.java:525)
at 
org.apache.hadoop.hive.ql.optimizer.stats.annotation.StatsRulesProcFactory$FilterStatsRule.evaluateExpression(StatsRulesProcFactory.java:326)
at 
org.apache.hadoop.hive.ql.optimizer.stats.annotation.StatsRulesProcFactory$FilterStatsRule.evaluateChildExpr(StatsRulesProcFactory.java:525)
at 
org.apache.hadoop.hive.ql.optimizer.stats.annotation.StatsRulesProcFactory$FilterStatsRule.evaluateExpression(StatsRulesProcFactory.java:326)
at 
org.apache.hadoop.hive.ql.optimizer.stats.annotation.StatsRulesProcFactory$FilterStatsRule.evaluateChildExpr(StatsRulesProcFactory.java:525)
at 
org.apache.hadoop.hive.ql.optimizer.stats.annotation.StatsRulesProcFactory$FilterStatsRule.evaluateExpression(StatsRulesProcFactory.java:326)
at 
org.apache.hadoop.hive.ql.optimizer.stats.annotation.StatsRulesProcFactory$FilterStatsRule.evaluateChildExpr(StatsRulesProcFactory.java:525)
at 
org.apache.hadoop.hive.ql.optimizer.stats.annotation.StatsRulesProcFactory$FilterStatsRule.evaluateExpression(StatsRulesProcFactory.java:326)
at 
org.apache.hadoop.hive.ql.optimizer.stats.annotation.StatsRulesProcFactory$FilterStatsRule.evaluateChildExpr(StatsRulesProcFactory.java:525)
 
{code}
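
The trace shows evaluateExpression and evaluateChildExpr recursing into each other once per nesting level, so a deep enough filter exhausts the thread stack. A hedged sketch of one way to bound this, with hypothetical types rather than Hive's real ExprNodeDesc/StatsRulesProcFactory signatures: walk the expression tree with an explicit work stack so the Java recursion depth no longer tracks the filter depth.

{code}
import java.util.ArrayDeque;
import java.util.Deque;
import java.util.List;

// Hedged sketch only; FilterExpr is a hypothetical stand-in for ExprNodeDesc,
// and AND/OR combination rules are ignored for brevity.
interface FilterExpr {
  List<FilterExpr> children();
  double selectivity();          // per-predicate selectivity estimate
}

final class IterativeFilterStats {
  // Applies every predicate's selectivity without recursion, so arbitrarily
  // deep nesting cannot trigger a StackOverflowError.
  static long evaluate(FilterExpr root, long numRows) {
    double sel = 1.0;
    Deque<FilterExpr> work = new ArrayDeque<>();
    work.push(root);
    while (!work.isEmpty()) {
      FilterExpr e = work.pop();
      sel *= e.selectivity();
      for (FilterExpr child : e.children()) {
        work.push(child);
      }
    }
    return (long) (numRows * sel);
  }
}
{code}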



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)


[jira] [Created] (HIVE-11299) Hive query planning allocating lots of memory and hitting OOMs

2015-07-17 Thread Mostafa Mokhtar (JIRA)
Mostafa Mokhtar created HIVE-11299:
--

 Summary: Hive query planning allocating lots of memory and hitting 
OOMs 
 Key: HIVE-11299
 URL: https://issues.apache.org/jira/browse/HIVE-11299
 Project: Hive
  Issue Type: Bug
  Components: Hive
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Mostafa Mokhtar


Plan generation for queries with many disjunctive filters allocates a large 
amount of memory and takes a long time to compile.



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)


[jira] [Created] (HIVE-10793) Hybrid Hybrid Grace Hash Join : Don't allocate all hash table memory upfront

2015-05-21 Thread Mostafa Mokhtar (JIRA)
Mostafa Mokhtar created HIVE-10793:
--

 Summary: Hybrid Hybrid Grace Hash Join : Don't allocate all hash 
table memory upfront
 Key: HIVE-10793
 URL: https://issues.apache.org/jira/browse/HIVE-10793
 Project: Hive
  Issue Type: Bug
  Components: Hive
Affects Versions: 1.2.0
Reporter: Mostafa Mokhtar
Assignee: Mostafa Mokhtar
 Fix For: 1.2.1


HybridHashTableContainer allocates memory based on the estimated table size, which 
means that if the actual size is smaller than the estimate, the extra allocated memory is never used.

Number of partitions is calculated based on estimated data size
{code}
numPartitions = calcNumPartitions(memoryThreshold, estimatedTableSize, minNumParts, minWbSize, nwayConf);
{code}

Then writeBufferSize is set based on the number of partitions

{code}
writeBufferSize = (int)(estimatedTableSize / numPartitions);
{code}

Each hash partition allocates one WriteBuffer up front, with no further allocation 
if the estimated data size is correct.

The suggested solution is to reduce writeBufferSize by a factor such that only X% 
of the memory is preallocated.
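
A minimal sketch of the suggested change, assuming a hypothetical preallocation fraction (the actual "X%" is not specified above, and this is not the committed patch):

{code}
// Hedged sketch; constant name and value are assumptions for illustration.
final class WriteBufferSizing {
  static final double PREALLOC_FRACTION = 0.25;   // the "X%" from the text

  static int initialWriteBufferSize(long estimatedTableSize, int numPartitions) {
    long full = estimatedTableSize / numPartitions;
    // Reserve only a fraction up front; more can be allocated later if the
    // size estimate turns out to be accurate.
    return (int) Math.max(1, (long) (full * PREALLOC_FRACTION));
  }
}
{code}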





--
This message was sent by Atlassian JIRA
(v6.3.4#6332)


[jira] [Created] (HIVE-10609) Vectorization : Q64 fails with ClassCastException

2015-05-04 Thread Mostafa Mokhtar (JIRA)
Mostafa Mokhtar created HIVE-10609:
--

 Summary: Vectorization : Q64 fails with ClassCastException
 Key: HIVE-10609
 URL: https://issues.apache.org/jira/browse/HIVE-10609
 Project: Hive
  Issue Type: Bug
Affects Versions: 1.2.0
Reporter: Mostafa Mokhtar
Assignee: Matt McCline
 Fix For: 1.2.0


TPC-DS Q64 fails with ClassCastException.
Query
{code}


select cs1.product_name ,cs1.store_name ,cs1.store_zip ,cs1.b_street_number 
,cs1.b_streen_name ,cs1.b_city
 ,cs1.b_zip ,cs1.c_street_number ,cs1.c_street_name ,cs1.c_city ,cs1.c_zip 
,cs1.syear ,cs1.cnt
 ,cs1.s1 ,cs1.s2 ,cs1.s3
 ,cs2.s1 ,cs2.s2 ,cs2.s3 ,cs2.syear ,cs2.cnt
from
(select i_product_name as product_name ,i_item_sk as item_sk ,s_store_name as 
store_name
 ,s_zip as store_zip ,ad1.ca_street_number as b_street_number 
,ad1.ca_street_name as b_streen_name
 ,ad1.ca_city as b_city ,ad1.ca_zip as b_zip ,ad2.ca_street_number as 
c_street_number
 ,ad2.ca_street_name as c_street_name ,ad2.ca_city as c_city ,ad2.ca_zip as 
c_zip
 ,d1.d_year as syear ,d2.d_year as fsyear ,d3.d_year as s2year ,count(*) as 
cnt
 ,sum(ss_wholesale_cost) as s1 ,sum(ss_list_price) as s2 
,sum(ss_coupon_amt) as s3
  FROM   store_sales
JOIN store_returns ON store_sales.ss_item_sk = store_returns.sr_item_sk 
and store_sales.ss_ticket_number = store_returns.sr_ticket_number
JOIN customer ON store_sales.ss_customer_sk = customer.c_customer_sk
JOIN date_dim d1 ON store_sales.ss_sold_date_sk = d1.d_date_sk
JOIN date_dim d2 ON customer.c_first_sales_date_sk = d2.d_date_sk 
JOIN date_dim d3 ON customer.c_first_shipto_date_sk = d3.d_date_sk
JOIN store ON store_sales.ss_store_sk = store.s_store_sk
JOIN customer_demographics cd1 ON store_sales.ss_cdemo_sk= 
cd1.cd_demo_sk
JOIN customer_demographics cd2 ON customer.c_current_cdemo_sk = 
cd2.cd_demo_sk
JOIN promotion ON store_sales.ss_promo_sk = promotion.p_promo_sk
JOIN household_demographics hd1 ON store_sales.ss_hdemo_sk = 
hd1.hd_demo_sk
JOIN household_demographics hd2 ON customer.c_current_hdemo_sk = 
hd2.hd_demo_sk
JOIN customer_address ad1 ON store_sales.ss_addr_sk = ad1.ca_address_sk
JOIN customer_address ad2 ON customer.c_current_addr_sk = 
ad2.ca_address_sk
JOIN income_band ib1 ON hd1.hd_income_band_sk = ib1.ib_income_band_sk
JOIN income_band ib2 ON hd2.hd_income_band_sk = ib2.ib_income_band_sk
JOIN item ON store_sales.ss_item_sk = item.i_item_sk
JOIN
 (select cs_item_sk
,sum(cs_ext_list_price) as 
sale,sum(cr_refunded_cash+cr_reversed_charge+cr_store_credit) as refund
  from catalog_sales JOIN catalog_returns
  ON catalog_sales.cs_item_sk = catalog_returns.cr_item_sk
and catalog_sales.cs_order_number = catalog_returns.cr_order_number
  group by cs_item_sk
  having sum(cs_ext_list_price) > 2*sum(cr_refunded_cash+cr_reversed_charge+cr_store_credit))
 cs_ui
ON store_sales.ss_item_sk = cs_ui.cs_item_sk
  WHERE  
 cd1.cd_marital_status <> cd2.cd_marital_status and
 i_color in ('maroon','burnished','dim','steel','navajo','chocolate') 
and
 i_current_price between 35 and 35 + 10 and
 i_current_price between 35 + 1 and 35 + 15
group by i_product_name ,i_item_sk ,s_store_name ,s_zip ,ad1.ca_street_number
   ,ad1.ca_street_name ,ad1.ca_city ,ad1.ca_zip ,ad2.ca_street_number
   ,ad2.ca_street_name ,ad2.ca_city ,ad2.ca_zip ,d1.d_year ,d2.d_year 
,d3.d_year
) cs1
JOIN
(select i_product_name as product_name ,i_item_sk as item_sk ,s_store_name as 
store_name
 ,s_zip as store_zip ,ad1.ca_street_number as b_street_number 
,ad1.ca_street_name as b_streen_name
 ,ad1.ca_city as b_city ,ad1.ca_zip as b_zip ,ad2.ca_street_number as 
c_street_number
 ,ad2.ca_street_name as c_street_name ,ad2.ca_city as c_city ,ad2.ca_zip as 
c_zip
 ,d1.d_year as syear ,d2.d_year as fsyear ,d3.d_year as s2year ,count(*) as 
cnt
 ,sum(ss_wholesale_cost) as s1 ,sum(ss_list_price) as s2 
,sum(ss_coupon_amt) as s3
  FROM   store_sales
JOIN store_returns ON store_sales.ss_item_sk = store_returns.sr_item_sk 
and store_sales.ss_ticket_number = store_returns.sr_ticket_number
JOIN customer ON store_sales.ss_customer_sk = customer.c_customer_sk
JOIN date_dim d1 ON store_sales.ss_sold_date_sk = d1.d_date_sk
JOIN date_dim d2 ON customer.c_first_sales_date_sk = d2.d_date_sk 
JOIN date_dim d3 ON customer.c_first_shipto_date_sk = d3.d_date_sk
JOIN store ON store_sales.ss_store_sk = store.s_store_sk
JOIN customer_demographics cd1 ON store_sales.ss_cdemo_sk= 
cd1.cd_demo_sk
JOIN customer_demographics cd2 ON customer.c_current_cdemo_sk = 
cd2.cd_demo_sk
JOIN promotion ON store_sales.ss_promo_sk = promotion.p_promo_sk
JOIN 

[jira] [Created] (HIVE-10557) CBO : Support reference to alias in queries

2015-04-30 Thread Mostafa Mokhtar (JIRA)
Mostafa Mokhtar created HIVE-10557:
--

 Summary: CBO : Support reference to alias in queries 
 Key: HIVE-10557
 URL: https://issues.apache.org/jira/browse/HIVE-10557
 Project: Hive
  Issue Type: Bug
  Components: CBO
Affects Versions: 1.2.0
Reporter: Mostafa Mokhtar
Assignee: Laljo John Pullokkaran
Priority: Minor
 Fix For: 1.2.0


Query 
{code}
explain
select
count(*) rowcount
from
(select
ss_item_sk, ss_ticket_number, ss_store_sk
from
store_sales a, store_returns b
where
a.ss_item_sk = b.sr_item_sk
and a.ss_ticket_number = b.sr_ticket_number
and ss_sold_date_sk between 2450816 and 2451500
and sr_returned_date_sk between 2450816 and 2451500
union all
select
ss_item_sk, ss_ticket_number, ss_store_sk
from
store_sales c, store_returns d
where
c.ss_item_sk = d.sr_item_sk
and c.ss_ticket_number = d.sr_ticket_number
and ss_sold_date_sk between 2450816 and 2451500
and sr_returned_date_sk between 2450816 and 2451500) t
group by t.ss_store_sk , t.ss_item_sk , t.ss_ticket_number
having rowcount > 1
{code}

Exception 
{code}
15/04/30 04:44:21 [main]: ERROR parse.CalcitePlanner: CBO failed, skipping CBO.
org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException: 
Encountered Select alias 'rowcount' in having clause 'rowcount > 1' 
This non standard behavior is not supported with cbo on. Turn off cbo for these 
queries.
at 
org.apache.hadoop.hive.ql.parse.CalcitePlanner$CalcitePlannerAction.validateNoHavingReferenceToAlias(CalcitePlanner.java:2888)
at 
org.apache.hadoop.hive.ql.parse.CalcitePlanner$CalcitePlannerAction.genGBHavingLogicalPlan(CalcitePlanner.java:2828)
at 
org.apache.hadoop.hive.ql.parse.CalcitePlanner$CalcitePlannerAction.genLogicalPlan(CalcitePlanner.java:2738)
at 
org.apache.hadoop.hive.ql.parse.CalcitePlanner$CalcitePlannerAction.apply(CalcitePlanner.java:804)
at 
org.apache.hadoop.hive.ql.parse.CalcitePlanner$CalcitePlannerAction.apply(CalcitePlanner.java:765)
at org.apache.calcite.tools.Frameworks$1.apply(Frameworks.java:109)
at 
org.apache.calcite.prepare.CalcitePrepareImpl.perform(CalcitePrepareImpl.java:730)
at org.apache.calcite.tools.Frameworks.withPrepare(Frameworks.java:145)
at org.apache.calcite.tools.Frameworks.withPlanner(Frameworks.java:105)
at 
org.apache.hadoop.hive.ql.parse.CalcitePlanner.getOptimizedAST(CalcitePlanner.java:604)
at 
org.apache.hadoop.hive.ql.parse.CalcitePlanner.genOPTree(CalcitePlanner.java:242)
at 
org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.analyzeInternal(SemanticAnalyzer.java:10015)
at 
org.apache.hadoop.hive.ql.parse.CalcitePlanner.analyzeInternal(CalcitePlanner.java:205)
at 
org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.analyze(BaseSemanticAnalyzer.java:227)
at 
org.apache.hadoop.hive.ql.parse.ExplainSemanticAnalyzer.analyzeInternal(ExplainSemanticAnalyzer.java:74)
at 
org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.analyze(BaseSemanticAnalyzer.java:227)
at org.apache.hadoop.hive.ql.Driver.compile(Driver.java:424)
at org.apache.hadoop.hive.ql.Driver.compile(Driver.java:308)
at org.apache.hadoop.hive.ql.Driver.compileInternal(Driver.java:1122)
at org.apache.hadoop.hive.ql.Driver.runInternal(Driver.java:1170)
at org.apache.hadoop.hive.ql.Driver.run(Driver.java:1059)
{code}



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)


[jira] [Created] (HIVE-10537) ConvertJoinMapJoin should take into account JOINNOCONDITIONALTASKTHRESHOLD when packing tables in a Vertex

2015-04-29 Thread Mostafa Mokhtar (JIRA)
Mostafa Mokhtar created HIVE-10537:
--

 Summary: ConvertJoinMapJoin should take into account 
JOINNOCONDITIONALTASKTHRESHOLD when packing tables in a Vertex
 Key: HIVE-10537
 URL: https://issues.apache.org/jira/browse/HIVE-10537
 Project: Hive
  Issue Type: Bug
Reporter: Mostafa Mokhtar
Assignee: Vikram Dixit K


Vertex Map 2 has broadcast inputs that add up to 803 MB while the noconditionaltask size threshold (NCTS) is 610 MB.
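
A hedged sketch of the check ConvertJoinMapJoin could apply when packing broadcast inputs into a vertex; the helper name and signature are hypothetical, not the real Hive code:

{code}
import java.util.List;

// Hedged sketch: keep a running total of the small-table (hash-table) inputs
// already packed into the vertex and stop converting to map join once adding
// another input would exceed hive.auto.convert.join.noconditionaltask.size.
final class VertexPackingSketch {
  static boolean canPackIntoVertex(List<Long> currentSmallTableSizes,
                                   long candidateInputSize,
                                   long noConditionalTaskThresholdBytes) {
    long total = candidateInputSize;
    for (long size : currentSmallTableSizes) {
      total += size;
    }
    return total <= noConditionalTaskThresholdBytes;
  }
}
{code}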

Query 
{code}
set hive.auto.convert.join.noconditionaltask.size=64000;

explain  


select  i_item_desc
  ,w_warehouse_name
  ,d1.d_week_seq
  ,count(case when p_promo_sk is null then 1 else 0 end) no_promo
  ,count(case when p_promo_sk is not null then 1 else 0 end) promo
  ,count(*) total_cnt
from catalog_sales
join inventory on (catalog_sales.cs_item_sk = inventory.inv_item_sk)
join warehouse on (warehouse.w_warehouse_sk=inventory.inv_warehouse_sk)
join item on (item.i_item_sk = catalog_sales.cs_item_sk)
join customer_demographics on (catalog_sales.cs_bill_cdemo_sk = 
customer_demographics.cd_demo_sk)
join household_demographics on (catalog_sales.cs_bill_hdemo_sk = 
household_demographics.hd_demo_sk)
join date_dim d1 on (catalog_sales.cs_sold_date_sk = d1.d_date_sk)
join date_dim d2 on (inventory.inv_date_sk = d2.d_date_sk)
join date_dim d3 on (catalog_sales.cs_ship_date_sk = d3.d_date_sk)
left outer join promotion on (catalog_sales.cs_promo_sk=promotion.p_promo_sk)
left outer join catalog_returns on (catalog_returns.cr_item_sk = 
catalog_sales.cs_item_sk and catalog_returns.cr_order_number = 
catalog_sales.cs_order_number)
where d1.d_week_seq = d2.d_week_seq
  and inv_quantity_on_hand < cs_quantity 
  and d3.d_date > d1.d_date + 5
  and hd_buy_potential = '1001-5000'
  and d1.d_year = 2001
  and hd_buy_potential = '1001-5000'
  and cd_marital_status = 'M'
  and d1.d_year = 2001
group by i_item_desc,w_warehouse_name,d1.d_week_seq
order by total_cnt desc, i_item_desc, w_warehouse_name, d_week_seq
limit 100
{code}

Plan  
{code}

STAGE DEPENDENCIES:
  Stage-1 is a root stage
  Stage-0 depends on stages: Stage-1

STAGE PLANS:
  Stage: Stage-1
Tez
  Edges:
        Map 2 <- Map 1 (BROADCAST_EDGE), Map 10 (BROADCAST_EDGE), Map 11 (BROADCAST_EDGE), Map 12 (BROADCAST_EDGE), Map 13 (BROADCAST_EDGE), Map 5 (BROADCAST_EDGE), Map 6 (BROADCAST_EDGE), Map 7 (BROADCAST_EDGE), Map 8 (BROADCAST_EDGE), Map 9 (BROADCAST_EDGE)
        Reducer 3 <- Map 2 (SIMPLE_EDGE)
        Reducer 4 <- Reducer 3 (SIMPLE_EDGE)
  DagName: jenkins_20150429044838_bd0a1cf7-235f-48db-9321-c13899fed7b3:1
  Vertices:
Map 1 
Map Operator Tree:
TableScan
  alias: catalog_returns
  Statistics: Num rows: 28798881 Data size: 2942039156 Basic 
stats: COMPLETE Column stats: COMPLETE
  Select Operator
expressions: cr_item_sk (type: int), cr_order_number (type: 
int)
outputColumnNames: _col0, _col1
Statistics: Num rows: 28798881 Data size: 230391048 Basic 
stats: COMPLETE Column stats: COMPLETE
Reduce Output Operator
  key expressions: _col0 (type: int), _col1 (type: int)
  sort order: ++
  Map-reduce partition columns: _col0 (type: int), _col1 
(type: int)
  Statistics: Num rows: 28798881 Data size: 230391048 Basic 
stats: COMPLETE Column stats: COMPLETE
Execution mode: vectorized
Map 10 
Map Operator Tree:
TableScan
  alias: warehouse
  filterExpr: w_warehouse_sk is not null (type: boolean)
  Statistics: Num rows: 6 Data size: 6166 Basic stats: COMPLETE 
Column stats: COMPLETE
  Filter Operator
predicate: w_warehouse_sk is not null (type: boolean)
Statistics: Num rows: 6 Data size: 618 Basic stats: 
COMPLETE Column stats: COMPLETE
Select Operator
  expressions: w_warehouse_sk (type: int), w_warehouse_name 
(type: string)
  outputColumnNames: _col0, _col1
  Statistics: Num rows: 6 Data size: 618 Basic stats: 
COMPLETE Column stats: COMPLETE
  Reduce Output Operator
key expressions: _col0 (type: int)
sort order: +
Map-reduce partition columns: _col0 (type: int)
Statistics: Num rows: 6 Data size: 618 Basic stats: 
COMPLETE Column stats: COMPLETE
value expressions: _col1 (type: string)
Execution mode: vectorized
Map 11 
Map Operator Tree:
TableScan
  alias: d1
  filterExpr: (((d_year = 2001) and d_date_sk is not null) and 
d_week_seq is 

[jira] [Created] (HIVE-10484) Vectorization : Big Table Retained Mapping duplicate column

2015-04-24 Thread Mostafa Mokhtar (JIRA)
Mostafa Mokhtar created HIVE-10484:
--

 Summary: Vectorization : Big Table Retained Mapping duplicate 
column
 Key: HIVE-10484
 URL: https://issues.apache.org/jira/browse/HIVE-10484
 Project: Hive
  Issue Type: Bug
  Components: Tez, Vectorization
Affects Versions: 1.2.0
Reporter: Mostafa Mokhtar
Assignee: Matt McCline
 Fix For: 1.2.0


With vectorization and Tez enabled, TPC-DS Q70 fails with 
{code}
Caused by: java.lang.RuntimeException: Big Table Retained Mapping duplicate 
column 6 in ordered column map {6=(value column: 6, type name: int), 21=(value 
column: 21, type name: float), 22=(value column: 22, type name: int)} when 
adding value column 6, type int
at 
org.apache.hadoop.hive.ql.exec.vector.VectorColumnOrderedMap.add(VectorColumnOrderedMap.java:97)
at 
org.apache.hadoop.hive.ql.exec.vector.VectorColumnOutputMapping.add(VectorColumnOutputMapping.java:40)
at 
org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinCommonOperator.determineCommonInfo(VectorMapJoinCommonOperator.java:320)
at 
org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinCommonOperator.init(VectorMapJoinCommonOperator.java:254)
at 
org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinGenerateResultOperator.init(VectorMapJoinGenerateResultOperator.java:89)
at 
org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinInnerGenerateResultOperator.init(VectorMapJoinInnerGenerateResultOperator.java:97)
at 
org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinInnerLongOperator.init(VectorMapJoinInnerLongOperator.java:79)
... 49 more
{code}

Query 
{code}
 select s_state
   from  (select s_state as s_state, sum(ss_net_profit),
 rank() over ( partition by s_state order by 
sum(ss_net_profit) desc) as ranking
  from   store_sales, store, date_dim
  where  d_month_seq between 1193 and 1193+11
and date_dim.d_date_sk = store_sales.ss_sold_date_sk
and store.s_store_sk  = store_sales.ss_store_sk
  group by s_state
 ) tmp1
   where ranking <= 5
{code}



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)


[jira] [Created] (HIVE-10446) Hybrid Hybrid Grace Hash Join : java.lang.IllegalArgumentException in Kryo while spilling big table

2015-04-22 Thread Mostafa Mokhtar (JIRA)
Mostafa Mokhtar created HIVE-10446:
--

 Summary: Hybrid Hybrid Grace Hash Join : 
java.lang.IllegalArgumentException in Kryo while spilling big table
 Key: HIVE-10446
 URL: https://issues.apache.org/jira/browse/HIVE-10446
 Project: Hive
  Issue Type: Bug
  Components: Hive
Affects Versions: 1.2.0
Reporter: Mostafa Mokhtar
Assignee: Wei Zheng
 Fix For: 1.2.0


TPC-DS Q85 fails with Kryo exception when spilling big table data.

Query 
{code}
select  substr(r_reason_desc,1,20) as r
   ,avg(wr_return_ship_cost) wq
   ,avg(wr_refunded_cash) ref
   ,avg(wr_fee) fee
 from web_returns, customer_demographics cd1,
  customer_demographics cd2, customer_address, date_dim, reason 
 where 
   cd1.cd_demo_sk = web_returns.wr_refunded_cdemo_sk 
   and cd2.cd_demo_sk = web_returns.wr_returning_cdemo_sk
   and customer_address.ca_address_sk = web_returns.wr_refunded_addr_sk
   and reason.r_reason_sk = web_returns.wr_reason_sk
   and cd1.cd_marital_status = cd2.cd_marital_status
   and cd1.cd_education_status = cd2.cd_education_status
group by r_reason_desc
order by r, wq, ref, fee
limit 100
{code}

Plan 
{code}
OK
STAGE DEPENDENCIES:
  Stage-1 is a root stage
  Stage-0 depends on stages: Stage-1

STAGE PLANS:
  Stage: Stage-1
Tez
  Edges:
        Map 1 <- Map 4 (BROADCAST_EDGE), Map 5 (BROADCAST_EDGE), Map 6 (BROADCAST_EDGE), Map 7 (BROADCAST_EDGE)
        Reducer 2 <- Map 1 (SIMPLE_EDGE)
        Reducer 3 <- Reducer 2 (SIMPLE_EDGE)
  DagName: mmokhtar_20150422165209_d8eb5634-c19f-4576-9525-cad248c7ca37:5
  Vertices:
Map 1
Map Operator Tree:
TableScan
  alias: web_returns
  filterExpr: (((wr_refunded_addr_sk is not null and 
wr_reason_sk is not null) and wr_refunded_cdemo_sk is not null) and 
wr_returning_cdemo_sk is not null) (type: boolean)
  Statistics: Num rows: 2062802370 Data size: 185695406284 
Basic stats: COMPLETE Column stats: COMPLETE
  Filter Operator
predicate: (((wr_refunded_addr_sk is not null and 
wr_reason_sk is not null) and wr_refunded_cdemo_sk is not null) and 
wr_returning_cdemo_sk is not null) (type: boolean)
Statistics: Num rows: 1875154723 Data size: 51267313780 
Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
  expressions: wr_refunded_cdemo_sk (type: int), 
wr_refunded_addr_sk (type: int), wr_returning_cdemo_sk (type: int), 
wr_reason_sk (type: int), wr_fee (type: float), wr_return_ship_cost (type: 
float), wr_refunded_cash (type: float)
  outputColumnNames: _col0, _col1, _col2, _col3, _col4, 
_col5, _col6
  Statistics: Num rows: 1875154723 Data size: 51267313780 
Basic stats: COMPLETE Column stats: COMPLETE
  Map Join Operator
condition map:
 Inner Join 0 to 1
keys:
  0 _col1 (type: int)
  1 _col0 (type: int)
outputColumnNames: _col0, _col2, _col3, _col4, _col5, 
_col6
input vertices:
  1 Map 4
Statistics: Num rows: 1875154688 Data size: 45003712512 
Basic stats: COMPLETE Column stats: COMPLETE
HybridGraceHashJoin: true
Map Join Operator
  condition map:
   Inner Join 0 to 1
  keys:
0 _col3 (type: int)
1 _col0 (type: int)
  outputColumnNames: _col0, _col2, _col4, _col5, _col6, 
_col9
  input vertices:
1 Map 5
  Statistics: Num rows: 1875154688 Data size: 
219393098496 Basic stats: COMPLETE Column stats: COMPLETE
  HybridGraceHashJoin: true
  Map Join Operator
condition map:
 Inner Join 0 to 1
keys:
  0 _col0 (type: int)
  1 _col0 (type: int)
outputColumnNames: _col2, _col4, _col5, _col6, 
_col9, _col11, _col12
input vertices:
  1 Map 6
Statistics: Num rows: 1875154688 Data size: 
547545168896 Basic stats: COMPLETE Column stats: COMPLETE
HybridGraceHashJoin: true
Map Join Operator
  condition map:
   Inner Join 0 to 1
  keys:
   

[jira] [Created] (HIVE-10412) CBO : Calculate join selectivity when computing HiveJoin cost

2015-04-20 Thread Mostafa Mokhtar (JIRA)
Mostafa Mokhtar created HIVE-10412:
--

 Summary: CBO : Calculate join selectivity when computing HiveJoin 
cost
 Key: HIVE-10412
 URL: https://issues.apache.org/jira/browse/HIVE-10412
 Project: Hive
  Issue Type: Sub-task
  Components: CBO
Reporter: Mostafa Mokhtar
Assignee: Laljo John Pullokkaran


This is from TPC-DS Q7
Because we don't compute the selectivity of the join sub-expressions in a HiveJoin, 
we assume that selective and non-selective joins have similar cost.

{code}
select  i_item_id, 
avg(ss_quantity) agg1,
avg(ss_list_price) agg2,
avg(ss_coupon_amt) agg3,
avg(ss_sales_price) agg4 
 from store_sales, customer_demographics, item
 where store_sales.ss_item_sk = item.i_item_sk and
   store_sales.ss_cdemo_sk = customer_demographics.cd_demo_sk and
   cd_gender = 'F' and 
   cd_marital_status = 'W' and
   cd_education_status = 'Primary'
 group by i_item_id
 order by i_item_id
 limit 100
{code}

Cardinality 
{code}
item 462,000
customer_demographics 1,920,800
store_sales 82,510,879,939
{code}

NDVs
{code}
item.i_item_sk 439501
customer_demographics.cd_demo_sk 1835839
store_sales.ss_cdemo_sk 1835839
{code}
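
For illustration only (this is the textbook estimate, not necessarily the formula Hive/Calcite should adopt), join selectivity can be taken as 1 / max(NDV(left key), NDV(right key)); with the statistics above that puts store_sales joined to customer_demographics at roughly 8.6E10 rows before the demographics filter is applied.

{code}
// Hedged illustration of the textbook join-cardinality estimate; a hypothetical
// helper, not HiveJoin's or Calcite's actual cost code.
final class JoinSelectivitySketch {
  static double estimateJoinRows(double leftRows, double rightRows,
                                 double leftKeyNdv, double rightKeyNdv) {
    double selectivity = 1.0 / Math.max(leftKeyNdv, rightKeyNdv);
    return leftRows * rightRows * selectivity;
  }
  // With the statistics above:
  //   estimateJoinRows(82_510_879_939d, 1_920_800d, 1_835_839d, 1_835_839d)
  //   ~= 8.6E10 rows for store_sales JOIN customer_demographics, before the
  //   cd_gender/cd_marital_status/cd_education_status filter is applied.
}
{code}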



From the logs 
{code}
2015-04-20 21:09:58,055 DEBUG [main]: cost.HiveCostModel 
(HiveCostModel.java:getJoinCost(60)) - Join algorithm selection for:
HiveJoin(condition=[=($0, $10)], joinType=[inner], algorithm=[none], cost=[not 
available])
  HiveJoin(condition=[=($1, $6)], joinType=[inner], algorithm=[MapJoin], 
cost=[{8.25108951834E10 rows, 2.324083308641975E8 cpu, 275417.56 
io}])
HiveProject(ss_item_sk=[$1], ss_cdemo_sk=[$3], ss_quantity=[$9], 
ss_list_price=[$11], ss_sales_price=[$12], ss_coupon_amt=[$18])
  HiveTableScan(table=[[tpcds_bin_partitioned_orc_3.store_sales]])
HiveProject(cd_demo_sk=[$0], cd_gender=[$1], cd_marital_status=[$2], 
cd_education_status=[$3])
  HiveFilter(condition=[AND(=($1, 'F'), =($2, 'W'), =($3, 'Primary'))])

HiveTableScan(table=[[tpcds_bin_partitioned_orc_3.customer_demographics]])
  HiveProject(i_item_sk=[$0], i_item_id=[$1])
HiveTableScan(table=[[tpcds_bin_partitioned_orc_3.item]])

2015-04-20 21:09:58,056 DEBUG [main]: cost.HiveCostModel 
(HiveCostModel.java:getJoinCost(69)) - CommonJoin cost: {6.553102534841269E8 
rows, 4.0217814199458417E18 cpu, 3.499540319862703E7 io}
2015-04-20 21:09:58,056 DEBUG [main]: cost.HiveCostModel 
(HiveCostModel.java:getJoinCost(69)) - MapJoin cost: {6.553102534841269E8 rows, 
2.1362E11 cpu, 1.07207098E7 io}
2015-04-20 21:09:58,056 DEBUG [main]: cost.HiveCostModel 
(HiveCostModel.java:getJoinCost(78)) - MapJoin selected
2015-04-20 21:09:58,057 DEBUG [main]: cost.HiveCostModel 
(HiveCostModel.java:getJoinCost(60)) - Join algorithm selection for:
HiveJoin(condition=[=($1, $8)], joinType=[inner], algorithm=[none], cost=[not 
available])
  HiveJoin(condition=[=($0, $6)], joinType=[inner], algorithm=[MapJoin], 
cost=[{8.2511341939E10 rows, 2.1362E11 cpu, 1.07207098E7 io}])
HiveProject(ss_item_sk=[$1], ss_cdemo_sk=[$3], ss_quantity=[$9], 
ss_list_price=[$11], ss_sales_price=[$12], ss_coupon_amt=[$18])
  HiveTableScan(table=[[tpcds_bin_partitioned_orc_3.store_sales]])
HiveProject(i_item_sk=[$0], i_item_id=[$1])
  HiveTableScan(table=[[tpcds_bin_partitioned_orc_3.item]])
  HiveProject(cd_demo_sk=[$0], cd_gender=[$1], cd_marital_status=[$2], 
cd_education_status=[$3])
HiveFilter(condition=[AND(=($1, 'F'), =($2, 'W'), =($3, 'Primary'))])
  
HiveTableScan(table=[[tpcds_bin_partitioned_orc_3.customer_demographics]])

2015-04-20 21:09:58,058 DEBUG [main]: cost.HiveCostModel 
(HiveCostModel.java:getJoinCost(69)) - CommonJoin cost: {8.25108951834E10 
rows, 2.6089279242468144E21 cpu, 4.901146588836599E9 io}
2015-04-20 21:09:58,058 DEBUG [main]: cost.HiveCostModel 
(HiveCostModel.java:getJoinCost(69)) - MapJoin cost: {8.25108951834E10 
rows, 2.324083308641975E8 cpu, 275417.56 io}
{code}



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)


[jira] [Created] (HIVE-10369) CBO: Don't use HiveDefaultCostModel when With Tez and hive.cbo.costmodel.extended enabled

2015-04-16 Thread Mostafa Mokhtar (JIRA)
Mostafa Mokhtar created HIVE-10369:
--

 Summary: CBO: Don't use HiveDefaultCostModel when With Tez and 
hive.cbo.costmodel.extended enabled 
 Key: HIVE-10369
 URL: https://issues.apache.org/jira/browse/HIVE-10369
 Project: Hive
  Issue Type: Sub-task
  Components: CBO
Affects Versions: 1.2.0
Reporter: Mostafa Mokhtar
Assignee: Laljo John Pullokkaran
 Fix For: 1.2.0


When calculating parallelism, we end up using HiveDefaultCostModel.getSplitCount, 
which returns null, instead of HiveOnTezCostModel.getSplitCount; this results in 
wrong parallelism.

This happens for this join 
{code}
org.apache.calcite.plan.RelOptUtil.toString(join)
 (java.lang.String) HiveJoin(condition=[=($1, $3)], joinType=[inner], 
algorithm=[none], cost=[not available])
  HiveProject(cs_sold_date_sk=[$0], cs_bill_customer_sk=[$3], 
cs_sales_price=[$21])
HiveTableScan(table=[[tpcds_bin_orc_200.catalog_sales]])
  HiveJoin(condition=[=($1, $2)], joinType=[inner], algorithm=[MapJoin], 
cost=[{240.0 rows, 6.48E11 cpu, 1294.6098 io}])
HiveProject(c_customer_sk=[$0], c_current_addr_sk=[$4])
  HiveTableScan(table=[[tpcds_bin_orc_200.customer]])
HiveProject(ca_address_sk=[$0], ca_state=[$8], ca_zip=[$9])
  HiveTableScan(table=[[tpcds_bin_orc_200.customer_address]])
{code}


The issue appears to happen very early, when calling 
{code}
if (pushDownTree != null) {
  costPushDown =
  RelMetadataQuery.getCumulativeCost(pushDownTree.getJoinTree());
}
{code}

As pushDownTree.getJoinTree().joinAlgorithm = 
HiveOnTezCostModel$TezMapJoinAlgorithm


Call stack.
{code}
HiveDefaultCostModel$DefaultJoinAlgorithm.getSplitCount(HiveJoin) line: 114 
HiveJoin.getSplitCount() line: 136  
HiveRelMdParallelism.splitCount(HiveJoin) line: 63  
NativeMethodAccessorImpl.invoke0(Method, Object, Object[]) line: not available 
[native method]  
NativeMethodAccessorImpl.invoke(Object, Object[]) line: 57  
DelegatingMethodAccessorImpl.invoke(Object, Object[]) line: 43  
Method.invoke(Object, Object...) line: 606  
ReflectiveRelMetadataProvider$1$1.invoke(Object, Method, Object[]) line: 182
$Proxy46.splitCount() line: not available   
GeneratedMethodAccessor26.invoke(Object, Object[]) line: not available  
DelegatingMethodAccessorImpl.invoke(Object, Object[]) line: 43  
Method.invoke(Object, Object...) line: 606  
ChainedRelMetadataProvider$ChainedInvocationHandler.invoke(Object, Method, 
Object[]) line: 109  
$Proxy46.splitCount() line: not available   
GeneratedMethodAccessor26.invoke(Object, Object[]) line: not available  
DelegatingMethodAccessorImpl.invoke(Object, Object[]) line: 43  
Method.invoke(Object, Object...) line: 606  
ChainedRelMetadataProvider$ChainedInvocationHandler.invoke(Object, Method, 
Object[]) line: 109  
$Proxy46.splitCount() line: not available   
GeneratedMethodAccessor26.invoke(Object, Object[]) line: not available  
DelegatingMethodAccessorImpl.invoke(Object, Object[]) line: 43  
Method.invoke(Object, Object...) line: 606  
CachingRelMetadataProvider$CachingInvocationHandler.invoke(Object, Method, 
Object[]) line: 132  
$Proxy46.splitCount() line: not available   
RelMetadataQuery.splitCount(RelNode) line: 401  
HiveOnTezCostModel$TezMapJoinAlgorithm.getCost(HiveJoin) line: 255  
HiveOnTezCostModel(HiveCostModel).getJoinCost(HiveJoin) line: 64
HiveRelMdCost.getNonCumulativeCost(HiveJoin) line: 56   
NativeMethodAccessorImpl.invoke0(Method, Object, Object[]) line: not available 
[native method]  
NativeMethodAccessorImpl.invoke(Object, Object[]) line: 57  
DelegatingMethodAccessorImpl.invoke(Object, Object[]) line: 43  
Method.invoke(Object, Object...) line: 606  
ReflectiveRelMetadataProvider$1$1.invoke(Object, Method, Object[]) line: 182
$Proxy41.getNonCumulativeCost() line: not available 
GeneratedMethodAccessor22.invoke(Object, Object[]) line: not available  
DelegatingMethodAccessorImpl.invoke(Object, Object[]) line: 43  
Method.invoke(Object, Object...) line: 606  
ChainedRelMetadataProvider$ChainedInvocationHandler.invoke(Object, Method, 
Object[]) line: 109  
$Proxy41.getNonCumulativeCost() line: not available 
GeneratedMethodAccessor22.invoke(Object, Object[]) line: not available  
DelegatingMethodAccessorImpl.invoke(Object, Object[]) line: 43  
Method.invoke(Object, Object...) line: 606  
ChainedRelMetadataProvider$ChainedInvocationHandler.invoke(Object, Method, 
Object[]) line: 109  
$Proxy41.getNonCumulativeCost() line: not available 
GeneratedMethodAccessor22.invoke(Object, Object[]) line: not available  
DelegatingMethodAccessorImpl.invoke(Object, Object[]) line: 43  
Method.invoke(Object, Object...) line: 606  
ChainedRelMetadataProvider$ChainedInvocationHandler.invoke(Object, Method, 
Object[]) line: 109  

[jira] [Created] (HIVE-10350) CBO: With hive.cbo.costmodel.extended enabled IO cost is negative

2015-04-15 Thread Mostafa Mokhtar (JIRA)
Mostafa Mokhtar created HIVE-10350:
--

 Summary: CBO: With hive.cbo.costmodel.extended enabled IO cost is 
negative
 Key: HIVE-10350
 URL: https://issues.apache.org/jira/browse/HIVE-10350
 Project: Hive
  Issue Type: Bug
  Components: CBO
Affects Versions: 1.2.0
Reporter: Mostafa Mokhtar
Assignee: Mostafa Mokhtar
 Fix For: 1.2.0


Not an overflow, but the parallelism computed below ends up being -1
{code}
 final int parallelism = RelMetadataQuery.splitCount(join) == null
  ? 1 : RelMetadataQuery.splitCount(join);
{code}
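
A minimal sketch of the kind of guard that would avoid the problem (a hypothetical helper, not the committed fix): clamp parallelism to at least 1 so the per-split IO term cannot go negative.

{code}
// Hedged sketch; not the actual Hive change.
final class ParallelismSketch {
  static int safeParallelism(Integer splitCount) {
    // Treat a missing or non-positive split count as a single split.
    return (splitCount == null) ? 1 : Math.max(1, splitCount);
  }
}
{code}

The debug output below shows the negative IO cost produced without such a guard.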


{code}
2015-04-13 18:19:09,154 DEBUG [main]: cost.HiveCostModel 
(HiveCostModel.java:getJoinCost(62)) - COMMON_JOIN cost: {1600892.857142857 
rows, 2.4463782008994658E7 cpu, 8.54445445875E10 io}
2015-04-13 18:19:09,155 DEBUG [main]: cost.HiveCostModel 
(HiveCostModel.java:getJoinCost(62)) - MAP_JOIN cost: {1600892.857142857 rows, 
1601785.714285714 cpu, -1698787.48 io}
2015-04-13 18:19:09,155 DEBUG [main]: cost.HiveCostModel 
(HiveCostModel.java:getJoinCost(72)) - MAP_JOIN selected
2015-04-13 18:19:09,157 DEBUG [main]: parse.CalcitePlanner 
(CalcitePlanner.java:apply(862)) - Plan After Join Reordering:
HiveSort(fetch=[100]): rowcount = 6006.726049749041, cumulative cost = 
{1.1468867492063493E8 rows, 1.166177684126984E8 cpu, -1.1757664816220238E9 io}, 
id = 3000
  HiveSort(sort0=[$0], dir0=[ASC]): rowcount = 6006.726049749041, cumulative 
cost = {1.1468867492063493E8 rows, 1.166177684126984E8 cpu, 
-1.1757664816220238E9 io}, id = 2998
HiveProject(customer_id=[$4], customername=[concat($9, ', ', $8)]): 
rowcount = 6006.726049749041, cumulative cost = {1.1468867492063493E8 rows, 
1.166177684126984E8 cpu, -1.1757664816220238E9 io}, id = 3136
  HiveJoin(condition=[=($1, $5)], joinType=[inner], 
joinAlgorithm=[map_join], cost=[{5.557820341269841E7 rows, 5.557840182539682E7 
cpu, -4299694.122023809 io}]): rowcount = 6006.726049749041, cumulative cost = 
{1.1468867492063493E8 rows, 1.166177684126984E8 cpu, -1.1757664816220238E9 io}, 
id = 3132
HiveJoin(condition=[=($0, $1)], joinType=[inner], 
joinAlgorithm=[map_join], cost=[{5.7498805E7 rows, 5.9419605E7 cpu, -1.15248E9 
io}]): rowcount = 5.5578005E7, cumulative cost = {5.7498805E7 rows, 5.9419605E7 
cpu, -1.15248E9 io}, id = 3100
  HiveProject(sr_cdemo_sk=[$4]): rowcount = 5.5578005E7, cumulative 
cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 2992
HiveTableScan(table=[[tpcds_bin_orc_200.store_returns]]): rowcount 
= 5.5578005E7, cumulative cost = {0}, id = 2878
  HiveProject(cd_demo_sk=[$0]): rowcount = 1920800.0, cumulative cost = 
{0.0 rows, 0.0 cpu, 0.0 io}, id = 2978
HiveTableScan(table=[[tpcds_bin_orc_200.customer_demographics]]): 
rowcount = 1920800.0, cumulative cost = {0}, id = 2868
HiveJoin(condition=[=($10, $1)], joinType=[inner], 
joinAlgorithm=[map_join], cost=[{1787.9365079365077 rows, 1790.15873015873 cpu, 
-8000.0 io}]): rowcount = 198.4126984126984, cumulative cost = 
{1611666.507936508 rows, 1619761.5873015872 cpu, -1.89867875E7 io}, id = 3130
  HiveJoin(condition=[=($0, $4)], joinType=[inner], 
joinAlgorithm=[map_join], cost=[{8985.714285714286 rows, 16185.714285714286 
cpu, -1.728E7 io}]): rowcount = 1785.7142857142856, cumulative cost = 
{1609878.5714285714 rows, 1617971.4285714284 cpu, -1.89787875E7 io}, id = 3128
HiveProject(hd_demo_sk=[$0], hd_income_band_sk=[$1]): rowcount = 
7200.0, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 2982
  
HiveTableScan(table=[[tpcds_bin_orc_200.household_demographics]]): rowcount = 
7200.0, cumulative cost = {0}, id = 2871
HiveJoin(condition=[=($3, $6)], joinType=[inner], 
joinAlgorithm=[map_join], cost=[{1600892.857142857 rows, 1601785.714285714 cpu, 
-1698787.48 io}]): rowcount = 1785.7142857142856, cumulative cost = 
{1600892.857142857 rows, 1601785.714285714 cpu, -1698787.48 io}, id = 
3105
  HiveProject(c_customer_id=[$1], c_current_cdemo_sk=[$2], 
c_current_hdemo_sk=[$3], c_current_addr_sk=[$4], c_first_name=[$8], 
c_last_name=[$9]): rowcount = 160.0, cumulative cost = {0.0 rows, 0.0 cpu, 
0.0 io}, id = 2970
HiveTableScan(table=[[tpcds_bin_orc_200.customer]]): rowcount = 
160.0, cumulative cost = {0}, id = 2862
  HiveProject(ca_address_sk=[$0], ca_city=[$6]): rowcount = 
892.8571428571428, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 2974
HiveFilter(condition=[=($6, 'Hopewell')]): rowcount = 
892.8571428571428, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 2972
  HiveTableScan(table=[[tpcds_bin_orc_200.customer_address]]): 
rowcount = 80.0, cumulative cost = {0}, id = 2864
  HiveProject(ib_income_band_sk=[$0], ib_lower_bound=[$1], 
ib_upper_bound=[$2]): rowcount = 2.2223, cumulative cost = 

[jira] [Created] (HIVE-10331) ORC : Is null SARG filters out all row groups written in old ORC format

2015-04-14 Thread Mostafa Mokhtar (JIRA)
Mostafa Mokhtar created HIVE-10331:
--

 Summary: ORC : Is null SARG filters out all row groups written in 
old ORC format
 Key: HIVE-10331
 URL: https://issues.apache.org/jira/browse/HIVE-10331
 Project: Hive
  Issue Type: Bug
  Components: Hive
Affects Versions: 1.1.0
Reporter: Mostafa Mokhtar
Assignee: Prasanth Jayachandran
 Fix For: 1.2.0


Queries return wrong results because all row groups get filtered out and no 
rows are scanned.

{code}
SELECT 
  count(*)
FROM
store_sales
WHERE
ss_addr_sk IS NULL
{code}

With hive.optimize.index.filter disabled, we get the correct results.
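
A hedged sketch of the mechanism involved, with hypothetical types rather than ORC's real SearchArgument/ColumnStatistics classes: an IS NULL predicate may only prune a row group when the column statistics reliably say the group contains no nulls; for files written in the old ORC format that information can be missing, so the evaluator has to answer "maybe" instead of "no".

{code}
// Hedged sketch with hypothetical types; not the real ORC SARG evaluator.
enum TruthValue { YES, NO, YES_NO }

final class RowGroupStats {
  final boolean hasNullFlagPresent;  // older ORC writers may not record this
  final boolean hasNull;
  RowGroupStats(boolean present, boolean hasNull) {
    this.hasNullFlagPresent = present;
    this.hasNull = hasNull;
  }
}

final class IsNullSargSketch {
  // Only answer NO (prune the row group) when the statistics definitely say
  // the group has no nulls; otherwise the group must still be read.
  static TruthValue evaluateIsNull(RowGroupStats stats) {
    if (!stats.hasNullFlagPresent) {
      return TruthValue.YES_NO;      // unknown for old-format files: keep it
    }
    return stats.hasNull ? TruthValue.YES_NO : TruthValue.NO;
  }
}
{code}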



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)


[jira] [Created] (HIVE-10337) CBO (Calcite Return Path): java.lang.IndexOutOfBoundsException for query with rank() over(partition ...)

2015-04-14 Thread Mostafa Mokhtar (JIRA)
Mostafa Mokhtar created HIVE-10337:
--

 Summary: CBO (Calcite Return Path): 
java.lang.IndexOutOfBoundsException for query with rank() over(partition ...)
 Key: HIVE-10337
 URL: https://issues.apache.org/jira/browse/HIVE-10337
 Project: Hive
  Issue Type: Bug
Reporter: Mostafa Mokhtar
Assignee: Jesus Camacho Rodriguez


CBO throws an IndexOutOfBoundsException for TPC-DS Q70.

Query 
{code}

explain
select
sum(ss_net_profit) as total_sum
   ,s_state
   ,s_county
   ,grouping__id as lochierarchy
   , rank() over(partition by grouping__id, case when grouping__id == 2 then 
s_state end order by sum(ss_net_profit)) as rank_within_parent
from
store_sales ss join date_dim d1 on d1.d_date_sk = ss.ss_sold_date_sk
join store s on s.s_store_sk  = ss.ss_store_sk
 where
d1.d_month_seq between 1193 and 1193+11
 and s.s_state in
 ( select s_state
   from  (select s_state as s_state, sum(ss_net_profit),
 rank() over ( partition by s_state order by 
sum(ss_net_profit) desc) as ranking
  from   store_sales, store, date_dim
  where  d_month_seq between 1193 and 1193+11
and date_dim.d_date_sk = store_sales.ss_sold_date_sk
and store.s_store_sk  = store_sales.ss_store_sk
  group by s_state
 ) tmp1
   where ranking <= 5
 )
 group by s_state,s_county with rollup
order by
   lochierarchy desc
  ,case when lochierarchy = 0 then s_state end
  ,rank_within_parent
 limit 100
{code}

Exception 
{code}
15/04/14 02:42:52 [main]: ERROR parse.CalcitePlanner: CBO failed, skipping CBO.
java.lang.IndexOutOfBoundsException: Index: 5, Size: 5
at java.util.ArrayList.rangeCheck(ArrayList.java:635)
at java.util.ArrayList.get(ArrayList.java:411)
at 
org.apache.hadoop.hive.ql.optimizer.calcite.translator.ASTConverter$RexVisitor.visitInputRef(ASTConverter.java:395)
at 
org.apache.hadoop.hive.ql.optimizer.calcite.translator.ASTConverter$RexVisitor.visitInputRef(ASTConverter.java:372)
at org.apache.calcite.rex.RexInputRef.accept(RexInputRef.java:112)
at 
org.apache.hadoop.hive.ql.optimizer.calcite.translator.ASTConverter$RexVisitor.visitCall(ASTConverter.java:543)
at 
org.apache.hadoop.hive.ql.optimizer.calcite.translator.ASTConverter$RexVisitor.visitCall(ASTConverter.java:372)
at org.apache.calcite.rex.RexCall.accept(RexCall.java:107)
at 
org.apache.hadoop.hive.ql.optimizer.calcite.translator.ASTConverter$RexVisitor.visitCall(ASTConverter.java:543)
at 
org.apache.hadoop.hive.ql.optimizer.calcite.translator.ASTConverter$RexVisitor.visitCall(ASTConverter.java:372)
at org.apache.calcite.rex.RexCall.accept(RexCall.java:107)
at 
org.apache.hadoop.hive.ql.optimizer.calcite.translator.ASTConverter.convertOBToASTNode(ASTConverter.java:252)
at 
org.apache.hadoop.hive.ql.optimizer.calcite.translator.ASTConverter.convert(ASTConverter.java:208)
at 
org.apache.hadoop.hive.ql.optimizer.calcite.translator.ASTConverter.convert(ASTConverter.java:98)
at 
org.apache.hadoop.hive.ql.parse.CalcitePlanner.getOptimizedAST(CalcitePlanner.java:607)
at 
org.apache.hadoop.hive.ql.parse.CalcitePlanner.genOPTree(CalcitePlanner.java:239)
at 
org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.analyzeInternal(SemanticAnalyzer.java:10003)
at 
org.apache.hadoop.hive.ql.parse.CalcitePlanner.analyzeInternal(CalcitePlanner.java:202)
at 
org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.analyze(BaseSemanticAnalyzer.java:224)
at 
org.apache.hadoop.hive.ql.parse.ExplainSemanticAnalyzer.analyzeInternal(ExplainSemanticAnalyzer.java:74)
at 
org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.analyze(BaseSemanticAnalyzer.java:224)
at org.apache.hadoop.hive.ql.Driver.compile(Driver.java:424)
at org.apache.hadoop.hive.ql.Driver.compile(Driver.java:308)
at org.apache.hadoop.hive.ql.Driver.compileInternal(Driver.java:1122)
at org.apache.hadoop.hive.ql.Driver.runInternal(Driver.java:1170)
at org.apache.hadoop.hive.ql.Driver.run(Driver.java:1059)
at org.apache.hadoop.hive.ql.Driver.run(Driver.java:1049)
at 
org.apache.hadoop.hive.cli.CliDriver.processLocalCmd(CliDriver.java:213)
at org.apache.hadoop.hive.cli.CliDriver.processCmd(CliDriver.java:165)
at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:376)
at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:311)
at 
org.apache.hadoop.hive.cli.CliDriver.processReader(CliDriver.java:409)
at org.apache.hadoop.hive.cli.CliDriver.processFile(CliDriver.java:425)
at 

[jira] [Created] (HIVE-10244) Vectorization : TPC-DS Q80 fails with java.lang.ClassCastException when hive.vectorized.execution.reduce.enabled is enabled

2015-04-07 Thread Mostafa Mokhtar (JIRA)
Mostafa Mokhtar created HIVE-10244:
--

 Summary: Vectorization : TPC-DS Q80 fails with 
java.lang.ClassCastException when hive.vectorized.execution.reduce.enabled is 
enabled
 Key: HIVE-10244
 URL: https://issues.apache.org/jira/browse/HIVE-10244
 Project: Hive
  Issue Type: Bug
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Matt McCline


Query 
{code}
set hive.vectorized.execution.reduce.enabled=true;
with ssr as
 (select  s_store_id as store_id,
  sum(ss_ext_sales_price) as sales,
  sum(coalesce(sr_return_amt, 0)) as returns,
  sum(ss_net_profit - coalesce(sr_net_loss, 0)) as profit
  from store_sales left outer join store_returns on
 (ss_item_sk = sr_item_sk and ss_ticket_number = sr_ticket_number),
 date_dim,
 store,
 item,
 promotion
 where ss_sold_date_sk = d_date_sk
   and d_date between cast('1998-08-04' as date) 
  and (cast('1998-09-04' as date))
   and ss_store_sk = s_store_sk
   and ss_item_sk = i_item_sk
   and i_current_price > 50
   and ss_promo_sk = p_promo_sk
   and p_channel_tv = 'N'
 group by s_store_id)
 ,
 csr as
 (select  cp_catalog_page_id as catalog_page_id,
  sum(cs_ext_sales_price) as sales,
  sum(coalesce(cr_return_amount, 0)) as returns,
  sum(cs_net_profit - coalesce(cr_net_loss, 0)) as profit
  from catalog_sales left outer join catalog_returns on
 (cs_item_sk = cr_item_sk and cs_order_number = cr_order_number),
 date_dim,
 catalog_page,
 item,
 promotion
 where cs_sold_date_sk = d_date_sk
   and d_date between cast('1998-08-04' as date)
  and (cast('1998-09-04' as date))
and cs_catalog_page_sk = cp_catalog_page_sk
   and cs_item_sk = i_item_sk
   and i_current_price > 50
   and cs_promo_sk = p_promo_sk
   and p_channel_tv = 'N'
group by cp_catalog_page_id)
 ,
 wsr as
 (select  web_site_id,
  sum(ws_ext_sales_price) as sales,
  sum(coalesce(wr_return_amt, 0)) as returns,
  sum(ws_net_profit - coalesce(wr_net_loss, 0)) as profit
  from web_sales left outer join web_returns on
 (ws_item_sk = wr_item_sk and ws_order_number = wr_order_number),
 date_dim,
 web_site,
 item,
 promotion
 where ws_sold_date_sk = d_date_sk
   and d_date between cast('1998-08-04' as date)
  and (cast('1998-09-04' as date))
and ws_web_site_sk = web_site_sk
   and ws_item_sk = i_item_sk
   and i_current_price > 50
   and ws_promo_sk = p_promo_sk
   and p_channel_tv = 'N'
group by web_site_id)
  select  channel
, id
, sum(sales) as sales
, sum(returns) as returns
, sum(profit) as profit
 from 
 (select 'store channel' as channel
, concat('store', store_id) as id
, sales
, returns
, profit
 from   ssr
 union all
 select 'catalog channel' as channel
, concat('catalog_page', catalog_page_id) as id
, sales
, returns
, profit
 from  csr
 union all
 select 'web channel' as channel
, concat('web_site', web_site_id) as id
, sales
, returns
, profit
 from   wsr
 ) x
 group by channel, id with rollup
 order by channel
 ,id
 limit 100
{code}

Exception 
{code}
Vertex failed, vertexName=Reducer 5, vertexId=vertex_1426707664723_1377_1_22, 
diagnostics=[Task failed, taskId=task_1426707664723_1377_1_22_00, 
diagnostics=[TaskAttempt 0 failed, info=[Error: Failure while running 
task:java.lang.RuntimeException: java.lang.RuntimeException: 
org.apache.hadoop.hive.ql.metadata.HiveException: Hive Runtime Error while 
processing vector batch (tag=0) 
\N\N09.285817653506076E84.639990363237801E7-1.1814318134887291E8
\N\N04.682909323885761E82.2415242712669864E7-5.966176123188091E7
\N\N01.2847032699693155E96.300096113768728E7-5.94963316209578E8
at 
org.apache.hadoop.hive.ql.exec.tez.TezProcessor.initializeAndRunProcessor(TezProcessor.java:171)
at 
org.apache.hadoop.hive.ql.exec.tez.TezProcessor.run(TezProcessor.java:137)
at 
org.apache.tez.runtime.LogicalIOProcessorRuntimeTask.run(LogicalIOProcessorRuntimeTask.java:330)
at 
org.apache.tez.runtime.task.TezTaskRunner$TaskRunnerCallable$1.run(TezTaskRunner.java:179)
at 
org.apache.tez.runtime.task.TezTaskRunner$TaskRunnerCallable$1.run(TezTaskRunner.java:171)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at 
org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1628)
at 
org.apache.tez.runtime.task.TezTaskRunner$TaskRunnerCallable.callInternal(TezTaskRunner.java:171)
at 

[jira] [Created] (HIVE-10193) CBO (Calcite Return Path): Q94 generates cross product

2015-04-02 Thread Mostafa Mokhtar (JIRA)
Mostafa Mokhtar created HIVE-10193:
--

 Summary: CBO (Calcite Return Path): Q94 generates cross product 
 Key: HIVE-10193
 URL: https://issues.apache.org/jira/browse/HIVE-10193
 Project: Hive
  Issue Type: Sub-task
  Components: CBO
Affects Versions: cbo-branch
Reporter: Mostafa Mokhtar
Assignee: Jesus Camacho Rodriguez
 Fix For: cbo-branch






--
This message was sent by Atlassian JIRA
(v6.3.4#6332)


[jira] [Created] (HIVE-10194) CBO (Calcite Return Path): Q94 generates cross product

2015-04-02 Thread Mostafa Mokhtar (JIRA)
Mostafa Mokhtar created HIVE-10194:
--

 Summary: CBO (Calcite Return Path): Q94 generates cross product 
 Key: HIVE-10194
 URL: https://issues.apache.org/jira/browse/HIVE-10194
 Project: Hive
  Issue Type: Sub-task
Reporter: Mostafa Mokhtar
Assignee: Jesus Camacho Rodriguez


Query 
{code}
SELECT count(distinct ws_order_number) as order_count,
   sum(ws_ext_ship_cost) as total_shipping_cost,
   sum(ws_net_profit) as total_net_profit
FROM web_sales ws1
JOIN customer_address ca ON (ws1.ws_ship_addr_sk = ca.ca_address_sk)
JOIN web_site s ON (ws1.ws_web_site_sk = s.web_site_sk)
JOIN date_dim d ON (ws1.ws_ship_date_sk = d.d_date_sk)
LEFT SEMI JOIN (SELECT ws2.ws_order_number as ws_order_number
   FROM web_sales ws2 JOIN web_sales ws3
   ON (ws2.ws_order_number = ws3.ws_order_number)
   WHERE ws2.ws_warehouse_sk <> ws3.ws_warehouse_sk
) ws_wh1
ON (ws1.ws_order_number = ws_wh1.ws_order_number)
LEFT OUTER JOIN web_returns wr1 ON (ws1.ws_order_number = wr1.wr_order_number)
WHERE d.d_date between '1999-05-01' and '1999-07-01' and
   ca.ca_state = 'TX' and
   s.web_company_name = 'pri' and
   wr1.wr_order_number is null
limit 100
{code}

Plan
{code}
OK
Time taken: 0.23 seconds
Warning: Map Join MAPJOIN[83][bigTable=ws1] in task 'Map 2' is a cross product
OK
STAGE DEPENDENCIES:
  Stage-1 is a root stage
  Stage-0 depends on stages: Stage-1

STAGE PLANS:
  Stage: Stage-1
Tez
  Edges:
        Map 2 <- Map 1 (BROADCAST_EDGE)
        Map 8 <- Reducer 4 (BROADCAST_EDGE)
        Reducer 3 <- Map 2 (SIMPLE_EDGE), Map 5 (BROADCAST_EDGE), Map 6 (BROADCAST_EDGE), Map 7 (SIMPLE_EDGE)
        Reducer 4 <- Map 10 (SIMPLE_EDGE), Reducer 3 (SIMPLE_EDGE)
        Reducer 9 <- Map 8 (SIMPLE_EDGE)
  DagName: mmokhtar_20150402132417_1bc8688b-59a0-4909-82a4-b9d386065bbd:3
  Vertices:
Map 1
Map Operator Tree:
TableScan
  alias: ws1
  filterExpr: (((ws_ship_addr_sk = ws_order_number) and 
(ws_ship_date_sk  ws_web_site_sk)) and ws_ship_addr_sk is not null) (type: 
boolean)
  Statistics: Num rows: 143966864 Data size: 33110363004 Basic 
stats: COMPLETE Column stats: COMPLETE
  Filter Operator
predicate: (((ws_ship_addr_sk = ws_order_number) and 
(ws_ship_date_sk  ws_web_site_sk)) and ws_ship_addr_sk is not null) (type: 
boolean)
Statistics: Num rows: 71974471 Data size: 1151483592 Basic 
stats: COMPLETE Column stats: COMPLETE
Select Operator
  expressions: ws_ship_addr_sk (type: int)
  outputColumnNames: _col1
  Statistics: Num rows: 71974471 Data size: 287862044 Basic 
stats: COMPLETE Column stats: COMPLETE
  Reduce Output Operator
sort order:
Statistics: Num rows: 71974471 Data size: 287862044 
Basic stats: COMPLETE Column stats: COMPLETE
value expressions: _col1 (type: int)
Execution mode: vectorized
Map 10
Map Operator Tree:
TableScan
  alias: wr1
  Statistics: Num rows: 13749816 Data size: 2585240312 Basic 
stats: COMPLETE Column stats: COMPLETE
  Reduce Output Operator
key expressions: wr_order_number (type: int)
sort order: +
Map-reduce partition columns: wr_order_number (type: int)
Statistics: Num rows: 13749816 Data size: 2585240312 Basic 
stats: COMPLETE Column stats: COMPLETE
Execution mode: vectorized
Map 2
Map Operator Tree:
TableScan
  alias: ws1
  Statistics: Num rows: 143966864 Data size: 33110363004 Basic 
stats: COMPLETE Column stats: COMPLETE
  Map Join Operator
condition map:
 Inner Join 0 to 1
keys:
  0
  1
outputColumnNames: _col1
input vertices:
  0 Map 1
Statistics: Num rows: 5180969438964472 Data size: 
20723877755857888 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
  expressions: _col1 (type: int)
  outputColumnNames: _col0
  Statistics: Num rows: 5180969438964472 Data size: 
20723877755857888 Basic stats: COMPLETE Column stats: COMPLETE
  Group By Operator
keys: _col0 (type: int)
mode: hash
   

[jira] [Created] (HIVE-10153) CBO (Calcite Return Path): TPC-DS Q15 inefficient join order

2015-03-30 Thread Mostafa Mokhtar (JIRA)
Mostafa Mokhtar created HIVE-10153:
--

 Summary: CBO (Calcite Return Path): TPC-DS Q15 inefficient join order 
 Key: HIVE-10153
 URL: https://issues.apache.org/jira/browse/HIVE-10153
 Project: Hive
  Issue Type: Bug
  Components: CBO
Affects Versions: cbo-branch
Reporter: Mostafa Mokhtar
Assignee: Laljo John Pullokkaran
 Fix For: cbo-branch


TPC-DS Q15 joins catalog_sales with date_dim last, where it should be the first 
join.

Query 
{code}
select  ca_zip
   ,sum(cs_sales_price)
 from catalog_sales
 ,customer
 ,customer_address
 ,date_dim
 where catalog_sales.cs_bill_customer_sk = customer.c_customer_sk
  and customer.c_current_addr_sk = customer_address.ca_address_sk 
  and ( substr(ca_zip,1,5) in ('85669', '86197','88274','83405','86475',
   '85392', '85460', '80348', '81792')
   or customer_address.ca_state in ('CA','WA','GA')
   or catalog_sales.cs_sales_price > 500)
  and catalog_sales.cs_sold_date_sk = date_dim.d_date_sk
  and date_dim.d_qoy = 2 and date_dim.d_year = 2000
 group by ca_zip
 order by ca_zip
 limit 100;
{code}

Logical plan 
{code}

HiveSort(fetch=[100]): rowcount = 7171.0, cumulative cost = 
{7.507729983730065E8 rows, 7.553113550983669E8 cpu, 9.08546638062188E10 io}, id 
= 2207
  HiveSort(sort0=[$0], dir0=[ASC]): rowcount = 7171.0, cumulative cost = 
{7.502636967200102E8 rows, 7.553041840983669E8 cpu, 9.08546638062188E10 io}, id 
= 2205
HiveAggregate(group=[{0}], agg#0=[sum($1)]): rowcount = 7171.0, cumulative 
cost = {7.497543950670139E8 rows, 7.552970130983669E8 cpu, 9.08546638062188E10 
io}, id = 2203
  HiveProject($f0=[$7], $f1=[$1]): rowcount = 272862.9537571146, cumulative 
cost = {7.494815321132567E8 rows, 7.518816625578996E8 cpu, 8.75951724E10 io}, 
id = 2201
HiveJoin(condition=[=($2, $8)], joinType=[inner], 
joinAlgorithm=[map_join], cost=[{1.36661031991844E8 rows, 1.3666116243648687E8 
cpu, 0.0 io}]): rowcount = 272862.9537571146, cumulative cost = 
{7.494815321132567E8 rows, 7.518816625578996E8 cpu, 8.75951724E10 io}, id = 2242
  HiveFilter(condition=[OR(in(substr($7, 1, 5), '85669', '86197', 
'88274', '83405', '86475', '85392', '85460', '80348', '81792'), in($6, 'CA', 
'WA', 'GA'), >($1, 5E2))]): rowcount = 1.3666090154720113E8, cumulative cost = 
{6.128205001214128E8 rows, 6.152205001214128E8 cpu, 8.75951724E10 io}, id = 2195
HiveJoin(condition=[=($4, $5)], joinType=[inner], 
joinAlgorithm=[map_join], cost=[{3.246707731214128E8 rows, 3.254707731214128E8 
cpu, 4.91951724E10 io}]): rowcount = 3.6605287632468826E8, cumulative cost = 
{6.128205001214128E8 rows, 6.152205001214128E8 cpu, 8.75951724E10 io}, id = 2238
  HiveJoin(condition=[=($0, $3)], joinType=[inner], 
joinAlgorithm=[map_join], cost=[{2.88149727E8 rows, 2.89749727E8 cpu, 3.84E10 
io}]): rowcount = 3.238707731214128E8, cumulative cost = {2.88149727E8 rows, 
2.89749727E8 cpu, 3.84E10 io}, id  = 

HiveTableScan(table=[[tpcds_bin_partitioned_orc_200_1.catalog_sales]]): 
rowcount = 2.86549727E8, cumulative cost = {0}, id = 2134

HiveTableScan(table=[[tpcds_bin_partitioned_orc_200_1.customer]]): rowcount = 
160.0, cumulative cost = {0}, id = 2135
  
HiveTableScan(table=[[tpcds_bin_partitioned_orc_200_1.customer_address]]): 
rowcount = 80.0, cumulative cost = {0}, id = 2137
  HiveFilter(condition=[AND(=($2, 2), =($1, 2000))]): rowcount = 
130.44464285714287, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 2197
HiveTableScan(table=[[tpcds_bin_partitioned_orc_200_1.date_dim]]): 
rowcount = 73049.0, cumulative cost = {0}, id = 2140
{code}

— Re-write 
{code}
with cs as 
 ( select cs_sales_price,cs_bill_customer_sk
 from catalog_sales
 ,date_dim
where  
  cs_sold_date_sk = d_date_sk
  and date_dim.d_qoy = 2 and d_year = 2000)
  select  ca_zip
   ,sum(cs_sales_price)
 from cs
 ,customer
 ,customer_address
 where cs.cs_bill_customer_sk = customer.c_customer_sk
  and customer.c_current_addr_sk = customer_address.ca_address_sk 
  and ( substr(ca_zip,1,5) in ('85669', '86197','88274','83405','86475',
   '85392', '85460', '80348', '81792')
   or customer_address.ca_state in ('CA','WA','GA')
   or cs.cs_sales_price > 500)
 group by ca_zip
 order by ca_zip
 limit 100
 {code}

— plan for re-write 
{code}
HiveSort(fetch=[100]): rowcount = 7171.0, cumulative cost = 
{2.9146011517152977E8 rows, 2.949706092384584E8 cpu, 3.261369809075945E9 io}, 
id = 1990
  HiveSort(sort0=[$0], dir0=[ASC]): rowcount = 7171.0, cumulative cost = 
{2.909508135185335E8 rows, 2.949634382384584E8 cpu, 3.261369809075945E9 io}, id 
= 1988
HiveAggregate(group=[{0}], agg#0=[sum($1)]): rowcount = 7171.0, cumulative 
cost = {2.904415118655373E8 rows, 2.949562672384584E8 cpu, 

[jira] [Created] (HIVE-10118) CBO :

2015-03-27 Thread Mostafa Mokhtar (JIRA)
Mostafa Mokhtar created HIVE-10118:
--

 Summary: CBO : 
 Key: HIVE-10118
 URL: https://issues.apache.org/jira/browse/HIVE-10118
 Project: Hive
  Issue Type: Sub-task
Reporter: Mostafa Mokhtar








[jira] [Created] (HIVE-10123) Hybrid grace Hash join : Use estimate key count from stats to initialize BytesBytesMultiHashMap

2015-03-27 Thread Mostafa Mokhtar (JIRA)
Mostafa Mokhtar created HIVE-10123:
--

 Summary: Hybrid grace Hash join : Use estimate key count from 
stats to initialize BytesBytesMultiHashMap
 Key: HIVE-10123
 URL: https://issues.apache.org/jira/browse/HIVE-10123
 Project: Hive
  Issue Type: Bug
  Components: Hive
Affects Versions: 1.2.0
Reporter: Mostafa Mokhtar
Assignee: Mostafa Mokhtar
 Fix For: 1.2.0


Hybrid grace Hash join is not using the estimated number of rows from the 
statistics to initialize BytesBytesMultiHashMap. 

Add some logging to BytesBytesMultiHashMap to track get probes, and use msec 
for expandAndRehash since us (microseconds) overflows.
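
A minimal sketch of the sizing idea, not the actual Hive code: the estimated key count would come from the operator's statistics, and the default capacity and safety cap below are assumptions.

{code}
// Sketch only: derive the hash table's initial capacity from a statistics-based
// key count estimate instead of a fixed default, clamping it so a bad
// over-estimate cannot allocate an unreasonably large table.
public final class HashTableSizing {
  private static final int DEFAULT_CAPACITY = 1024;   // assumed default
  private static final int MAX_CAPACITY = 1 << 28;    // assumed safety cap

  public static int initialCapacity(long estimatedKeyCount) {
    if (estimatedKeyCount <= 0) {
      return DEFAULT_CAPACITY;                        // no statistics: fall back
    }
    long clamped = Math.min(Math.max(estimatedKeyCount, DEFAULT_CAPACITY), MAX_CAPACITY);
    // round up to the next power of two so bucket masking stays cheap
    return Integer.highestOneBit((int) clamped - 1) << 1;
  }

  public static void main(String[] args) {
    // an estimate of 1,000,000 keys yields a 1,048,576-slot table
    System.out.println(initialCapacity(1_000_000L));
  }
}
{code}

The resulting capacity would then be fed to the BytesBytesMultiHashMap constructor in place of the hard-coded default.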





[jira] [Created] (HIVE-10106) Regression : Dynamic partition pruning not working after HIVE-9976

2015-03-26 Thread Mostafa Mokhtar (JIRA)
Mostafa Mokhtar created HIVE-10106:
--

 Summary: Regression : Dynamic partition pruning not working after 
HIVE-9976
 Key: HIVE-10106
 URL: https://issues.apache.org/jira/browse/HIVE-10106
 Project: Hive
  Issue Type: Bug
  Components: Hive
Affects Versions: 1.2.0
Reporter: Mostafa Mokhtar
Assignee: Siddharth Seth
 Fix For: 1.2.0


After HIVE-9976 got checked in, dynamic partition pruning doesn't work:
partitions are pruned during planning but later show up in the splits.





[jira] [Created] (HIVE-10107) Union All : Vertex missing stats resulting in OOM and inefficient plans

2015-03-26 Thread Mostafa Mokhtar (JIRA)
Mostafa Mokhtar created HIVE-10107:
--

 Summary: Union All : Vertex missing stats resulting in OOM and 
inefficient plans
 Key: HIVE-10107
 URL: https://issues.apache.org/jira/browse/HIVE-10107
 Project: Hive
  Issue Type: Bug
  Components: Physical Optimizer
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Prasanth Jayachandran
 Fix For: 1.2.0


Reducer Vertices sending data to a UNION ALL edge are missing statistics, and as 
a result we either use very few reducers in the UNION ALL edge or decide to 
broadcast the results of the UNION ALL.
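
A rough illustration, using made-up types rather than Hive's actual Statistics classes, of the fallback this asks for: when the union edge itself has no statistics, sum the statistics of the branches feeding it instead of treating the edge as empty.

{code}
import java.util.Arrays;
import java.util.List;

// Made-up stand-in for a vertex's statistics (row count and data size).
final class BranchStats {
  final long numRows;
  final long dataSize;
  BranchStats(long numRows, long dataSize) {
    this.numRows = numRows;
    this.dataSize = dataSize;
  }
}

final class UnionStats {
  // Sum the statistics of every branch flowing into the UNION ALL edge so the
  // downstream reducer parallelism can be derived from a non-zero estimate.
  static BranchStats merge(List<BranchStats> branches) {
    long rows = 0;
    long size = 0;
    for (BranchStats b : branches) {
      rows += b.numRows;
      size += b.dataSize;
    }
    return new BranchStats(rows, size);
  }

  public static void main(String[] args) {
    // example numbers only: two join branches of the UNION ALL
    BranchStats merged = merge(Arrays.asList(
        new BranchStats(50_000_000L, 400_000_000L),
        new BranchStats(50_000_000L, 400_000_000L)));
    System.out.println(merged.numRows + " rows, " + merged.dataSize + " bytes");
  }
}
{code}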

Query
{code}
select 
count(*) rowcount
from
(select 
ss_item_sk, ss_ticket_number, ss_store_sk
from
store_sales a, store_returns b
where
a.ss_item_sk = b.sr_item_sk
and a.ss_ticket_number = b.sr_ticket_number union all select 
ss_item_sk, ss_ticket_number, ss_store_sk
from
store_sales c, store_returns d
where
c.ss_item_sk = d.sr_item_sk
and c.ss_ticket_number = d.sr_ticket_number) t
group by t.ss_store_sk , t.ss_item_sk , t.ss_ticket_number
having rowcount > 1;
{code}

Plan snippet 
{code}
 Edges:
Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 5 (SIMPLE_EDGE), Union 3 (CONTAINS)
Reducer 4 <- Union 3 (SIMPLE_EDGE)
Reducer 7 <- Map 6 (SIMPLE_EDGE), Map 8 (SIMPLE_EDGE), Union 3 (CONTAINS)

  Reducer 4
Reduce Operator Tree:
  Group By Operator
aggregations: count(VALUE._col0)
keys: KEY._col0 (type: int), KEY._col1 (type: int), KEY._col2 
(type: int)
mode: mergepartial
outputColumnNames: _col0, _col1, _col2, _col3
Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE 
Column stats: COMPLETE
Filter Operator
  predicate: (_col3 > 1) (type: boolean)
  Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column 
stats: COMPLETE
  Select Operator
expressions: _col3 (type: bigint)
outputColumnNames: _col0
Statistics: Num rows: 0 Data size: 0 Basic stats: NONE 
Column stats: COMPLETE
File Output Operator
  compressed: false
  Statistics: Num rows: 0 Data size: 0 Basic stats: NONE 
Column stats: COMPLETE
  table:
  input format: org.apache.hadoop.mapred.TextInputFormat
  output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
  serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
Reducer 7
Reduce Operator Tree:
  Merge Join Operator
condition map:
 Inner Join 0 to 1
keys:
  0 ss_item_sk (type: int), ss_ticket_number (type: int)
  1 sr_item_sk (type: int), sr_ticket_number (type: int)
outputColumnNames: _col1, _col6, _col8, _col27, _col34
Filter Operator
  predicate: ((_col1 = _col27) and (_col8 = _col34)) (type: 
boolean)
  Select Operator
expressions: _col1 (type: int), _col8 (type: int), _col6 
(type: int)
outputColumnNames: _col0, _col1, _col2
Group By Operator
  aggregations: count()
  keys: _col2 (type: int), _col0 (type: int), _col1 (type: 
int)
  mode: hash
  outputColumnNames: _col0, _col1, _col2, _col3
  Reduce Output Operator
key expressions: _col0 (type: int), _col1 (type: int), 
_col2 (type: int)
sort order: +++
Map-reduce partition columns: _col0 (type: int), _col1 
(type: int), _col2 (type: int)
value expressions: _col3 (type: bigint)
{code}

The full explain plan 
{code}
STAGE DEPENDENCIES:
  Stage-1 is a root stage
  Stage-0 depends on stages: Stage-1

STAGE PLANS:
  Stage: Stage-1
Tez
  Edges:
Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 5 (SIMPLE_EDGE), Union 3 (CONTAINS)
Reducer 4 <- Union 3 (SIMPLE_EDGE)
Reducer 7 <- Map 6 (SIMPLE_EDGE), Map 8 (SIMPLE_EDGE), Union 3 (CONTAINS)
  DagName: mmokhtar_20150214132727_95878ea1-ee6a-4b7e-bc86-843abd5cf664:7
  Vertices:
Map 1
Map Operator Tree:
TableScan
  alias: a
  filterExpr: (ss_item_sk is not null and ss_ticket_number is 
not null) (type: boolean)
  Statistics: Num rows: 550076554 Data size: 47370018896 Basic 
stats: COMPLETE Column stats: COMPLETE
  Filter Operator

[jira] [Commented] (HIVE-9647) Discrepancy in cardinality estimates between partitioned and un-partitioned tables

2015-02-18 Thread Mostafa Mokhtar (JIRA)

[ 
https://issues.apache.org/jira/browse/HIVE-9647?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanelfocusedCommentId=14326844#comment-14326844
 ] 

Mostafa Mokhtar commented on HIVE-9647:
---

Awesome :) 
I am happy we can fix this.






 Discrepancy in cardinality estimates between partitioned and un-partitioned 
 tables 
 ---

 Key: HIVE-9647
 URL: https://issues.apache.org/jira/browse/HIVE-9647
 Project: Hive
  Issue Type: Bug
  Components: CBO
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Pengcheng Xiong
 Fix For: 1.2.0

 Attachments: HIVE-9647.01.patch


 High-level summary
 HiveRelMdSelectivity.computeInnerJoinSelectivity relies on per column number 
 of distinct value to estimate join selectivity.
 The way statistics are aggregated for partitioned tables results in 
 discrepancy in number of distinct values which results in different plans 
 between partitioned and un-partitioned schemas.
 The table below summarizes the NDVs in computeInnerJoinSelectivity which are 
 used to estimate selectivity of joins.
 ||Column||Partitioned count distincts||Un-Partitioned count distincts||
 |sr_customer_sk|71,245|1,415,625|
 |sr_item_sk|38,846|62,562|
 |sr_ticket_number|71,245|34,931,085|
 |ss_customer_sk|88,476|1,415,625|
 |ss_item_sk|38,846|62,562|
 |ss_ticket_number|100,756|56,256,175|
 The discrepancy is because NDV calculation for a partitioned table assumes 
 that the NDV range is contained within each partition and is calculated as 
 "select max(NUM_DISTINCTS) from PART_COL_STATS".
 This is problematic for columns like ticket number, which naturally increase 
 with the partitioning date column ss_sold_date_sk.
 Suggestions
 Use HyperLogLog as suggested by Gopal; there is an HLL implementation for 
 HBase co-processors which we can use as a reference here.
 Using the global stats from TAB_COL_STATS and the per-partition stats from 
 PART_COL_STATS, extrapolate the NDV for the qualified partitions as in:
 Max ( (NUM_DISTINCTS from TAB_COL_STATS) x (Number of qualified partitions) / 
 (Number of Partitions), max(NUM_DISTINCTS) from PART_COL_STATS))
 More details
 While doing TPC-DS Partitioned vs. Un-Partitioned runs I noticed that many of 
 the plans are different; I then dumped the CBO logical plan and found that the 
 join estimates are drastically different.
 Unpartitioned schema :
 {code}
 2015-02-10 11:33:27,624 DEBUG [main]: parse.SemanticAnalyzer 
 (SemanticAnalyzer.java:apply(12624)) - Plan After Join Reordering:
 HiveProjectRel(store_sales_quantitycount=[$0], store_sales_quantityave=[$1], 
 store_sales_quantitystdev=[$2], store_sales_quantitycov=[/($2, $1)], 
 as_store_returns_quantitycount=[$3], as_store_returns_quantityave=[$4], 
 as_store_returns_quantitystdev=[$5], store_returns_quantitycov=[/($5, $4)]): 
 rowcount = 1.0, cumulative cost = {6.056835407771381E8 rows, 0.0 cpu, 0.0 
 io}, id = 2956
   HiveAggregateRel(group=[{}], agg#0=[count($0)], agg#1=[avg($0)], 
 agg#2=[stddev_samp($0)], agg#3=[count($1)], agg#4=[avg($1)], 
 agg#5=[stddev_samp($1)]): rowcount = 1.0, cumulative cost = 
 {6.056835407771381E8 rows, 0.0 cpu, 0.0 io}, id = 2954
 HiveProjectRel($f0=[$4], $f1=[$8]): rowcount = 40.05611776795562, 
 cumulative cost = {6.056835407771381E8 rows, 0.0 cpu, 0.0 io}, id = 2952
   HiveProjectRel(ss_sold_date_sk=[$0], ss_item_sk=[$1], 
 ss_customer_sk=[$2], ss_ticket_number=[$3], ss_quantity=[$4], 
 sr_item_sk=[$5], sr_customer_sk=[$6], sr_ticket_number=[$7], 
 sr_return_quantity=[$8], d_date_sk=[$9], d_quarter_name=[$10]): rowcount = 
 40.05611776795562, cumulative cost = {6.056835407771381E8 rows, 0.0 cpu, 0.0 
 io}, id = 2982
 HiveJoinRel(condition=[=($9, $0)], joinType=[inner]): rowcount = 
 40.05611776795562, cumulative cost = {6.056835407771381E8 rows, 0.0 cpu, 0.0 
 io}, id = 2980
   HiveJoinRel(condition=[AND(AND(=($2, $6), =($1, $5)), =($3, $7))], 
 joinType=[inner]): rowcount = 28880.460910696, cumulative cost = 
 {6.05654559E8 rows, 0.0 cpu, 0.0 io}, id = 2964
 HiveProjectRel(ss_sold_date_sk=[$0], ss_item_sk=[$2], 
 ss_customer_sk=[$3], ss_ticket_number=[$9], ss_quantity=[$10]): rowcount = 
 5.50076554E8, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 2920
   HiveTableScanRel(table=[[tpcds_bin_orc_200.store_sales]]): 
 rowcount = 5.50076554E8, cumulative cost = {0}, id = 2822
 HiveProjectRel(sr_item_sk=[$2], sr_customer_sk=[$3], 
 sr_ticket_number=[$9], sr_return_quantity=[$10]): rowcount = 5.5578005E7, 
 cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 2923
   HiveTableScanRel(table=[[tpcds_bin_orc_200.store_returns]]): 
 rowcount = 5.5578005E7, cumulative 

[jira] [Created] (HIVE-9713) CBO : inefficient join order created for left join outer condition

2015-02-17 Thread Mostafa Mokhtar (JIRA)
Mostafa Mokhtar created HIVE-9713:
-

 Summary: CBO : inefficient join order created for left join outer 
condition
 Key: HIVE-9713
 URL: https://issues.apache.org/jira/browse/HIVE-9713
 Project: Hive
  Issue Type: Bug
  Components: CBO
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Laljo John Pullokkaran
 Fix For: 1.2.0


For the query below, which is a subset of TPC-DS Query 66, CBO joins 
catalog_sales with catalog_returns first although the CE (cardinality estimate) 
of the join is relatively high.
catalog_sales should be joined with the selective dimension tables first.

{code}
select count(*)
from
  catalog_sales
 ,warehouse
 ,date_dim
 ,time_dim
 ,ship_mode
 where
catalog_sales.cs_warehouse_sk =  warehouse.w_warehouse_sk
and catalog_sales.cs_sold_date_sk = date_dim.d_date_sk
and catalog_sales.cs_sold_time_sk = time_dim.t_time_sk
and catalog_sales.cs_ship_mode_sk = ship_mode.sm_ship_mode_sk
and d_year = 2002
and t_time between 49530 AND 49530+28800 
and sm_carrier in ('DIAMOND','AIRBORNE')
 group by 
w_warehouse_name
,w_warehouse_sq_ft
,w_city
,w_county
,w_state
,w_country
   ,d_year
{code}


Logical plan from CBO debug logs 
{code}
2015-02-17 22:34:04,577 DEBUG [main]: parse.CalcitePlanner 
(CalcitePlanner.java:apply(743)) - Plan After Join Reordering:
HiveProject(catalog_page_id=[$0], sales=[$1], returns=[$2], profit=[$3]): 
rowcount = 10590.0, cumulative cost = {8.25242586823495E15 rows, 0.0 cpu, 0.0 
io}, id = 1395
  HiveAggregate(group=[{0}], agg#0=[sum($1)], agg#1=[sum($2)], 
agg#2=[sum($3)]): rowcount = 10590.0, cumulative cost = {8.25242586823495E15 
rows, 0.0 cpu, 0.0 io}, id = 1393
HiveProject($f0=[$14], $f1=[$5], $f2=[coalesce($9, 0)], $f3=[-($6, 
coalesce($10, 0))]): rowcount = 1.368586152225262E8, cumulative cost = 
{8.25242586823495E15 rows, 0.0 cpu, 0.0 io}, id = 1391
  HiveJoin(condition=[=($3, $17)], joinType=[inner]): rowcount = 
1.368586152225262E8, cumulative cost = {8.25242586823495E15 rows, 0.0 cpu, 0.0 
io}, id = 1508
HiveJoin(condition=[=($2, $15)], joinType=[inner]): rowcount = 
2.737172304450524E8, cumulative cost = {8.252425594517495E15 rows, 0.0 cpu, 0.0 
io}, id = 1506
  HiveJoin(condition=[=($1, $13)], joinType=[inner]): rowcount = 
8.211516913351573E8, cumulative cost = {8.252424773349804E15 rows, 0.0 cpu, 0.0 
io}, id = 1504
HiveJoin(condition=[=($0, $11)], joinType=[inner]): rowcount = 
1.1296953399027347E11, cumulative cost = {8.252311803804096E15 rows, 0.0 cpu, 
0.0 io}, id = 1418
  HiveJoin(condition=[AND(=($2, $7), =($4, $8))], joinType=[left]): 
rowcount = 8.252311488455487E15, cumulative cost = {3.15348608E8 rows, 0.0 cpu, 
0.0 io}, id = 1413
HiveProject(cs_sold_date_sk=[$0], cs_catalog_page_sk=[$12], 
cs_item_sk=[$15], cs_promo_sk=[$16], cs_order_number=[$17], 
cs_ext_sales_price=[$23], cs_net_profit=[$33]): rowcount = 2.86549727E8, 
cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 1324
  HiveTableScan(table=[[tpcds_bin_orc_200.catalog_sales]]): 
rowcount = 2.86549727E8, cumulative cost = {0}, id = 1136
HiveProject(cr_item_sk=[$2], cr_order_number=[$16], 
cr_return_amount=[$18], cr_net_loss=[$26]): rowcount = 2.8798881E7, cumulative 
cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 1327
  HiveTableScan(table=[[tpcds_bin_orc_200.catalog_returns]]): 
rowcount = 2.8798881E7, cumulative cost = {0}, id = 1137
  HiveProject(d_date_sk=[$0], d_date=[$2]): rowcount = 1.0, 
cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 1371
HiveFilter(condition=[between(false, $2, 
CAST('1998-08-04'):DATE, CAST('1998-09-04'):DATE)]): rowcount = 1.0, cumulative 
cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 1369
  HiveTableScan(table=[[tpcds_bin_orc_200.date_dim]]): rowcount 
= 73049.0, cumulative cost = {0}, id = 1138
HiveProject(cp_catalog_page_sk=[$0], cp_catalog_page_id=[$1]): 
rowcount = 11718.0, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 1375
  HiveTableScan(table=[[tpcds_bin_orc_200.catalog_page]]): rowcount 
= 11718.0, cumulative cost = {0}, id = 1139
  HiveProject(i_item_sk=[$0], i_current_price=[$5]): rowcount = 
16000.0, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 1381
HiveFilter(condition=[>($5, 5E1)]): rowcount = 16000.0, cumulative 
cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 1379
  HiveTableScan(table=[[tpcds_bin_orc_200.item]]): rowcount = 
48000.0, cumulative cost = {0}, id = 1140
HiveProject(p_promo_sk=[$0], p_channel_tv=[$11]): rowcount = 225.0, 
cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 1387
  HiveFilter(condition=[=($11, 'N')]): 

[jira] [Updated] (HIVE-9713) CBO : inefficient join order created for left join outer condition

2015-02-17 Thread Mostafa Mokhtar (JIRA)

 [ 
https://issues.apache.org/jira/browse/HIVE-9713?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Mostafa Mokhtar updated HIVE-9713:
--
Description: 
For the query below, which is a subset of TPC-DS Query 80, CBO joins 
catalog_sales with catalog_returns first although the CE (cardinality estimate) 
of the join is relatively high.
catalog_sales should be joined with the selective dimension tables first.

{code}
select count(*)
from
  catalog_sales
 ,warehouse
 ,date_dim
 ,time_dim
 ,ship_mode
 where
catalog_sales.cs_warehouse_sk =  warehouse.w_warehouse_sk
and catalog_sales.cs_sold_date_sk = date_dim.d_date_sk
and catalog_sales.cs_sold_time_sk = time_dim.t_time_sk
and catalog_sales.cs_ship_mode_sk = ship_mode.sm_ship_mode_sk
and d_year = 2002
and t_time between 49530 AND 49530+28800 
and sm_carrier in ('DIAMOND','AIRBORNE')
 group by 
w_warehouse_name
,w_warehouse_sq_ft
,w_city
,w_county
,w_state
,w_country
   ,d_year
{code}


Logical plan from CBO debug logs 
{code}
2015-02-17 22:34:04,577 DEBUG [main]: parse.CalcitePlanner 
(CalcitePlanner.java:apply(743)) - Plan After Join Reordering:
HiveProject(catalog_page_id=[$0], sales=[$1], returns=[$2], profit=[$3]): 
rowcount = 10590.0, cumulative cost = {8.25242586823495E15 rows, 0.0 cpu, 0.0 
io}, id = 1395
  HiveAggregate(group=[{0}], agg#0=[sum($1)], agg#1=[sum($2)], 
agg#2=[sum($3)]): rowcount = 10590.0, cumulative cost = {8.25242586823495E15 
rows, 0.0 cpu, 0.0 io}, id = 1393
HiveProject($f0=[$14], $f1=[$5], $f2=[coalesce($9, 0)], $f3=[-($6, 
coalesce($10, 0))]): rowcount = 1.368586152225262E8, cumulative cost = 
{8.25242586823495E15 rows, 0.0 cpu, 0.0 io}, id = 1391
  HiveJoin(condition=[=($3, $17)], joinType=[inner]): rowcount = 
1.368586152225262E8, cumulative cost = {8.25242586823495E15 rows, 0.0 cpu, 0.0 
io}, id = 1508
HiveJoin(condition=[=($2, $15)], joinType=[inner]): rowcount = 
2.737172304450524E8, cumulative cost = {8.252425594517495E15 rows, 0.0 cpu, 0.0 
io}, id = 1506
  HiveJoin(condition=[=($1, $13)], joinType=[inner]): rowcount = 
8.211516913351573E8, cumulative cost = {8.252424773349804E15 rows, 0.0 cpu, 0.0 
io}, id = 1504
HiveJoin(condition=[=($0, $11)], joinType=[inner]): rowcount = 
1.1296953399027347E11, cumulative cost = {8.252311803804096E15 rows, 0.0 cpu, 
0.0 io}, id = 1418
  HiveJoin(condition=[AND(=($2, $7), =($4, $8))], joinType=[left]): 
rowcount = 8.252311488455487E15, cumulative cost = {3.15348608E8 rows, 0.0 cpu, 
0.0 io}, id = 1413
HiveProject(cs_sold_date_sk=[$0], cs_catalog_page_sk=[$12], 
cs_item_sk=[$15], cs_promo_sk=[$16], cs_order_number=[$17], 
cs_ext_sales_price=[$23], cs_net_profit=[$33]): rowcount = 2.86549727E8, 
cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 1324
  HiveTableScan(table=[[tpcds_bin_orc_200.catalog_sales]]): 
rowcount = 2.86549727E8, cumulative cost = {0}, id = 1136
HiveProject(cr_item_sk=[$2], cr_order_number=[$16], 
cr_return_amount=[$18], cr_net_loss=[$26]): rowcount = 2.8798881E7, cumulative 
cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 1327
  HiveTableScan(table=[[tpcds_bin_orc_200.catalog_returns]]): 
rowcount = 2.8798881E7, cumulative cost = {0}, id = 1137
  HiveProject(d_date_sk=[$0], d_date=[$2]): rowcount = 1.0, 
cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 1371
HiveFilter(condition=[between(false, $2, 
CAST('1998-08-04'):DATE, CAST('1998-09-04'):DATE)]): rowcount = 1.0, cumulative 
cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 1369
  HiveTableScan(table=[[tpcds_bin_orc_200.date_dim]]): rowcount 
= 73049.0, cumulative cost = {0}, id = 1138
HiveProject(cp_catalog_page_sk=[$0], cp_catalog_page_id=[$1]): 
rowcount = 11718.0, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 1375
  HiveTableScan(table=[[tpcds_bin_orc_200.catalog_page]]): rowcount 
= 11718.0, cumulative cost = {0}, id = 1139
  HiveProject(i_item_sk=[$0], i_current_price=[$5]): rowcount = 
16000.0, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 1381
HiveFilter(condition=[>($5, 5E1)]): rowcount = 16000.0, cumulative 
cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 1379
  HiveTableScan(table=[[tpcds_bin_orc_200.item]]): rowcount = 
48000.0, cumulative cost = {0}, id = 1140
HiveProject(p_promo_sk=[$0], p_channel_tv=[$11]): rowcount = 225.0, 
cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 1387
  HiveFilter(condition=[=($11, 'N')]): rowcount = 225.0, cumulative 
cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 1385
HiveTableScan(table=[[tpcds_bin_orc_200.promotion]]): rowcount = 
450.0, cumulative cost = {0}, id = 1141
{code}

Explain plan 
{code}
STAGE DEPENDENCIES:
  Stage-1 is a root stage
  

[jira] [Created] (HIVE-9714) Physical optimizer : Under estimation for left join outer condition

2015-02-17 Thread Mostafa Mokhtar (JIRA)
Mostafa Mokhtar created HIVE-9714:
-

 Summary: Physical optimizer : Under estimation for left join outer 
condition 
 Key: HIVE-9714
 URL: https://issues.apache.org/jira/browse/HIVE-9714
 Project: Hive
  Issue Type: Bug
  Components: Physical Optimizer
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Prasanth Jayachandran


For the query below, which is a subset of TPC-DS Query 80, CBO joins 
catalog_sales with catalog_returns first although the CE of the join is 
relatively high.
The physical optimizer estimates 2,911 rows for catalog_sales left outer join 
catalog_returns, while the actual is 285,470,584 rows.

The database used was un-partitioned.

{code}
select count(*)
from
  catalog_sales
 ,warehouse
 ,date_dim
 ,time_dim
 ,ship_mode
 where
catalog_sales.cs_warehouse_sk =  warehouse.w_warehouse_sk
and catalog_sales.cs_sold_date_sk = date_dim.d_date_sk
and catalog_sales.cs_sold_time_sk = time_dim.t_time_sk
and catalog_sales.cs_ship_mode_sk = ship_mode.sm_ship_mode_sk
and d_year = 2002
and t_time between 49530 AND 49530+28800 
and sm_carrier in ('DIAMOND','AIRBORNE')
 group by 
w_warehouse_name
,w_warehouse_sq_ft
,w_city
,w_county
,w_state
,w_country
   ,d_year
{code}

Explain plan 
{code}
STAGE DEPENDENCIES:
  Stage-1 is a root stage
  Stage-0 depends on stages: Stage-1

STAGE PLANS:
  Stage: Stage-1
Tez
  Edges:
Map 1 <- Map 2 (BROADCAST_EDGE)
Map 3 <- Map 1 (BROADCAST_EDGE)
Map 4 <- Map 3 (BROADCAST_EDGE), Map 6 (BROADCAST_EDGE), Map 7 (BROADCAST_EDGE)
Reducer 5 <- Map 4 (SIMPLE_EDGE)
  DagName: mmokhtar_20150217223434_d0ab6fa9-a1a3-47a5-8138-ba7435d9aea5:4
  Vertices:
Map 1
Map Operator Tree:
TableScan
  alias: catalog_sales
  filterExpr: (((cs_sold_date_sk is not null and 
cs_catalog_page_sk is not null) and cs_item_sk is not null) and cs_promo_sk is 
not null) (type: boolean)
  Statistics: Num rows: 286549727 Data size: 65825832570 Basic 
stats: COMPLETE Column stats: COMPLETE
  Filter Operator
predicate: (((cs_sold_date_sk is not null and 
cs_catalog_page_sk is not null) and cs_item_sk is not null) and cs_promo_sk is 
not null) (type: boolean)
Statistics: Num rows: 285112475 Data size: 7974560516 Basic 
stats: COMPLETE Column stats: COMPLETE
Select Operator
  expressions: cs_sold_date_sk (type: int), 
cs_catalog_page_sk (type: int), cs_item_sk (type: int), cs_promo_sk (type: 
int), cs_order_number (type: int), cs_ext_sales_price (type: float), 
cs_net_profit (type: float)
  outputColumnNames: _col0, _col1, _col2, _col3, _col4, 
_col5, _col6
  Statistics: Num rows: 285112475 Data size: 7974560516 
Basic stats: COMPLETE Column stats: COMPLETE
  Map Join Operator
condition map:
 Left Outer Join0 to 1
keys:
  0 _col2 (type: int), _col4 (type: int)
  1 _col0 (type: int), _col1 (type: int)
outputColumnNames: _col0, _col1, _col2, _col3, _col5, 
_col6, _col9, _col10
input vertices:
  1 Map 2
Statistics: Num rows: 2911 Data size: 93152 Basic 
stats: COMPLETE Column stats: COMPLETE
Reduce Output Operator
  key expressions: _col0 (type: int)
  sort order: +
  Map-reduce partition columns: _col0 (type: int)
  Statistics: Num rows: 2911 Data size: 93152 Basic 
stats: COMPLETE Column stats: COMPLETE
  value expressions: _col1 (type: int), _col2 (type: 
int), _col3 (type: int), _col5 (type: float), _col6 (type: float), _col9 (type: 
float), _col10 (type: float)
Execution mode: vectorized
Map 2
Map Operator Tree:
TableScan
  alias: catalog_returns
  filterExpr: cr_item_sk is not null (type: boolean)
  Statistics: Num rows: 28798881 Data size: 5764329494 Basic 
stats: COMPLETE Column stats: COMPLETE
  Filter Operator
predicate: cr_item_sk is not null (type: boolean)
Statistics: Num rows: 28798881 Data size: 456171072 Basic 
stats: COMPLETE Column stats: COMPLETE
Select Operator
  expressions: cr_item_sk (type: int), cr_order_number 
(type: int), cr_return_amount (type: float), cr_net_loss 

[jira] [Created] (HIVE-9712) Hive : Row count and data size are set to LONG.MAX when filter is applied on an estimate of 0

2015-02-17 Thread Mostafa Mokhtar (JIRA)
Mostafa Mokhtar created HIVE-9712:
-

 Summary: Hive : Row count and data size are set to LONG.MAX when 
filter is applied on an estimate of 0
 Key: HIVE-9712
 URL: https://issues.apache.org/jira/browse/HIVE-9712
 Project: Hive
  Issue Type: Bug
  Components: Physical Optimizer
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Prasanth Jayachandran


TPC-DS Q66 generates an inefficient plan because the cardinality estimate of the 
dimension table gets set to 9223372036854775807.

{code}
Map 10 
Map Operator Tree:
TableScan
  alias: ship_mode
  filterExpr: ((sm_carrier) IN ('DIAMOND', 'AIRBORNE') and 
sm_ship_mode_sk is not null) (type: boolean)
  Statistics: Num rows: 0 Data size: 47 Basic stats: PARTIAL 
Column stats: COMPLETE
  Filter Operator
predicate: ((sm_carrier) IN ('DIAMOND', 'AIRBORNE') and 
sm_ship_mode_sk is not null) (type: boolean)
Statistics: Num rows: 9223372036854775807 Data size: 
9223372036854775807 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
  expressions: sm_ship_mode_sk (type: int)
  outputColumnNames: _col0
  Statistics: Num rows: 9223372036854775807 Data size: 
9223372036854775807 Basic stats: COMPLETE Column stats: COMPLETE
  Reduce Output Operator
key expressions: _col0 (type: int)
sort order: +
Map-reduce partition columns: _col0 (type: int)
Statistics: Num rows: 9223372036854775807 Data size: 
9223372036854775807 Basic stats: COMPLETE Column stats: COMPLETE
Execution mode: vectorized
{code}
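
One possible direction, sketched with hypothetical names rather than the actual Hive estimator: clamp the post-filter estimate so a 0-row (or unknown) input can never blow up to Long.MAX_VALUE.

{code}
// Sketch only: apply a filter's selectivity to an input row count while
// guarding against the 0-row case that currently ends up as Long.MAX_VALUE.
public final class FilterEstimate {
  public static long applyFilter(long inputRows, double selectivity) {
    if (inputRows <= 0) {
      // empty or unknown input: keep a minimal, safe estimate
      return 1;
    }
    long estimate = (long) Math.ceil(inputRows * selectivity);
    // never estimate more output rows than input rows, and never 0
    return Math.max(1, Math.min(estimate, inputRows));
  }

  public static void main(String[] args) {
    System.out.println(applyFilter(0, 0.5));   // 1, not 9223372036854775807
    System.out.println(applyFilter(47, 0.5));  // 24
  }
}
{code}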

Full plan 
{code}
explain  

select   
 w_warehouse_name
,w_warehouse_sq_ft
,w_city
,w_county
,w_state
,w_country
,ship_carriers
,year
,sum(jan_sales) as jan_sales
,sum(feb_sales) as feb_sales
,sum(mar_sales) as mar_sales
,sum(apr_sales) as apr_sales
,sum(may_sales) as may_sales
,sum(jun_sales) as jun_sales
,sum(jul_sales) as jul_sales
,sum(aug_sales) as aug_sales
,sum(sep_sales) as sep_sales
,sum(oct_sales) as oct_sales
,sum(nov_sales) as nov_sales
,sum(dec_sales) as dec_sales
,sum(jan_sales/w_warehouse_sq_ft) as jan_sales_per_sq_foot
,sum(feb_sales/w_warehouse_sq_ft) as feb_sales_per_sq_foot
,sum(mar_sales/w_warehouse_sq_ft) as mar_sales_per_sq_foot
,sum(apr_sales/w_warehouse_sq_ft) as apr_sales_per_sq_foot
,sum(may_sales/w_warehouse_sq_ft) as may_sales_per_sq_foot
,sum(jun_sales/w_warehouse_sq_ft) as jun_sales_per_sq_foot
,sum(jul_sales/w_warehouse_sq_ft) as jul_sales_per_sq_foot
,sum(aug_sales/w_warehouse_sq_ft) as aug_sales_per_sq_foot
,sum(sep_sales/w_warehouse_sq_ft) as sep_sales_per_sq_foot
,sum(oct_sales/w_warehouse_sq_ft) as oct_sales_per_sq_foot
,sum(nov_sales/w_warehouse_sq_ft) as nov_sales_per_sq_foot
,sum(dec_sales/w_warehouse_sq_ft) as dec_sales_per_sq_foot
,sum(jan_net) as jan_net
,sum(feb_net) as feb_net
,sum(mar_net) as mar_net
,sum(apr_net) as apr_net
,sum(may_net) as may_net
,sum(jun_net) as jun_net
,sum(jul_net) as jul_net
,sum(aug_net) as aug_net
,sum(sep_net) as sep_net
,sum(oct_net) as oct_net
,sum(nov_net) as nov_net
,sum(dec_net) as dec_net
 from (
select 
w_warehouse_name
,w_warehouse_sq_ft
,w_city
,w_county
,w_state
,w_country
,concat('DIAMOND', ',', 'AIRBORNE') as ship_carriers
,d_year as year
,sum(case when d_moy = 1 
then ws_sales_price* ws_quantity else 0 end) as jan_sales
,sum(case when d_moy = 2 
then ws_sales_price* ws_quantity else 0 end) as feb_sales
,sum(case when d_moy = 3 
then ws_sales_price* ws_quantity else 0 end) as mar_sales
,sum(case when d_moy = 4 
then ws_sales_price* ws_quantity else 0 end) as apr_sales
,sum(case when d_moy = 5 
then ws_sales_price* ws_quantity else 0 end) as may_sales
,sum(case when d_moy = 6 
then ws_sales_price* ws_quantity else 0 end) as jun_sales
,sum(case when d_moy = 7 
then ws_sales_price* ws_quantity else 0 end) as jul_sales
,sum(case when d_moy = 8 
then ws_sales_price* ws_quantity else 0 end) as aug_sales
,sum(case when d_moy = 9 
then ws_sales_price* ws_quantity else 0 end) as sep_sales

[jira] [Updated] (HIVE-9712) Hive : Row count and data size are set to LONG.MAX when source table has 0 rows

2015-02-17 Thread Mostafa Mokhtar (JIRA)

 [ 
https://issues.apache.org/jira/browse/HIVE-9712?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Mostafa Mokhtar updated HIVE-9712:
--
Summary: Hive : Row count and data size are set to LONG.MAX when source 
table has 0 rows  (was: Hive : Row count and data size are set to LONG.MAX when 
filter is applied on an estimate of 0)

 Hive : Row count and data size are set to LONG.MAX when source table has 0 
 rows
 ---

 Key: HIVE-9712
 URL: https://issues.apache.org/jira/browse/HIVE-9712
 Project: Hive
  Issue Type: Bug
  Components: Physical Optimizer
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Prasanth Jayachandran

 TPC-DS Q66 generates an inefficient plan because the cardinality estimate of 
 the dimension table gets set to 9223372036854775807.
 {code}
 Map 10 
 Map Operator Tree:
 TableScan
   alias: ship_mode
   filterExpr: ((sm_carrier) IN ('DIAMOND', 'AIRBORNE') and 
 sm_ship_mode_sk is not null) (type: boolean)
   Statistics: Num rows: 0 Data size: 47 Basic stats: PARTIAL 
 Column stats: COMPLETE
   Filter Operator
 predicate: ((sm_carrier) IN ('DIAMOND', 'AIRBORNE') and 
 sm_ship_mode_sk is not null) (type: boolean)
 Statistics: Num rows: 9223372036854775807 Data size: 
 9223372036854775807 Basic stats: COMPLETE Column stats: COMPLETE
 Select Operator
   expressions: sm_ship_mode_sk (type: int)
   outputColumnNames: _col0
   Statistics: Num rows: 9223372036854775807 Data size: 
 9223372036854775807 Basic stats: COMPLETE Column stats: COMPLETE
   Reduce Output Operator
 key expressions: _col0 (type: int)
 sort order: +
 Map-reduce partition columns: _col0 (type: int)
 Statistics: Num rows: 9223372036854775807 Data size: 
 9223372036854775807 Basic stats: COMPLETE Column stats: COMPLETE
 Execution mode: vectorized
 {code}
 Full plan 
 {code}
 explain  
 select   
  w_warehouse_name
   ,w_warehouse_sq_ft
   ,w_city
   ,w_county
   ,w_state
   ,w_country
 ,ship_carriers
 ,year
   ,sum(jan_sales) as jan_sales
   ,sum(feb_sales) as feb_sales
   ,sum(mar_sales) as mar_sales
   ,sum(apr_sales) as apr_sales
   ,sum(may_sales) as may_sales
   ,sum(jun_sales) as jun_sales
   ,sum(jul_sales) as jul_sales
   ,sum(aug_sales) as aug_sales
   ,sum(sep_sales) as sep_sales
   ,sum(oct_sales) as oct_sales
   ,sum(nov_sales) as nov_sales
   ,sum(dec_sales) as dec_sales
   ,sum(jan_sales/w_warehouse_sq_ft) as jan_sales_per_sq_foot
   ,sum(feb_sales/w_warehouse_sq_ft) as feb_sales_per_sq_foot
   ,sum(mar_sales/w_warehouse_sq_ft) as mar_sales_per_sq_foot
   ,sum(apr_sales/w_warehouse_sq_ft) as apr_sales_per_sq_foot
   ,sum(may_sales/w_warehouse_sq_ft) as may_sales_per_sq_foot
   ,sum(jun_sales/w_warehouse_sq_ft) as jun_sales_per_sq_foot
   ,sum(jul_sales/w_warehouse_sq_ft) as jul_sales_per_sq_foot
   ,sum(aug_sales/w_warehouse_sq_ft) as aug_sales_per_sq_foot
   ,sum(sep_sales/w_warehouse_sq_ft) as sep_sales_per_sq_foot
   ,sum(oct_sales/w_warehouse_sq_ft) as oct_sales_per_sq_foot
   ,sum(nov_sales/w_warehouse_sq_ft) as nov_sales_per_sq_foot
   ,sum(dec_sales/w_warehouse_sq_ft) as dec_sales_per_sq_foot
   ,sum(jan_net) as jan_net
   ,sum(feb_net) as feb_net
   ,sum(mar_net) as mar_net
   ,sum(apr_net) as apr_net
   ,sum(may_net) as may_net
   ,sum(jun_net) as jun_net
   ,sum(jul_net) as jul_net
   ,sum(aug_net) as aug_net
   ,sum(sep_net) as sep_net
   ,sum(oct_net) as oct_net
   ,sum(nov_net) as nov_net
   ,sum(dec_net) as dec_net
  from (
 select 
   w_warehouse_name
   ,w_warehouse_sq_ft
   ,w_city
   ,w_county
   ,w_state
   ,w_country
   ,concat('DIAMOND', ',', 'AIRBORNE') as ship_carriers
 ,d_year as year
   ,sum(case when d_moy = 1 
   then ws_sales_price* ws_quantity else 0 end) as jan_sales
   ,sum(case when d_moy = 2 
   then ws_sales_price* ws_quantity else 0 end) as feb_sales
   ,sum(case when d_moy = 3 
   then ws_sales_price* ws_quantity else 0 end) as mar_sales
   ,sum(case when d_moy = 4 
   then ws_sales_price* ws_quantity else 0 end) as apr_sales
   ,sum(case when d_moy = 5 
   then ws_sales_price* ws_quantity else 0 end) as may_sales
   ,sum(case when d_moy = 6 
   then ws_sales_price* ws_quantity else 0 end) as 

[jira] [Created] (HIVE-9695) Redundant filter operator in reducer Vertex when CBO is disabled

2015-02-15 Thread Mostafa Mokhtar (JIRA)
Mostafa Mokhtar created HIVE-9695:
-

 Summary: Redundant filter operator in reducer Vertex when CBO is 
disabled
 Key: HIVE-9695
 URL: https://issues.apache.org/jira/browse/HIVE-9695
 Project: Hive
  Issue Type: Bug
  Components: Physical Optimizer
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Gunther Hagleitner
 Fix For: 1.2.0


There is a redundant filter operator in reducer Vertex when CBO is disabled.

Query 
{code}
select 
ss_item_sk, ss_ticket_number, ss_store_sk
from
store_sales a, store_returns b, store
where
a.ss_item_sk = b.sr_item_sk
and a.ss_ticket_number = b.sr_ticket_number 
and ss_sold_date_sk between 2450816 and 2451500
and sr_returned_date_sk between 2450816 and 2451500
and s_store_sk = ss_store_sk;
{code}
Plan snippet 
{code}
  Statistics: Num rows: 57439344 Data size: 1838059008 Basic stats: COMPLETE 
Column stats: COMPLETE
  Filter Operator
predicate: ((((_col1 = _col27) and (_col8 = _col34)) and 
_col22 BETWEEN 2450816 AND 2451500) and _col45 BETWEEN 2450816 AND 2451500) and 
(_col49 = _col6)) (type: boolean)
{code}

Full plan with CBO disabled
{code}
STAGE DEPENDENCIES:
  Stage-1 is a root stage
  Stage-0 depends on stages: Stage-1

STAGE PLANS:
  Stage: Stage-1
Tez
  Edges:
Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 3 (BROADCAST_EDGE), Map 4 (SIMPLE_EDGE)
  DagName: mmokhtar_20150214182626_ad6820c7-b667-4652-ab25-cb60deed1a6d:13
  Vertices:
Map 1
Map Operator Tree:
TableScan
  alias: b
  filterExpr: ((sr_item_sk is not null and sr_ticket_number is 
not null) and sr_returned_date_sk BETWEEN 2450816 AND 2451500) (type: boolean)
  Statistics: Num rows: 2370038095 Data size: 170506118656 
Basic stats: COMPLETE Column stats: COMPLETE
  Filter Operator
predicate: (sr_item_sk is not null and sr_ticket_number is 
not null) (type: boolean)
Statistics: Num rows: 706893063 Data size: 6498502768 Basic 
stats: COMPLETE Column stats: COMPLETE
Reduce Output Operator
  key expressions: sr_item_sk (type: int), sr_ticket_number 
(type: int)
  sort order: ++
  Map-reduce partition columns: sr_item_sk (type: int), 
sr_ticket_number (type: int)
  Statistics: Num rows: 706893063 Data size: 6498502768 
Basic stats: COMPLETE Column stats: COMPLETE
  value expressions: sr_returned_date_sk (type: int)
Execution mode: vectorized
Map 3
Map Operator Tree:
TableScan
  alias: store
  filterExpr: s_store_sk is not null (type: boolean)
  Statistics: Num rows: 1704 Data size: 3256276 Basic stats: 
COMPLETE Column stats: COMPLETE
  Filter Operator
predicate: s_store_sk is not null (type: boolean)
Statistics: Num rows: 1704 Data size: 6816 Basic stats: 
COMPLETE Column stats: COMPLETE
Reduce Output Operator
  key expressions: s_store_sk (type: int)
  sort order: +
  Map-reduce partition columns: s_store_sk (type: int)
  Statistics: Num rows: 1704 Data size: 6816 Basic stats: 
COMPLETE Column stats: COMPLETE
Execution mode: vectorized
Map 4
Map Operator Tree:
TableScan
  alias: a
  filterExpr: (((ss_item_sk is not null and ss_ticket_number is 
not null) and ss_store_sk is not null) and ss_sold_date_sk BETWEEN 2450816 AND 
2451500) (type: boolean)
  Statistics: Num rows: 28878719387 Data size: 2405805439460 
Basic stats: COMPLETE Column stats: COMPLETE
  Filter Operator
predicate: ((ss_item_sk is not null and ss_ticket_number is 
not null) and ss_store_sk is not null) (type: boolean)
Statistics: Num rows: 8405840828 Data size: 110101408700 
Basic stats: COMPLETE Column stats: COMPLETE
Reduce Output Operator
  key expressions: ss_item_sk (type: int), ss_ticket_number 
(type: int)
  sort order: ++
  Map-reduce partition columns: ss_item_sk (type: int), 
ss_ticket_number (type: int)
  Statistics: Num rows: 8405840828 Data size: 110101408700 
Basic stats: COMPLETE Column stats: COMPLETE
  value expressions: ss_store_sk (type: int), 
ss_sold_date_sk (type: int)
Execution mode: vectorized
Reducer 2
Reduce 

[jira] [Commented] (HIVE-9495) Map Side aggregation affecting map performance

2015-02-12 Thread Mostafa Mokhtar (JIRA)

[ 
https://issues.apache.org/jira/browse/HIVE-9495?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanelfocusedCommentId=14319633#comment-14319633
 ] 

Mostafa Mokhtar commented on HIVE-9495:
---

[~navis]

I believe this is why this operation is slow 
{code}
hashAggregations = new HashMap<KeyWrapper, AggregationBuffer[]>(256);
{code}

We should be using the estimated row count to correctly size this HashMap to 
avoid excessive chaining and resizing, yet if we over-estimate this can 
cause OOM.
Inserting a million rows into a hash map with an initial size of 256 is likely 
to result in bad performance.

Something like this from HashTableLoader.load
{code}
Map<Integer, Long> parentKeyCounts = desc.getParentKeyCounts();
Long keyCountObj = parentKeyCounts.get(pos);
long keyCount = (keyCountObj == null) ? -1 : keyCountObj.longValue();
{code}


Ideally the decision to enable map-side aggregation should be driven by CE 
(cardinality estimate) and NDV.
Based on these two we can estimate how much reduction we get from the map-side 
aggregation; in other words, if NDV = CE then skip map-side aggregation.

For TPC-H each l_orderkey is repeated ~4 times, so we are better off skipping 
the map-side aggregation (local agg).
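
A sketch of that heuristic with hypothetical names; the reduction threshold is an assumption, not an existing Hive setting:

{code}
// Sketch only: decide whether map-side aggregation is worthwhile from the
// cardinality estimate (CE) of the input and the NDV of the grouping keys.
// If almost every row is a distinct key, the map-side hash aggregation cannot
// reduce the data and only adds hashing overhead.
public final class MapAggDecision {
  public static boolean useMapSideAggregation(long inputRows, long groupingKeyNdv,
                                              double minReductionRatio) {
    if (inputRows <= 0 || groupingKeyNdv <= 0) {
      return true;  // no statistics: keep the current default behaviour
    }
    // expected output rows per input row; 1.0 means no reduction at all
    double ratio = (double) groupingKeyNdv / (double) inputRows;
    return ratio <= minReductionRatio;
  }

  public static void main(String[] args) {
    // NDV close to the row count: no reduction expected, skip map-side aggregation
    System.out.println(useMapSideAggregation(1_000_000L, 990_000L, 0.5)); // false
    // NDV much smaller than the row count: good reduction, keep map-side aggregation
    System.out.println(useMapSideAggregation(1_000_000L, 10_000L, 0.5));  // true
  }
}
{code}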

 Map Side aggregation affecting map performance
 --

 Key: HIVE-9495
 URL: https://issues.apache.org/jira/browse/HIVE-9495
 Project: Hive
  Issue Type: Bug
  Components: Query Processor
Affects Versions: 0.14.0
 Environment: RHEL 6.4
 Hortonworks Hadoop 2.2
Reporter: Anand Sridharan
 Attachments: HIVE-9495.1.patch.txt, profiler_screenshot.PNG


 When trying to run a simple aggregation query with hive.map.aggr=true, map 
 tasks take a lot of time in Hive 0.14, compared to running with hive.map.aggr=false.
 e.g.
 Consider the query:
 {code}
 INSERT OVERWRITE TABLE lineitem_tgt_agg
 select alias.a0 as a0,
  alias.a2 as a1,
  alias.a1 as a2,
  alias.a3 as a3,
  alias.a4 as a4
 from (
  select alias.a0 as a0,
   SUM(alias.a1) as a1,
   SUM(alias.a2) as a2,
   SUM(alias.a3) as a3,
   SUM(alias.a4) as a4
  from (
   select lineitem_sf500.l_orderkey as a0,
CAST(lineitem_sf500.l_quantity * lineitem_sf500.l_extendedprice * (1 - 
 lineitem_sf500.l_discount) * (1 + lineitem_sf500.l_tax) as double) as a1,
lineitem_sf500.l_quantity as a2,
CAST(lineitem_sf500.l_quantity * lineitem_sf500.l_extendedprice * 
 lineitem_sf500.l_discount as double) as a3,
CAST(lineitem_sf500.l_quantity * lineitem_sf500.l_extendedprice * 
 lineitem_sf500.l_tax as double) as a4
   from lineitem_sf500
   ) alias
  group by alias.a0
  ) alias;
 {code}
 The above query was run with ~376GB of data / ~3billion records in the source.
 It takes ~10 minutes with hive.map.aggr=false.
 With map side aggregation set to true, the map tasks don't complete even 
 after an hour.





[jira] [Created] (HIVE-9647) Discrepancy in CBO between partitioned and un-partitioned tables

2015-02-10 Thread Mostafa Mokhtar (JIRA)
Mostafa Mokhtar created HIVE-9647:
-

 Summary: Discrepancy in CBO between partitioned and un-partitioned 
tables 
 Key: HIVE-9647
 URL: https://issues.apache.org/jira/browse/HIVE-9647
 Project: Hive
  Issue Type: Bug
  Components: CBO
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Gunther Hagleitner
 Fix For: 1.2.0


High-level summary
HiveRelMdSelectivity.computeInnerJoinSelectivity relies on per column number of 
distinct value to estimate join selectivity.
The way statistics are aggregated for partitioned tables results in discrepancy 
in number of distinct values which results in different plans between 
partitioned and un-partitioned schemas.

The table below summarizes the NDVs in computeInnerJoinSelectivity which are 
used to estimate selectivity of joins.
{code}
Column  Partitioned count distincts Un-Partitioned count distincts 
sr_customer_sk  71,245  1,415,625
sr_item_sk  38,846  62,562
sr_ticket_number71,245  34,931,085
ss_customer_sk  88,476  1,415,625
ss_item_sk  38,846  62,562
ss_ticket_number100,756 56,256,175
{code}

The discrepancy is because NDV calculation for a partitioned table assumes that 
the NDV range is contained within each partition and is calculated as 
"select max(NUM_DISTINCTS) from PART_COL_STATS".
This is problematic for columns like ticket number, which naturally increase 
with the partitioning date column ss_sold_date_sk.
Suggestions
Use HyperLogLog as suggested by Gopal; there is an HLL implementation for 
HBase co-processors which we can use as a reference here.
Using the global stats from TAB_COL_STATS and the per-partition stats from 
PART_COL_STATS, extrapolate the NDV for the qualified partitions as in:
Max ( (NUM_DISTINCTS from TAB_COL_STATS) x (Number of qualified partitions) / 
(Number of Partitions), max(NUM_DISTINCTS) from PART_COL_STATS))
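
As a quick illustration of the second suggestion, a small sketch using hypothetical method and parameter names (the partition counts in the example are made up):

{code}
// Sketch only: combine the table-level NDV from TAB_COL_STATS with the
// per-partition maximum from PART_COL_STATS, scaled by the fraction of
// partitions that actually qualify for the query.
public final class NdvExtrapolation {
  public static long extrapolateNdv(long tableNdv, long maxPartitionNdv,
                                    long qualifiedPartitions, long totalPartitions) {
    if (totalPartitions <= 0) {
      return Math.max(tableNdv, maxPartitionNdv);
    }
    // global NDV scaled down to the qualified fraction of partitions
    long scaled = (long) Math.ceil(
        (double) tableNdv * qualifiedPartitions / totalPartitions);
    // never report less than the largest single-partition NDV
    return Math.max(scaled, maxPartitionNdv);
  }

  public static void main(String[] args) {
    // ss_ticket_number-like case: global NDV 56,256,175, per-partition max
    // 100,756, with 200 of 1,824 partitions qualifying (counts made up)
    System.out.println(extrapolateNdv(56_256_175L, 100_756L, 200L, 1_824L));
  }
}
{code}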
More details
While doing TPC-DS Partitioned vs. Un-Partitioned runs I noticed that many of 
the plans are different; I then dumped the CBO logical plan and found that the 
join estimates are drastically different.

Unpartitioned schema :
{code}
2015-02-10 11:33:27,624 DEBUG [main]: parse.SemanticAnalyzer 
(SemanticAnalyzer.java:apply(12624)) - Plan After Join Reordering:
HiveProjectRel(store_sales_quantitycount=[$0], store_sales_quantityave=[$1], 
store_sales_quantitystdev=[$2], store_sales_quantitycov=[/($2, $1)], 
as_store_returns_quantitycount=[$3], as_store_returns_quantityave=[$4], 
as_store_returns_quantitystdev=[$5], store_returns_quantitycov=[/($5, $4)]): 
rowcount = 1.0, cumulative cost = {6.056835407771381E8 rows, 0.0 cpu, 0.0 io}, 
id = 2956
  HiveAggregateRel(group=[{}], agg#0=[count($0)], agg#1=[avg($0)], 
agg#2=[stddev_samp($0)], agg#3=[count($1)], agg#4=[avg($1)], 
agg#5=[stddev_samp($1)]): rowcount = 1.0, cumulative cost = 
{6.056835407771381E8 rows, 0.0 cpu, 0.0 io}, id = 2954
HiveProjectRel($f0=[$4], $f1=[$8]): rowcount = 40.05611776795562, 
cumulative cost = {6.056835407771381E8 rows, 0.0 cpu, 0.0 io}, id = 2952
  HiveProjectRel(ss_sold_date_sk=[$0], ss_item_sk=[$1], 
ss_customer_sk=[$2], ss_ticket_number=[$3], ss_quantity=[$4], sr_item_sk=[$5], 
sr_customer_sk=[$6], sr_ticket_number=[$7], sr_return_quantity=[$8], 
d_date_sk=[$9], d_quarter_name=[$10]): rowcount = 40.05611776795562, cumulative 
cost = {6.056835407771381E8 rows, 0.0 cpu, 0.0 io}, id = 2982
HiveJoinRel(condition=[=($9, $0)], joinType=[inner]): rowcount = 
40.05611776795562, cumulative cost = {6.056835407771381E8 rows, 0.0 cpu, 0.0 
io}, id = 2980
  HiveJoinRel(condition=[AND(AND(=($2, $6), =($1, $5)), =($3, $7))], 
joinType=[inner]): rowcount = 28880.460910696, cumulative cost = {6.05654559E8 
rows, 0.0 cpu, 0.0 io}, id = 2964
HiveProjectRel(ss_sold_date_sk=[$0], ss_item_sk=[$2], 
ss_customer_sk=[$3], ss_ticket_number=[$9], ss_quantity=[$10]): rowcount = 
5.50076554E8, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 2920
  HiveTableScanRel(table=[[tpcds_bin_orc_200.store_sales]]): 
rowcount = 5.50076554E8, cumulative cost = {0}, id = 2822
HiveProjectRel(sr_item_sk=[$2], sr_customer_sk=[$3], 
sr_ticket_number=[$9], sr_return_quantity=[$10]): rowcount = 5.5578005E7, 
cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 2923
  HiveTableScanRel(table=[[tpcds_bin_orc_200.store_returns]]): 
rowcount = 5.5578005E7, cumulative cost = {0}, id = 2823
  HiveProjectRel(d_date_sk=[$0], d_quarter_name=[$15]): rowcount = 
101.31622746185853, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 2948
HiveFilterRel(condition=[=($15, '2000Q1')]): rowcount = 
101.31622746185853, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 2946
  HiveTableScanRel(table=[[tpcds_bin_orc_200.date_dim]]): rowcount 
= 73049.0, cumulative cost = {0}, id = 2821
{code}


[jira] [Updated] (HIVE-9647) Discrepancy in CBO between partitioned and un-partitioned tables

2015-02-10 Thread Mostafa Mokhtar (JIRA)

 [ 
https://issues.apache.org/jira/browse/HIVE-9647?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Mostafa Mokhtar updated HIVE-9647:
--
Description: 
High-level summary
HiveRelMdSelectivity.computeInnerJoinSelectivity relies on per column number of 
distinct value to estimate join selectivity.
The way statistics are aggregated for partitioned tables results in discrepancy 
in number of distinct values which results in different plans between 
partitioned and un-partitioned schemas.

The table below summarizes the NDVs in computeInnerJoinSelectivity which are 
used to estimate selectivity of joins.

||Column||Partitioned count distincts||Un-Partitioned count distincts||
|sr_customer_sk|71,245|1,415,625|
|sr_item_sk|38,846|62,562|
|sr_ticket_number|71,245|34,931,085|
|ss_customer_sk|88,476|1,415,625|
|ss_item_sk|38,846|62,562|
|ss_ticket_number|100,756|56,256,175|


The discrepancy is because NDV calculation for a partitioned table assumes that 
the NDV range is contained within each partition and is calculated as 
"select max(NUM_DISTINCTS) from PART_COL_STATS".
This is problematic for columns like ticket number, which naturally increase 
with the partitioning date column ss_sold_date_sk.
Suggestions
Use HyperLogLog as suggested by Gopal; there is an HLL implementation for 
HBase co-processors which we can use as a reference here.
Using the global stats from TAB_COL_STATS and the per-partition stats from 
PART_COL_STATS, extrapolate the NDV for the qualified partitions as in:
Max ( (NUM_DISTINCTS from TAB_COL_STATS) x (Number of qualified partitions) / 
(Number of Partitions), max(NUM_DISTINCTS) from PART_COL_STATS))
More details
While doing TPC-DS Partitioned vs. Un-Partitioned runs I noticed that many of 
the plans are different; I then dumped the CBO logical plan and found that the 
join estimates are drastically different.

Unpartitioned schema :
{code}
2015-02-10 11:33:27,624 DEBUG [main]: parse.SemanticAnalyzer 
(SemanticAnalyzer.java:apply(12624)) - Plan After Join Reordering:
HiveProjectRel(store_sales_quantitycount=[$0], store_sales_quantityave=[$1], 
store_sales_quantitystdev=[$2], store_sales_quantitycov=[/($2, $1)], 
as_store_returns_quantitycount=[$3], as_store_returns_quantityave=[$4], 
as_store_returns_quantitystdev=[$5], store_returns_quantitycov=[/($5, $4)]): 
rowcount = 1.0, cumulative cost = {6.056835407771381E8 rows, 0.0 cpu, 0.0 io}, 
id = 2956
  HiveAggregateRel(group=[{}], agg#0=[count($0)], agg#1=[avg($0)], 
agg#2=[stddev_samp($0)], agg#3=[count($1)], agg#4=[avg($1)], 
agg#5=[stddev_samp($1)]): rowcount = 1.0, cumulative cost = 
{6.056835407771381E8 rows, 0.0 cpu, 0.0 io}, id = 2954
HiveProjectRel($f0=[$4], $f1=[$8]): rowcount = 40.05611776795562, 
cumulative cost = {6.056835407771381E8 rows, 0.0 cpu, 0.0 io}, id = 2952
  HiveProjectRel(ss_sold_date_sk=[$0], ss_item_sk=[$1], 
ss_customer_sk=[$2], ss_ticket_number=[$3], ss_quantity=[$4], sr_item_sk=[$5], 
sr_customer_sk=[$6], sr_ticket_number=[$7], sr_return_quantity=[$8], 
d_date_sk=[$9], d_quarter_name=[$10]): rowcount = 40.05611776795562, cumulative 
cost = {6.056835407771381E8 rows, 0.0 cpu, 0.0 io}, id = 2982
HiveJoinRel(condition=[=($9, $0)], joinType=[inner]): rowcount = 
40.05611776795562, cumulative cost = {6.056835407771381E8 rows, 0.0 cpu, 0.0 
io}, id = 2980
  HiveJoinRel(condition=[AND(AND(=($2, $6), =($1, $5)), =($3, $7))], 
joinType=[inner]): rowcount = 28880.460910696, cumulative cost = {6.05654559E8 
rows, 0.0 cpu, 0.0 io}, id = 2964
HiveProjectRel(ss_sold_date_sk=[$0], ss_item_sk=[$2], 
ss_customer_sk=[$3], ss_ticket_number=[$9], ss_quantity=[$10]): rowcount = 
5.50076554E8, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 2920
  HiveTableScanRel(table=[[tpcds_bin_orc_200.store_sales]]): 
rowcount = 5.50076554E8, cumulative cost = {0}, id = 2822
HiveProjectRel(sr_item_sk=[$2], sr_customer_sk=[$3], 
sr_ticket_number=[$9], sr_return_quantity=[$10]): rowcount = 5.5578005E7, 
cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 2923
  HiveTableScanRel(table=[[tpcds_bin_orc_200.store_returns]]): 
rowcount = 5.5578005E7, cumulative cost = {0}, id = 2823
  HiveProjectRel(d_date_sk=[$0], d_quarter_name=[$15]): rowcount = 
101.31622746185853, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 2948
HiveFilterRel(condition=[=($15, '2000Q1')]): rowcount = 
101.31622746185853, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 2946
  HiveTableScanRel(table=[[tpcds_bin_orc_200.date_dim]]): rowcount 
= 73049.0, cumulative cost = {0}, id = 2821
{code}

Partitioned schema :
{code}
2015-02-10 11:32:16,880 DEBUG [main]: parse.SemanticAnalyzer 
(SemanticAnalyzer.java:apply(12624)) - Plan After Join Reordering:
HiveProjectRel(store_sales_quantitycount=[$0], store_sales_quantityave=[$1], 

[jira] [Updated] (HIVE-9647) Discrepancy in CBO between partitioned and un-partitioned tables

2015-02-10 Thread Mostafa Mokhtar (JIRA)

 [ 
https://issues.apache.org/jira/browse/HIVE-9647?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Mostafa Mokhtar updated HIVE-9647:
--
Description: 
High-level summary
HiveRelMdSelectivity.computeInnerJoinSelectivity relies on per column number of 
distinct value to estimate join selectivity.
The way statistics are aggregated for partitioned tables results in discrepancy 
in number of distinct values which results in different plans between 
partitioned and un-partitioned schemas.

The table below summarizes the NDVs in computeInnerJoinSelectivity which are 
used to estimate selectivity of joins.

||Column||Partitioned count distincts||Un-Partitioned count distincts||
|sr_customer_sk|71,245|1,415,625|
|sr_item_sk|38,846|62,562|
|sr_ticket_number|71,245|34,931,085|
|ss_customer_sk|88,476|1,415,625|
|ss_item_sk|38,846|62,562|
|ss_ticket_number|100,756|56,256,175|


The discrepancy is because NDV calculation for a partitioned table assumes that 
the NDV range is contained within each partition and is calculated as 
"select max(NUM_DISTINCTS) from PART_COL_STATS".
This is problematic for columns like ticket number, which naturally increase 
with the partitioning date column ss_sold_date_sk.
Suggestions
Use HyperLogLog as suggested by Gopal; there is an HLL implementation for 
HBase co-processors which we can use as a reference here.
Using the global stats from TAB_COL_STATS and the per-partition stats from 
PART_COL_STATS, extrapolate the NDV for the qualified partitions as in:
Max ( (NUM_DISTINCTS from TAB_COL_STATS) x (Number of qualified partitions) / 
(Number of Partitions), max(NUM_DISTINCTS) from PART_COL_STATS))
More details
While doing TPC-DS Partitioned vs. Un-Partitioned runs I noticed that many of 
the plans are different; I then dumped the CBO logical plan and found that the 
join estimates are drastically different.

Unpartitioned schema :
{code}
2015-02-10 11:33:27,624 DEBUG [main]: parse.SemanticAnalyzer 
(SemanticAnalyzer.java:apply(12624)) - Plan After Join Reordering:
HiveProjectRel(store_sales_quantitycount=[$0], store_sales_quantityave=[$1], 
store_sales_quantitystdev=[$2], store_sales_quantitycov=[/($2, $1)], 
as_store_returns_quantitycount=[$3], as_store_returns_quantityave=[$4], 
as_store_returns_quantitystdev=[$5], store_returns_quantitycov=[/($5, $4)]): 
rowcount = 1.0, cumulative cost = {6.056835407771381E8 rows, 0.0 cpu, 0.0 io}, 
id = 2956
  HiveAggregateRel(group=[{}], agg#0=[count($0)], agg#1=[avg($0)], 
agg#2=[stddev_samp($0)], agg#3=[count($1)], agg#4=[avg($1)], 
agg#5=[stddev_samp($1)]): rowcount = 1.0, cumulative cost = 
{6.056835407771381E8 rows, 0.0 cpu, 0.0 io}, id = 2954
HiveProjectRel($f0=[$4], $f1=[$8]): rowcount = 40.05611776795562, 
cumulative cost = {6.056835407771381E8 rows, 0.0 cpu, 0.0 io}, id = 2952
  HiveProjectRel(ss_sold_date_sk=[$0], ss_item_sk=[$1], 
ss_customer_sk=[$2], ss_ticket_number=[$3], ss_quantity=[$4], sr_item_sk=[$5], 
sr_customer_sk=[$6], sr_ticket_number=[$7], sr_return_quantity=[$8], 
d_date_sk=[$9], d_quarter_name=[$10]): rowcount = 40.05611776795562, cumulative 
cost = {6.056835407771381E8 rows, 0.0 cpu, 0.0 io}, id = 2982
HiveJoinRel(condition=[=($9, $0)], joinType=[inner]): rowcount = 
40.05611776795562, cumulative cost = {6.056835407771381E8 rows, 0.0 cpu, 0.0 
io}, id = 2980
  HiveJoinRel(condition=[AND(AND(=($2, $6), =($1, $5)), =($3, $7))], 
joinType=[inner]): rowcount = 28880.460910696, cumulative cost = {6.05654559E8 
rows, 0.0 cpu, 0.0 io}, id = 2964
HiveProjectRel(ss_sold_date_sk=[$0], ss_item_sk=[$2], 
ss_customer_sk=[$3], ss_ticket_number=[$9], ss_quantity=[$10]): rowcount = 
5.50076554E8, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 2920
  HiveTableScanRel(table=[[tpcds_bin_orc_200.store_sales]]): 
rowcount = 5.50076554E8, cumulative cost = {0}, id = 2822
HiveProjectRel(sr_item_sk=[$2], sr_customer_sk=[$3], 
sr_ticket_number=[$9], sr_return_quantity=[$10]): rowcount = 5.5578005E7, 
cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 2923
  HiveTableScanRel(table=[[tpcds_bin_orc_200.store_returns]]): 
rowcount = 5.5578005E7, cumulative cost = {0}, id = 2823
  HiveProjectRel(d_date_sk=[$0], d_quarter_name=[$15]): rowcount = 
101.31622746185853, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 2948
HiveFilterRel(condition=[=($15, '2000Q1')]): rowcount = 
101.31622746185853, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 2946
  HiveTableScanRel(table=[[tpcds_bin_orc_200.date_dim]]): rowcount 
= 73049.0, cumulative cost = {0}, id = 2821
{code}

Partitioned schema :
{code}
2015-02-10 11:32:16,880 DEBUG [main]: parse.SemanticAnalyzer 
(SemanticAnalyzer.java:apply(12624)) - Plan After Join Reordering:
HiveProjectRel(store_sales_quantitycount=[$0], store_sales_quantityave=[$1], 

[jira] [Updated] (HIVE-9647) Discrepancy in cardinality estimates between partitioned and un-partitioned tables

2015-02-10 Thread Mostafa Mokhtar (JIRA)

 [ 
https://issues.apache.org/jira/browse/HIVE-9647?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Mostafa Mokhtar updated HIVE-9647:
--
Summary: Discrepancy in cardinality estimates between partitioned and 
un-partitioned tables   (was: Discrepancy in CBO between partitioned and 
un-partitioned tables )

 Discrepancy in cardinality estimates between partitioned and un-partitioned 
 tables 
 ---

 Key: HIVE-9647
 URL: https://issues.apache.org/jira/browse/HIVE-9647
 Project: Hive
  Issue Type: Bug
  Components: CBO
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Gunther Hagleitner
 Fix For: 1.2.0


 High-level summary
 HiveRelMdSelectivity.computeInnerJoinSelectivity relies on the per-column number of distinct values (NDV) to estimate join selectivity.
 The way statistics are aggregated for partitioned tables produces a discrepancy in the number of distinct values, which results in different plans between the partitioned and un-partitioned schemas.
 The table below summarizes the NDVs used by computeInnerJoinSelectivity to estimate join selectivity.
 ||Column||Partitioned count distincts||Un-Partitioned count distincts||
 |sr_customer_sk|71,245|1,415,625|
 |sr_item_sk|38,846|62,562|
 |sr_ticket_number|71,245|34,931,085|
 |ss_customer_sk|88,476|1,415,625|
 |ss_item_sk|38,846|62,562|
 |ss_ticket_number|100,756|56,256,175|
   
 The discrepancy arises because the NDV calculation for a partitioned table assumes that the NDV range is contained within each partition, and is therefore calculated as select max(NUM_DISTINCTS) from PART_COL_STATS.
 This is problematic for columns like ticket number, which increase naturally with the partitioning date column ss_sold_date_sk.
 Suggestions
 Use HyperLogLog as suggested by Gopal; there is an HLL implementation for HBase co-processors which we can use as a reference here.
 Alternatively, use the global stats from TAB_COL_STATS and the per-partition stats from PART_COL_STATS to extrapolate the NDV for the qualified partitions as:
 Max((NUM_DISTINCTS from TAB_COL_STATS) x (Number of qualified partitions) / (Number of partitions), max(NUM_DISTINCTS from PART_COL_STATS))
 More details
 While doing TPC-DS partitioned vs. un-partitioned runs I noticed that many of the plans were different; after dumping the CBO logical plans I found that the join estimates are drastically different.
 Unpartitioned schema :
 {code}
 2015-02-10 11:33:27,624 DEBUG [main]: parse.SemanticAnalyzer 
 (SemanticAnalyzer.java:apply(12624)) - Plan After Join Reordering:
 HiveProjectRel(store_sales_quantitycount=[$0], store_sales_quantityave=[$1], 
 store_sales_quantitystdev=[$2], store_sales_quantitycov=[/($2, $1)], 
 as_store_returns_quantitycount=[$3], as_store_returns_quantityave=[$4], 
 as_store_returns_quantitystdev=[$5], store_returns_quantitycov=[/($5, $4)]): 
 rowcount = 1.0, cumulative cost = {6.056835407771381E8 rows, 0.0 cpu, 0.0 
 io}, id = 2956
   HiveAggregateRel(group=[{}], agg#0=[count($0)], agg#1=[avg($0)], 
 agg#2=[stddev_samp($0)], agg#3=[count($1)], agg#4=[avg($1)], 
 agg#5=[stddev_samp($1)]): rowcount = 1.0, cumulative cost = 
 {6.056835407771381E8 rows, 0.0 cpu, 0.0 io}, id = 2954
 HiveProjectRel($f0=[$4], $f1=[$8]): rowcount = 40.05611776795562, 
 cumulative cost = {6.056835407771381E8 rows, 0.0 cpu, 0.0 io}, id = 2952
   HiveProjectRel(ss_sold_date_sk=[$0], ss_item_sk=[$1], 
 ss_customer_sk=[$2], ss_ticket_number=[$3], ss_quantity=[$4], 
 sr_item_sk=[$5], sr_customer_sk=[$6], sr_ticket_number=[$7], 
 sr_return_quantity=[$8], d_date_sk=[$9], d_quarter_name=[$10]): rowcount = 
 40.05611776795562, cumulative cost = {6.056835407771381E8 rows, 0.0 cpu, 0.0 
 io}, id = 2982
 HiveJoinRel(condition=[=($9, $0)], joinType=[inner]): rowcount = 
 40.05611776795562, cumulative cost = {6.056835407771381E8 rows, 0.0 cpu, 0.0 
 io}, id = 2980
   HiveJoinRel(condition=[AND(AND(=($2, $6), =($1, $5)), =($3, $7))], 
 joinType=[inner]): rowcount = 28880.460910696, cumulative cost = 
 {6.05654559E8 rows, 0.0 cpu, 0.0 io}, id = 2964
 HiveProjectRel(ss_sold_date_sk=[$0], ss_item_sk=[$2], 
 ss_customer_sk=[$3], ss_ticket_number=[$9], ss_quantity=[$10]): rowcount = 
 5.50076554E8, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 2920
   HiveTableScanRel(table=[[tpcds_bin_orc_200.store_sales]]): 
 rowcount = 5.50076554E8, cumulative cost = {0}, id = 2822
 HiveProjectRel(sr_item_sk=[$2], sr_customer_sk=[$3], 
 sr_ticket_number=[$9], sr_return_quantity=[$10]): rowcount = 5.5578005E7, 
 cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 2923
   HiveTableScanRel(table=[[tpcds_bin_orc_200.store_returns]]): 
 

[jira] [Created] (HIVE-9624) NullPointerException in MapJoinOperator.processOp(MapJoinOperator.java:253) for TPC-DS Q75 against un-partitioned schema

2015-02-09 Thread Mostafa Mokhtar (JIRA)
Mostafa Mokhtar created HIVE-9624:
-

 Summary: NullPointerException in 
MapJoinOperator.processOp(MapJoinOperator.java:253) for TPC-DS Q75 against 
un-partitioned schema
 Key: HIVE-9624
 URL: https://issues.apache.org/jira/browse/HIVE-9624
 Project: Hive
  Issue Type: Bug
  Components: Query Processor
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Gunther Hagleitner
 Fix For: 1.2.0


Running TPC-DS Q75 against a non-partitioned schema fails with 
{code}
Caused by: org.apache.hadoop.hive.ql.metadata.HiveException: Unexpected 
exception: null
at 
org.apache.hadoop.hive.ql.exec.MapJoinOperator.processOp(MapJoinOperator.java:314)
at org.apache.hadoop.hive.ql.exec.Operator.forward(Operator.java:815)
at 
org.apache.hadoop.hive.ql.exec.CommonJoinOperator.internalForward(CommonJoinOperator.java:638)
at 
org.apache.hadoop.hive.ql.exec.CommonJoinOperator.createForwardJoinObject(CommonJoinOperator.java:433)
at 
org.apache.hadoop.hive.ql.exec.CommonJoinOperator.genObject(CommonJoinOperator.java:525)
at 
org.apache.hadoop.hive.ql.exec.CommonJoinOperator.genObject(CommonJoinOperator.java:522)
at 
org.apache.hadoop.hive.ql.exec.CommonJoinOperator.genJoinObject(CommonJoinOperator.java:451)
at 
org.apache.hadoop.hive.ql.exec.CommonJoinOperator.checkAndGenObject(CommonJoinOperator.java:752)
at 
org.apache.hadoop.hive.ql.exec.CommonMergeJoinOperator.joinObject(CommonMergeJoinOperator.java:248)
at 
org.apache.hadoop.hive.ql.exec.CommonMergeJoinOperator.joinOneGroup(CommonMergeJoinOperator.java:213)
at 
org.apache.hadoop.hive.ql.exec.CommonMergeJoinOperator.processOp(CommonMergeJoinOperator.java:196)
at 
org.apache.hadoop.hive.ql.exec.tez.ReduceRecordSource$GroupIterator.next(ReduceRecordSource.java:328)
... 16 more
Caused by: java.lang.NullPointerException
at 
org.apache.hadoop.hive.ql.exec.MapJoinOperator.processOp(MapJoinOperator.java:253)
... 27 more
{code}

This line maps to hashMapRowGetters = new 
ReusableGetAdaptor[mapJoinTables.length] in the code snippet below
{code}
 alias = (byte) tag;
  if (hashMapRowGetters == null) {
hashMapRowGetters = new ReusableGetAdaptor[mapJoinTables.length];
MapJoinKey refKey = getRefKey(alias);
for (byte pos = 0; pos < order.length; pos++) {
  if (pos != alias) {
hashMapRowGetters[pos] = mapJoinTables[pos].createGetter(refKey);
  }
}
  }
{code}
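Since the reported line is the array allocation itself, mapJoinTables must be null when processOp runs; the standalone snippet below is only an illustration of that failure mode, not Hive code.
{code}
// Minimal reproduction: sizing an array from a null reference throws NullPointerException,
// matching the trace at MapJoinOperator.processOp line 253.
public class NullLengthNpeDemo {
    public static void main(String[] args) {
        Object[] mapJoinTables = null;                                   // what the stack trace implies at runtime
        Object[] hashMapRowGetters = new Object[mapJoinTables.length];   // throws NullPointerException here
        System.out.println(hashMapRowGetters.length);                    // never reached
    }
}
{code}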

Query 
{code}

WITH all_sales AS (
 SELECT d_year
   ,i_brand_id
   ,i_class_id
   ,i_category_id
   ,i_manufact_id
   ,SUM(sales_cnt) AS sales_cnt
   ,SUM(sales_amt) AS sales_amt
 FROM (SELECT d_year
 ,i_brand_id
 ,i_class_id
 ,i_category_id
 ,i_manufact_id
 ,cs_quantity - COALESCE(cr_return_quantity,0) AS sales_cnt
 ,cs_ext_sales_price - COALESCE(cr_return_amount,0.0) AS sales_amt
   FROM catalog_sales JOIN item ON i_item_sk=cs_item_sk
  JOIN date_dim ON d_date_sk=cs_sold_date_sk
  LEFT JOIN catalog_returns ON 
(cs_order_number=cr_order_number 
AND cs_item_sk=cr_item_sk)
   WHERE i_category='Sports'
   UNION ALL
   SELECT d_year
 ,i_brand_id
 ,i_class_id
 ,i_category_id
 ,i_manufact_id
 ,ss_quantity - COALESCE(sr_return_quantity,0) AS sales_cnt
 ,ss_ext_sales_price - COALESCE(sr_return_amt,0.0) AS sales_amt
   FROM store_sales JOIN item ON i_item_sk=ss_item_sk
JOIN date_dim ON d_date_sk=ss_sold_date_sk
LEFT JOIN store_returns ON 
(ss_ticket_number=sr_ticket_number 
AND ss_item_sk=sr_item_sk)
   WHERE i_category='Sports'
   UNION ALL
   SELECT d_year
 ,i_brand_id
 ,i_class_id
 ,i_category_id
 ,i_manufact_id
 ,ws_quantity - COALESCE(wr_return_quantity,0) AS sales_cnt
 ,ws_ext_sales_price - COALESCE(wr_return_amt,0.0) AS sales_amt
   FROM web_sales JOIN item ON i_item_sk=ws_item_sk
  JOIN date_dim ON d_date_sk=ws_sold_date_sk
  LEFT JOIN web_returns ON (ws_order_number=wr_order_number 
AND ws_item_sk=wr_item_sk)
   WHERE i_category='Sports') sales_detail
 GROUP BY d_year, i_brand_id, i_class_id, i_category_id, i_manufact_id)
 SELECT  prev_yr.d_year AS prev_year
  ,curr_yr.d_year AS year
  ,curr_yr.i_brand_id
  ,curr_yr.i_class_id
  ,curr_yr.i_category_id
  

[jira] [Created] (HIVE-9623) NullPointerException in MapJoinOperator.processOp(MapJoinOperator.java:253) for TPC-DS Q75 against un-partitioned schema

2015-02-09 Thread Mostafa Mokhtar (JIRA)
Mostafa Mokhtar created HIVE-9623:
-

 Summary: NullPointerException in 
MapJoinOperator.processOp(MapJoinOperator.java:253) for TPC-DS Q75 against 
un-partitioned schema
 Key: HIVE-9623
 URL: https://issues.apache.org/jira/browse/HIVE-9623
 Project: Hive
  Issue Type: Bug
  Components: Query Processor
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Gunther Hagleitner
 Fix For: 1.2.0


Running TPC-DS Q75 against a non-partitioned schema fails with 
{code}
Caused by: org.apache.hadoop.hive.ql.metadata.HiveException: Unexpected 
exception: null
at 
org.apache.hadoop.hive.ql.exec.MapJoinOperator.processOp(MapJoinOperator.java:314)
at org.apache.hadoop.hive.ql.exec.Operator.forward(Operator.java:815)
at 
org.apache.hadoop.hive.ql.exec.CommonJoinOperator.internalForward(CommonJoinOperator.java:638)
at 
org.apache.hadoop.hive.ql.exec.CommonJoinOperator.createForwardJoinObject(CommonJoinOperator.java:433)
at 
org.apache.hadoop.hive.ql.exec.CommonJoinOperator.genObject(CommonJoinOperator.java:525)
at 
org.apache.hadoop.hive.ql.exec.CommonJoinOperator.genObject(CommonJoinOperator.java:522)
at 
org.apache.hadoop.hive.ql.exec.CommonJoinOperator.genJoinObject(CommonJoinOperator.java:451)
at 
org.apache.hadoop.hive.ql.exec.CommonJoinOperator.checkAndGenObject(CommonJoinOperator.java:752)
at 
org.apache.hadoop.hive.ql.exec.CommonMergeJoinOperator.joinObject(CommonMergeJoinOperator.java:248)
at 
org.apache.hadoop.hive.ql.exec.CommonMergeJoinOperator.joinOneGroup(CommonMergeJoinOperator.java:213)
at 
org.apache.hadoop.hive.ql.exec.CommonMergeJoinOperator.processOp(CommonMergeJoinOperator.java:196)
at 
org.apache.hadoop.hive.ql.exec.tez.ReduceRecordSource$GroupIterator.next(ReduceRecordSource.java:328)
... 16 more
Caused by: java.lang.NullPointerException
at 
org.apache.hadoop.hive.ql.exec.MapJoinOperator.processOp(MapJoinOperator.java:253)
... 27 more
{code}

This line maps to hashMapRowGetters = new 
ReusableGetAdaptor[mapJoinTables.length] in the code snippet below
{code}
 alias = (byte) tag;
  if (hashMapRowGetters == null) {
hashMapRowGetters = new ReusableGetAdaptor[mapJoinTables.length];
MapJoinKey refKey = getRefKey(alias);
for (byte pos = 0; pos < order.length; pos++) {
  if (pos != alias) {
hashMapRowGetters[pos] = mapJoinTables[pos].createGetter(refKey);
  }
}
  }
{code}

Query 
{code}

WITH all_sales AS (
 SELECT d_year
   ,i_brand_id
   ,i_class_id
   ,i_category_id
   ,i_manufact_id
   ,SUM(sales_cnt) AS sales_cnt
   ,SUM(sales_amt) AS sales_amt
 FROM (SELECT d_year
 ,i_brand_id
 ,i_class_id
 ,i_category_id
 ,i_manufact_id
 ,cs_quantity - COALESCE(cr_return_quantity,0) AS sales_cnt
 ,cs_ext_sales_price - COALESCE(cr_return_amount,0.0) AS sales_amt
   FROM catalog_sales JOIN item ON i_item_sk=cs_item_sk
  JOIN date_dim ON d_date_sk=cs_sold_date_sk
  LEFT JOIN catalog_returns ON 
(cs_order_number=cr_order_number 
AND cs_item_sk=cr_item_sk)
   WHERE i_category='Sports'
   UNION ALL
   SELECT d_year
 ,i_brand_id
 ,i_class_id
 ,i_category_id
 ,i_manufact_id
 ,ss_quantity - COALESCE(sr_return_quantity,0) AS sales_cnt
 ,ss_ext_sales_price - COALESCE(sr_return_amt,0.0) AS sales_amt
   FROM store_sales JOIN item ON i_item_sk=ss_item_sk
JOIN date_dim ON d_date_sk=ss_sold_date_sk
LEFT JOIN store_returns ON 
(ss_ticket_number=sr_ticket_number 
AND ss_item_sk=sr_item_sk)
   WHERE i_category='Sports'
   UNION ALL
   SELECT d_year
 ,i_brand_id
 ,i_class_id
 ,i_category_id
 ,i_manufact_id
 ,ws_quantity - COALESCE(wr_return_quantity,0) AS sales_cnt
 ,ws_ext_sales_price - COALESCE(wr_return_amt,0.0) AS sales_amt
   FROM web_sales JOIN item ON i_item_sk=ws_item_sk
  JOIN date_dim ON d_date_sk=ws_sold_date_sk
  LEFT JOIN web_returns ON (ws_order_number=wr_order_number 
AND ws_item_sk=wr_item_sk)
   WHERE i_category='Sports') sales_detail
 GROUP BY d_year, i_brand_id, i_class_id, i_category_id, i_manufact_id)
 SELECT  prev_yr.d_year AS prev_year
  ,curr_yr.d_year AS year
  ,curr_yr.i_brand_id
  ,curr_yr.i_class_id
  ,curr_yr.i_category_id
  

[jira] [Created] (HIVE-9602) Hive on Tez : Launch an application master only when needed

2015-02-06 Thread Mostafa Mokhtar (JIRA)
Mostafa Mokhtar created HIVE-9602:
-

 Summary: Hive on Tez : Launch an application master only when 
needed
 Key: HIVE-9602
 URL: https://issues.apache.org/jira/browse/HIVE-9602
 Project: Hive
  Issue Type: New Feature
  Components: CLI
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Siddharth Seth


When all YARN containers are busy, a new Hive CLI session with Tez set as the execution engine won't start, as it attempts to allocate a container for the AM.

The request is to launch an AM only when needed.

Analysts log into Hive using the CLI, write queries, check the schema, run analyze and explain, etc., and then launch a query; with the current model they are blocked from doing so if the queue is full or if all containers are busy.



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)


[jira] [Created] (HIVE-9604) CBO : Presence of hybrid join condition sets of join order optimizations

2015-02-06 Thread Mostafa Mokhtar (JIRA)
Mostafa Mokhtar created HIVE-9604:
-

 Summary: CBO : Presence of hybrid join condition sets of join 
order optimizations
 Key: HIVE-9604
 URL: https://issues.apache.org/jira/browse/HIVE-9604
 Project: Hive
  Issue Type: New Feature
Reporter: Mostafa Mokhtar
Assignee: Laljo John Pullokkaran


When a query has a join between two tables on a hybrid join condition (conjunction + disjunction), the resulting join order from CBO is suboptimal.
Re-writing the query results in a 9x performance improvement.

This was observed in several TPC-DS queries like Q72 and Q64.

The culprit join conditions are :
{code}
catalog_sales.cs_item_sk = inventory.inv_item_sk
  and inv_quantity_on_hand < cs_quantity
{code}

This is a simplified version of Q72.
{code}
select  count(*) total_cnt
from catalog_sales
join inventory on (catalog_sales.cs_item_sk = inventory.inv_item_sk)
join warehouse on (warehouse.w_warehouse_sk=inventory.inv_warehouse_sk)
join item on (item.i_item_sk = catalog_sales.cs_item_sk)
join customer_demographics on (catalog_sales.cs_bill_cdemo_sk = 
customer_demographics.cd_demo_sk)
join household_demographics on (catalog_sales.cs_bill_hdemo_sk = 
household_demographics.hd_demo_sk)
join date_dim d1 on (catalog_sales.cs_sold_date_sk = d1.d_date_sk)
join date_dim d2 on (inventory.inv_date_sk = d2.d_date_sk)
join date_dim d3 on (catalog_sales.cs_ship_date_sk = d3.d_date_sk)
where d1.d_week_seq = d2.d_week_seq
  and inv_quantity_on_hand < cs_quantity
  and d3.d_date > d1.d_date + 5
  and hd_buy_potential = '1001-5000'
  and d1.d_year = 2001
  and hd_buy_potential = '1001-5000'
  and cd_marital_status = 'M'
  and d1.d_year = 2001;
{code}

If the inventory table is moved down in the join order, a more efficient plan is generated.

Modified query
{code}
select  count(*) total_cnt
from catalog_sales
join item on (item.i_item_sk = catalog_sales.cs_item_sk)
join customer_demographics on (catalog_sales.cs_bill_cdemo_sk = 
customer_demographics.cd_demo_sk)
join household_demographics on (catalog_sales.cs_bill_hdemo_sk = 
household_demographics.hd_demo_sk)
join date_dim d1 on (catalog_sales.cs_sold_date_sk = d1.d_date_sk)
join date_dim d3 on (catalog_sales.cs_ship_date_sk = d3.d_date_sk)
join inventory on (catalog_sales.cs_item_sk = inventory.inv_item_sk)
join warehouse on (warehouse.w_warehouse_sk=inventory.inv_warehouse_sk)
join date_dim d2 on (inventory.inv_date_sk = d2.d_date_sk)
where d1.d_week_seq = d2.d_week_seq
  and inv_quantity_on_hand < cs_quantity
  and d3.d_date > d1.d_date + 5
  and hd_buy_potential = '1001-5000'
  and d1.d_year = 2001
  and hd_buy_potential = '1001-5000'
  and cd_marital_status = 'M'
  and d1.d_year = 2001;
{code}

Plan with the base query; notice how catalog_sales joins with inventory first
{code}
STAGE PLANS:
  Stage: Stage-1
Tez
  Edges:
Map 3 - Map 1 (BROADCAST_EDGE), Map 10 (BROADCAST_EDGE), Map 11 
(BROADCAST_EDGE), Map 12 (BROADCAST_EDGE), Map 13 (BROADCAST_EDGE), Map 2 
(BROADCAST_EDGE), Map 6 (BROADCAST_EDGE), Map 7 (BROADCAST_EDGE), Map 8 
(BROADCAST_EDGE), Map 9 (BROADCAST_EDGE)
Reducer 4 - Map 3 (SIMPLE_EDGE)
Reducer 5 - Reducer 4 (SIMPLE_EDGE)
  DagName: mmokhtar_20141015151414_a08eae06-7250-4833-9e1d-8e58eb69780e:1
  Vertices:
Map 1 
Map Operator Tree:
TableScan
  alias: d1
  filterExpr: (d_date_sk is not null and d_week_seq is not 
null) (type: boolean)
  Statistics: Num rows: 73049 Data size: 81741831 Basic stats: 
COMPLETE Column stats: COMPLETE
  Filter Operator
predicate: (d_date_sk is not null and d_week_seq is not 
null) (type: boolean)
Statistics: Num rows: 73049 Data size: 584392 Basic stats: 
COMPLETE Column stats: COMPLETE
Select Operator
  expressions: d_date_sk (type: int), d_week_seq (type: int)
  outputColumnNames: _col0, _col1
  Statistics: Num rows: 73049 Data size: 584392 Basic 
stats: COMPLETE Column stats: COMPLETE
  Reduce Output Operator
key expressions: _col0 (type: int), _col1 (type: int)
sort order: ++
Map-reduce partition columns: _col0 (type: int), _col1 
(type: int)
Statistics: Num rows: 73049 Data size: 584392 Basic 
stats: COMPLETE Column stats: COMPLETE
  Select Operator
expressions: _col0 (type: int)
outputColumnNames: _col0
Statistics: Num rows: 73049 Data size: 0 Basic stats: 
PARTIAL Column stats: COMPLETE
Group By Operator
  keys: _col0 (type: int)
  mode: hash
  

[jira] [Updated] (HIVE-9604) CBO : Presence of hybrid join condition sets of join order optimizations

2015-02-06 Thread Mostafa Mokhtar (JIRA)

 [ 
https://issues.apache.org/jira/browse/HIVE-9604?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Mostafa Mokhtar updated HIVE-9604:
--
  Component/s: CBO
Affects Version/s: 0.14.0

 CBO : Presence of hybrid join condition sets of join order optimizations
 

 Key: HIVE-9604
 URL: https://issues.apache.org/jira/browse/HIVE-9604
 Project: Hive
  Issue Type: New Feature
  Components: CBO
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Laljo John Pullokkaran

 When a query has a join between two tables on a hybrid join condition (conjunction + disjunction), the resulting join order from CBO is suboptimal.
 Re-writing the query results in a 9x performance improvement. 
 This was observed in several TPC-DS queries like Q72 and Q64.
 The culprit join conditions are :
 {code}
 catalog_sales.cs_item_sk = inventory.inv_item_sk
   and inv_quantity_on_hand < cs_quantity
 {code}
 This is a simplified version of Q72.
 {code}
 select  count(*) total_cnt
 from catalog_sales
 join inventory on (catalog_sales.cs_item_sk = inventory.inv_item_sk)
 join warehouse on (warehouse.w_warehouse_sk=inventory.inv_warehouse_sk)
 join item on (item.i_item_sk = catalog_sales.cs_item_sk)
 join customer_demographics on (catalog_sales.cs_bill_cdemo_sk = 
 customer_demographics.cd_demo_sk)
 join household_demographics on (catalog_sales.cs_bill_hdemo_sk = 
 household_demographics.hd_demo_sk)
 join date_dim d1 on (catalog_sales.cs_sold_date_sk = d1.d_date_sk)
 join date_dim d2 on (inventory.inv_date_sk = d2.d_date_sk)
 join date_dim d3 on (catalog_sales.cs_ship_date_sk = d3.d_date_sk)
 where d1.d_week_seq = d2.d_week_seq
   and inv_quantity_on_hand < cs_quantity
   and d3.d_date > d1.d_date + 5
   and hd_buy_potential = '1001-5000'
   and d1.d_year = 2001
   and hd_buy_potential = '1001-5000'
   and cd_marital_status = 'M'
   and d1.d_year = 2001;
 {code}
 If the inventory table is moved down in the join order, a more efficient plan is generated.
 Modified query
 {code}
 select  count(*) total_cnt
 from catalog_sales
 join item on (item.i_item_sk = catalog_sales.cs_item_sk)
 join customer_demographics on (catalog_sales.cs_bill_cdemo_sk = 
 customer_demographics.cd_demo_sk)
 join household_demographics on (catalog_sales.cs_bill_hdemo_sk = 
 household_demographics.hd_demo_sk)
 join date_dim d1 on (catalog_sales.cs_sold_date_sk = d1.d_date_sk)
 join date_dim d3 on (catalog_sales.cs_ship_date_sk = d3.d_date_sk)
 join inventory on (catalog_sales.cs_item_sk = inventory.inv_item_sk)
 join warehouse on (warehouse.w_warehouse_sk=inventory.inv_warehouse_sk)
 join date_dim d2 on (inventory.inv_date_sk = d2.d_date_sk)
 where d1.d_week_seq = d2.d_week_seq
   and inv_quantity_on_hand < cs_quantity
   and d3.d_date > d1.d_date + 5
   and hd_buy_potential = '1001-5000'
   and d1.d_year = 2001
   and hd_buy_potential = '1001-5000'
   and cd_marital_status = 'M'
   and d1.d_year = 2001;
 {code}
 Plan with the base query; notice how catalog_sales joins with inventory first
 {code}
 STAGE PLANS:
   Stage: Stage-1
 Tez
   Edges:
 Map 3 - Map 1 (BROADCAST_EDGE), Map 10 (BROADCAST_EDGE), Map 11 
 (BROADCAST_EDGE), Map 12 (BROADCAST_EDGE), Map 13 (BROADCAST_EDGE), Map 2 
 (BROADCAST_EDGE), Map 6 (BROADCAST_EDGE), Map 7 (BROADCAST_EDGE), Map 8 
 (BROADCAST_EDGE), Map 9 (BROADCAST_EDGE)
 Reducer 4 - Map 3 (SIMPLE_EDGE)
 Reducer 5 - Reducer 4 (SIMPLE_EDGE)
   DagName: mmokhtar_20141015151414_a08eae06-7250-4833-9e1d-8e58eb69780e:1
   Vertices:
 Map 1 
 Map Operator Tree:
 TableScan
   alias: d1
   filterExpr: (d_date_sk is not null and d_week_seq is not 
 null) (type: boolean)
   Statistics: Num rows: 73049 Data size: 81741831 Basic 
 stats: COMPLETE Column stats: COMPLETE
   Filter Operator
 predicate: (d_date_sk is not null and d_week_seq is not 
 null) (type: boolean)
 Statistics: Num rows: 73049 Data size: 584392 Basic 
 stats: COMPLETE Column stats: COMPLETE
 Select Operator
   expressions: d_date_sk (type: int), d_week_seq (type: 
 int)
   outputColumnNames: _col0, _col1
   Statistics: Num rows: 73049 Data size: 584392 Basic 
 stats: COMPLETE Column stats: COMPLETE
   Reduce Output Operator
 key expressions: _col0 (type: int), _col1 (type: int)
 sort order: ++
 Map-reduce partition columns: _col0 (type: int), 
 _col1 (type: int)
 Statistics: Num rows: 73049 Data size: 584392 Basic 
 stats: COMPLETE Column stats: COMPLETE
   Select Operator

[jira] [Updated] (HIVE-9392) JoinStatsRule miscalculates join cardinality as incorrect NDV is used due to column names having duplicated fqColumnName

2015-01-15 Thread Mostafa Mokhtar (JIRA)

 [ 
https://issues.apache.org/jira/browse/HIVE-9392?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Mostafa Mokhtar updated HIVE-9392:
--
Summary: JoinStatsRule miscalculates join cardinality as incorrect NDV is 
used due to column names having duplicated fqColumnName  (was: Hive : 
JoinStatsRule overwrites the column statistics in HashMap because column names 
have duplicate fqColumnName)

 JoinStatsRule miscalculates join cardinality as incorrect NDV is used due to 
 column names having duplicated fqColumnName
 

 Key: HIVE-9392
 URL: https://issues.apache.org/jira/browse/HIVE-9392
 Project: Hive
  Issue Type: Bug
  Components: Physical Optimizer
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Prasanth Jayachandran
Priority: Critical
 Fix For: 0.15.0


 In JoinStatsRule.process the join column statistics are stored in the HashMap joinedColStats. The key used, ColStatistics.fqColName, is duplicated between join columns in the same vertex; as a result distinctVals ends up having duplicated values, which negatively affects the join cardinality estimation.
 The duplicate keys are usually named KEY.reducesinkkey0.



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)


[jira] [Created] (HIVE-9392) Hive : JoinStatsRule overwrites the column statistics in HashMap because column names have duplicate fqColumnName

2015-01-15 Thread Mostafa Mokhtar (JIRA)
Mostafa Mokhtar created HIVE-9392:
-

 Summary: Hive : JoinStatsRule overwrites the column statistics in 
HashMap because column names have duplicate fqColumnName
 Key: HIVE-9392
 URL: https://issues.apache.org/jira/browse/HIVE-9392
 Project: Hive
  Issue Type: Bug
  Components: Physical Optimizer
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Prasanth Jayachandran
Priority: Critical
 Fix For: 0.15.0


In JoinStatsRule.process the join column statistics are stored in the HashMap joinedColStats. The key used, ColStatistics.fqColName, is duplicated between join columns in the same vertex; as a result distinctVals ends up having duplicated values, which negatively affects the join cardinality estimation.

The duplicate keys are usually named KEY.reducesinkkey0.
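A minimal sketch of the collision, assuming a stand-in ColStatistics class, illustrative NDV values, and hypothetical side-qualified key names; it demonstrates the problem rather than the actual fix.
{code}
import java.util.HashMap;
import java.util.Map;

public class FqColNameCollisionSketch {
    // Stand-in for org.apache.hadoop.hive.ql.plan.ColStatistics, not the real class.
    static final class ColStatistics {
        final String fqColName;
        final long numDistinct;
        ColStatistics(String fqColName, long numDistinct) {
            this.fqColName = fqColName;
            this.numDistinct = numDistinct;
        }
    }

    public static void main(String[] args) {
        Map<String, ColStatistics> joinedColStats = new HashMap<>();

        // Two join columns feeding the same vertex get the same generated name
        // KEY.reducesinkkey0, so the second put() silently overwrites the first entry.
        joinedColStats.put("KEY.reducesinkkey0", new ColStatistics("KEY.reducesinkkey0", 62_562L));
        joinedColStats.put("KEY.reducesinkkey0", new ColStatistics("KEY.reducesinkkey0", 38_846L));
        System.out.println(joinedColStats.size());                                   // 1 -- one column's stats are lost
        System.out.println(joinedColStats.get("KEY.reducesinkkey0").numDistinct);    // 38846 -- only the last NDV survives

        // Qualifying the key with a hypothetical per-side prefix keeps both entries distinct.
        joinedColStats.clear();
        joinedColStats.put("left:KEY.reducesinkkey0", new ColStatistics("KEY.reducesinkkey0", 62_562L));
        joinedColStats.put("right:KEY.reducesinkkey0", new ColStatistics("KEY.reducesinkkey0", 38_846L));
        System.out.println(joinedColStats.size());                                   // 2 -- both NDVs available
    }
}
{code}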



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)


[jira] [Updated] (HIVE-9368) Physical optimizer : Join order in Explain is different from join order provided by Calcite

2015-01-13 Thread Mostafa Mokhtar (JIRA)

 [ 
https://issues.apache.org/jira/browse/HIVE-9368?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Mostafa Mokhtar updated HIVE-9368:
--
Attachment: explain_fetch_column_stats_on.txt
explain_fetch_column_stats_off.txt

 Physical optimizer : Join order in Explain is different from join order 
 provided by Calcite
 ---

 Key: HIVE-9368
 URL: https://issues.apache.org/jira/browse/HIVE-9368
 Project: Hive
  Issue Type: Bug
  Components: Physical Optimizer
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Vikram Dixit K
 Fix For: 0.15.0

 Attachments: explain_fetch_column_stats_off.txt, 
 explain_fetch_column_stats_on.txt


 Join order in explain is different from that provided by Calcite; this was observed during the Fidelity POC. 
 Logical plan from Calcite :
 {code}
 2015-01-13 18:54:42,892 DEBUG [main]: parse.CalcitePlanner 
 (CalcitePlanner.java:apply(743)) - Plan After Join Reordering:
 HiveProject(scale=[$0], time_key_num=[$1], dataset_code=[$2], 
 cost_center_lvl1_id=[$3], cost_pool_lvl6_id=[$4], lvl5_id=[$5], 
 view_lvl1_id=[$6], from_lvl1_id=[$7], plan_id=[$8], client_id=[$9], 
 lob_id=[$10], product_id=[$11], fprs_lvl5_id=[$12], ssn_id=[$13], 
 account_id=[$14], mtd_balance=[$15]): rowcount = 2.53152774E8, cumulative 
 cost = {3.057177094767754E9 rows, 0.0 cpu, 0.0 io}, id = 636
   HiveAggregate(group=[{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}], 
 agg#0=[SUM($15)]): rowcount = 2.53152774E8, cumulative cost = 
 {3.057177094767754E9 rows, 0.0 cpu, 0.0 io}, id = 634
 HiveProject($f0=[$0], $f1=[$1], $f2=[$2], $f3=[$3], $f4=[$4], $f5=[$24], 
 $f6=[$6], $f7=[$7], $f8=[$8], $f9=[$9], $f10=[$10], $f11=[$11], $f12=[$21], 
 $f13=[$18], $f14=[$19], $f15=[*($13, $20)]): rowcount = 3.401053197411791E11, 
 cumulative cost = {3.057177094767754E9 rows, 0.0 cpu, 0.0 io}, id = 632
   HiveProject(scale=[$7], time_key_num=[$8], dataset_code=[$9], 
 cost_center_lvl1_id=[$10], cost_pool_lvl6_id=[$11], activity_id=[$12], 
 view_lvl1_id=[$13], from_lvl1_id=[$14], plan_id=[$15], client_id=[$16], 
 lob_id=[$17], product_id=[$18], fprs_id=[$19], mtd_balance=[$20], 
 time_key_num0=[$0], activity_id0=[$1], plan_id0=[$2], fprs_id0=[$3], 
 ssn_id=[$4], account_id=[$5], driver_pct=[$6], lvl5_id=[$25], 
 current_ind=[$26], fprs_id1=[$27], lvl5_id0=[$21], rollup_key=[$22], 
 current_ind0=[$23], activity_id1=[$24]): rowcount = 3.401053197411791E11, 
 cumulative cost = {3.057177094767754E9 rows, 0.0 cpu, 0.0 io}, id = 692
 HiveJoin(condition=[AND(AND(AND(=($8, $0), =($15, $2)), =($19, $3)), 
 =($12, $1))], joinType=[inner]): rowcount = 3.401053197411791E11, cumulative 
 cost = {3.057177094767754E9 rows, 0.0 cpu, 0.0 io}, id = 690
   HiveProject(time_key_num=[$0], activity_id=[$1], plan_id=[$2], 
 fprs_id=[$3], ssn_id=[$4], account_id=[$5], driver_pct=[$6]): rowcount = 
 2.926396239E9, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 590
 
 HiveTableScan(table=[[fidelity.fcap_drivers_part_exp_inter_bucket_256]]): 
 rowcount = 2.926396239E9, cumulative cost = {0}, id = 465
   HiveJoin(condition=[=($12, $20)], joinType=[inner]): rowcount = 
 1.0871372980143067E8, cumulative cost = {2.2067125966323376E7 rows, 0.0 cpu, 
 0.0 io}, id = 688
 HiveJoin(condition=[=($5, $17)], joinType=[inner]): rowcount = 
 1.4392118216323378E7, cumulative cost = {6880237.75 rows, 0.0 cpu, 0.0 io}, 
 id = 653
   HiveProject(scale=[$0], time_key_num=[$1], dataset_code=[$2], 
 cost_center_lvl1_id=[$3], cost_pool_lvl6_id=[$4], activity_id=[$5], 
 view_lvl1_id=[$6], from_lvl1_id=[$7], plan_id=[$8], client_id=[$9], 
 lob_id=[$10], product_id=[$11], fprs_id=[$12], mtd_balance=[$14]): rowcount = 
 6870067.0, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 587
 
 HiveTableScan(table=[[fidelity.fcap_agg_prod_exp_nofund_decimal]]): rowcount 
 = 6870067.0, cumulative cost = {0}, id = 464
   HiveProject(lvl5_id=[$36], rollup_key=[$48], current_ind=[$51], 
 activity_id=[$60]): rowcount = 10170.75, cumulative cost = {0.0 rows, 0.0 
 cpu, 0.0 io}, id = 628
 HiveFilter(condition=[AND(=($51, 'Y'), =($48, 'TOTACT'))]): 
 rowcount = 10170.75, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 626
   HiveTableScan(table=[[fidelity.fobi_activity_dim_mv]]): 
 rowcount = 40683.0, cumulative cost = {0}, id = 467
 HiveProject(lvl5_id=[$36], current_ind=[$51], fprs_id=[$58]): 
 rowcount = 794770.0, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 622
   HiveFilter(condition=[=($51, 'Y')]): rowcount = 794770.0, 
 cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 620
 

[jira] [Created] (HIVE-9368) Physical optimizer : Join order in Explain is different from join order provided by Calcite

2015-01-13 Thread Mostafa Mokhtar (JIRA)
Mostafa Mokhtar created HIVE-9368:
-

 Summary: Physical optimizer : Join order in Explain is different 
from join order provided by Calcite
 Key: HIVE-9368
 URL: https://issues.apache.org/jira/browse/HIVE-9368
 Project: Hive
  Issue Type: Bug
  Components: Physical Optimizer
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Vikram Dixit K
 Fix For: 0.15.0


Join order in explain is different from that provided by Calcite; this was observed during the Fidelity POC. 

Logical plan from Calcite :
{code}
2015-01-13 18:54:42,892 DEBUG [main]: parse.CalcitePlanner 
(CalcitePlanner.java:apply(743)) - Plan After Join Reordering:
HiveProject(scale=[$0], time_key_num=[$1], dataset_code=[$2], 
cost_center_lvl1_id=[$3], cost_pool_lvl6_id=[$4], lvl5_id=[$5], 
view_lvl1_id=[$6], from_lvl1_id=[$7], plan_id=[$8], client_id=[$9], 
lob_id=[$10], product_id=[$11], fprs_lvl5_id=[$12], ssn_id=[$13], 
account_id=[$14], mtd_balance=[$15]): rowcount = 2.53152774E8, cumulative cost 
= {3.057177094767754E9 rows, 0.0 cpu, 0.0 io}, id = 636
  HiveAggregate(group=[{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}], 
agg#0=[SUM($15)]): rowcount = 2.53152774E8, cumulative cost = 
{3.057177094767754E9 rows, 0.0 cpu, 0.0 io}, id = 634
HiveProject($f0=[$0], $f1=[$1], $f2=[$2], $f3=[$3], $f4=[$4], $f5=[$24], 
$f6=[$6], $f7=[$7], $f8=[$8], $f9=[$9], $f10=[$10], $f11=[$11], $f12=[$21], 
$f13=[$18], $f14=[$19], $f15=[*($13, $20)]): rowcount = 3.401053197411791E11, 
cumulative cost = {3.057177094767754E9 rows, 0.0 cpu, 0.0 io}, id = 632
  HiveProject(scale=[$7], time_key_num=[$8], dataset_code=[$9], 
cost_center_lvl1_id=[$10], cost_pool_lvl6_id=[$11], activity_id=[$12], 
view_lvl1_id=[$13], from_lvl1_id=[$14], plan_id=[$15], client_id=[$16], 
lob_id=[$17], product_id=[$18], fprs_id=[$19], mtd_balance=[$20], 
time_key_num0=[$0], activity_id0=[$1], plan_id0=[$2], fprs_id0=[$3], 
ssn_id=[$4], account_id=[$5], driver_pct=[$6], lvl5_id=[$25], 
current_ind=[$26], fprs_id1=[$27], lvl5_id0=[$21], rollup_key=[$22], 
current_ind0=[$23], activity_id1=[$24]): rowcount = 3.401053197411791E11, 
cumulative cost = {3.057177094767754E9 rows, 0.0 cpu, 0.0 io}, id = 692
HiveJoin(condition=[AND(AND(AND(=($8, $0), =($15, $2)), =($19, $3)), 
=($12, $1))], joinType=[inner]): rowcount = 3.401053197411791E11, cumulative 
cost = {3.057177094767754E9 rows, 0.0 cpu, 0.0 io}, id = 690
  HiveProject(time_key_num=[$0], activity_id=[$1], plan_id=[$2], 
fprs_id=[$3], ssn_id=[$4], account_id=[$5], driver_pct=[$6]): rowcount = 
2.926396239E9, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 590

HiveTableScan(table=[[fidelity.fcap_drivers_part_exp_inter_bucket_256]]): 
rowcount = 2.926396239E9, cumulative cost = {0}, id = 465
  HiveJoin(condition=[=($12, $20)], joinType=[inner]): rowcount = 
1.0871372980143067E8, cumulative cost = {2.2067125966323376E7 rows, 0.0 cpu, 
0.0 io}, id = 688
HiveJoin(condition=[=($5, $17)], joinType=[inner]): rowcount = 
1.4392118216323378E7, cumulative cost = {6880237.75 rows, 0.0 cpu, 0.0 io}, id 
= 653
  HiveProject(scale=[$0], time_key_num=[$1], dataset_code=[$2], 
cost_center_lvl1_id=[$3], cost_pool_lvl6_id=[$4], activity_id=[$5], 
view_lvl1_id=[$6], from_lvl1_id=[$7], plan_id=[$8], client_id=[$9], 
lob_id=[$10], product_id=[$11], fprs_id=[$12], mtd_balance=[$14]): rowcount = 
6870067.0, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 587

HiveTableScan(table=[[fidelity.fcap_agg_prod_exp_nofund_decimal]]): rowcount = 
6870067.0, cumulative cost = {0}, id = 464
  HiveProject(lvl5_id=[$36], rollup_key=[$48], current_ind=[$51], 
activity_id=[$60]): rowcount = 10170.75, cumulative cost = {0.0 rows, 0.0 cpu, 
0.0 io}, id = 628
HiveFilter(condition=[AND(=($51, 'Y'), =($48, 'TOTACT'))]): 
rowcount = 10170.75, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 626
  HiveTableScan(table=[[fidelity.fobi_activity_dim_mv]]): 
rowcount = 40683.0, cumulative cost = {0}, id = 467
HiveProject(lvl5_id=[$36], current_ind=[$51], fprs_id=[$58]): 
rowcount = 794770.0, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 622
  HiveFilter(condition=[=($51, 'Y')]): rowcount = 794770.0, 
cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 620
HiveTableScan(table=[[fidelity.fobi_fprs_dim_mv_orc]]): 
rowcount = 1589540.0, cumulative cost = {0}, id = 466
{code}

Plan #1 with Fetch column stats on 
{code}
OK
STAGE DEPENDENCIES:
  Stage-1 is a root stage
  Stage-2 depends on stages: Stage-1
  Stage-0 depends on stages: Stage-2
  Stage-3 depends on stages: Stage-0

STAGE PLANS:
  Stage: Stage-1
Tez
  Edges:
Map 4 - Map 5 (BROADCAST_EDGE), Map 6 (BROADCAST_EDGE)
Reducer 2 - Map 1 (SIMPLE_EDGE), Map 4 (SIMPLE_EDGE)

[jira] [Updated] (HIVE-9277) Hybrid Hybrid Grace Hash Join

2015-01-07 Thread Mostafa Mokhtar (JIRA)

 [ 
https://issues.apache.org/jira/browse/HIVE-9277?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Mostafa Mokhtar updated HIVE-9277:
--
Assignee: (was: Mostafa Mokhtar)

 Hybrid Hybrid Grace Hash Join
 -

 Key: HIVE-9277
 URL: https://issues.apache.org/jira/browse/HIVE-9277
 Project: Hive
  Issue Type: New Feature
  Components: Physical Optimizer
Reporter: Wei Zheng
  Labels: join

 We are proposing an enhanced hash join algorithm called “hybrid hybrid grace hash join”. We can benefit from this feature as illustrated below:
 - The query will not fail even if the estimated memory requirement is slightly wrong.
 - Expensive garbage collection overhead can be avoided when the hash table grows.
 - Join execution can use a Map join operator even though the small table doesn't fit in memory, as spilling some data from the build and probe sides will still be cheaper than having to shuffle the large fact table.
 The design is based on Hadoop’s parallel processing capability and the significant amount of memory available.
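A toy sketch of the partitioning idea, under the assumption that only some build-side partitions fit in memory; it omits real spilling and memory accounting and is not Hive's implementation, only the control flow: build rows are hash-partitioned, in-memory partitions are probed immediately, and "spilled" partitions are joined in a second pass.
{code}
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class HybridGraceHashJoinSketch {
    static final int NUM_PARTITIONS = 4;
    static final int IN_MEMORY_PARTITIONS = 2; // pretend only half the partitions fit in memory

    public static void main(String[] args) {
        // Rows are {joinKey, payload}; values are illustrative.
        List<int[]> build = List.of(new int[]{1, 10}, new int[]{2, 20}, new int[]{5, 50}, new int[]{6, 60});
        List<int[]> probe = List.of(new int[]{1, 100}, new int[]{5, 500}, new int[]{6, 600});

        // Pass 1: hash-partition the build side; low-numbered partitions stay in memory, the rest are "spilled".
        List<Map<Integer, Integer>> inMem = new ArrayList<>();
        List<List<int[]>> spilledBuild = new ArrayList<>();
        for (int p = 0; p < NUM_PARTITIONS; p++) { inMem.add(new HashMap<>()); spilledBuild.add(new ArrayList<>()); }
        for (int[] row : build) {
            int p = Math.floorMod(row[0], NUM_PARTITIONS);
            if (p < IN_MEMORY_PARTITIONS) inMem.get(p).put(row[0], row[1]);
            else spilledBuild.get(p).add(row);
        }

        // Probe: rows hitting an in-memory partition join immediately; the rest are deferred.
        List<List<int[]>> spilledProbe = new ArrayList<>();
        for (int p = 0; p < NUM_PARTITIONS; p++) spilledProbe.add(new ArrayList<>());
        for (int[] row : probe) {
            int p = Math.floorMod(row[0], NUM_PARTITIONS);
            if (p < IN_MEMORY_PARTITIONS) {
                Integer v = inMem.get(p).get(row[0]);
                if (v != null) System.out.println("joined in pass 1: key=" + row[0]);
            } else {
                spilledProbe.get(p).add(row);
            }
        }

        // Pass 2: re-load each spilled build partition and join it against its deferred probe rows.
        for (int p = IN_MEMORY_PARTITIONS; p < NUM_PARTITIONS; p++) {
            Map<Integer, Integer> reloaded = new HashMap<>();
            for (int[] row : spilledBuild.get(p)) reloaded.put(row[0], row[1]);
            for (int[] row : spilledProbe.get(p)) {
                if (reloaded.containsKey(row[0])) System.out.println("joined in pass 2: key=" + row[0]);
            }
        }
    }
}
{code}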



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)


[jira] [Created] (HIVE-9068) Hive : With CBO disabled Vectorization in Map join disabled causing 100% increase in elapsed time and CPU (possibly due to redundant filter operator)

2014-12-10 Thread Mostafa Mokhtar (JIRA)
Mostafa Mokhtar created HIVE-9068:
-

 Summary: Hive : With CBO disabled Vectorization in Map join 
disabled causing 100% increase in elapsed time and CPU (possibly due to 
redundant filter operator)
 Key: HIVE-9068
 URL: https://issues.apache.org/jira/browse/HIVE-9068
 Project: Hive
  Issue Type: Bug
  Components: Vectorization
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Matt McCline
 Fix For: 0.14.1


With CBO off there is a redundant filter operator 
{code}
 Filter Operator
  predicate: ((null is null and (_col22 = _col51)) and 
(_col1 = _col26)) (type: boolean)
{code}

Possibly this is why vectorization is getting disabled with CBO off; this operator doesn't exist with CBO on.

Query 
{code}
select 
count(*)
from
(SELECT 
'store' as channel,
'ss_addr_sk' col_name,
d_year,
d_qoy,
i_category,
ss_ext_sales_price ext_sales_price
FROM
store_sales, item, date_dim
WHERE
ss_addr_sk IS NULL
AND store_sales.ss_sold_date_sk = date_dim.d_date_sk
AND store_sales.ss_item_sk = item.i_item_sk) a;
{code}

Explain with CBO OFF
{code}
STAGE DEPENDENCIES:
  Stage-1 is a root stage
  Stage-0 depends on stages: Stage-1

STAGE PLANS:
  Stage: Stage-1
Tez
  Edges:
Map 1 - Map 3 (BROADCAST_EDGE), Map 4 (BROADCAST_EDGE)
Reducer 2 - Map 1 (SIMPLE_EDGE)
  DagName: mmokhtar_20141210171212_02c36f60-ceea-4e18-a266-5baecfd023f2:6
  Vertices:
Map 1
Map Operator Tree:
TableScan
  alias: store_sales
  filterExpr: (ss_item_sk is not null and ss_addr_sk is null) 
(type: boolean)
  Statistics: Num rows: 82510879939 Data size: 6873789738208 
Basic stats: COMPLETE Column stats: COMPLETE
  Filter Operator
predicate: (ss_item_sk is not null and ss_addr_sk is null) 
(type: boolean)
Statistics: Num rows: 1946839900 Data size: 23178336456 
Basic stats: COMPLETE Column stats: COMPLETE
Map Join Operator
  condition map:
   Inner Join 0 to 1
  condition expressions:
0 {ss_item_sk} {ss_sold_date_sk}
1 {i_item_sk}
  keys:
0 ss_item_sk (type: int)
1 i_item_sk (type: int)
  outputColumnNames: _col1, _col22, _col26
  input vertices:
1 Map 4
  Statistics: Num rows: 1946839936 Data size: 23362079232 
Basic stats: COMPLETE Column stats: COMPLETE
  Map Join Operator
condition map:
 Inner Join 0 to 1
condition expressions:
  0 {_col1} {_col22} {_col26}
  1 {d_date_sk}
keys:
  0 _col22 (type: int)
  1 d_date_sk (type: int)
outputColumnNames: _col1, _col22, _col26, _col51
input vertices:
  1 Map 3
Statistics: Num rows: 2176800197 Data size: 34828803152 
Basic stats: COMPLETE Column stats: COMPLETE
Filter Operator
  predicate: ((null is null and (_col22 = _col51)) and 
(_col1 = _col26)) (type: boolean)
  Statistics: Num rows: 272100024 Data size: 4353600384 
Basic stats: COMPLETE Column stats: COMPLETE
  Select Operator
Statistics: Num rows: 272100024 Data size: 
4353600384 Basic stats: COMPLETE Column stats: COMPLETE
Group By Operator
  aggregations: count()
  mode: hash
  outputColumnNames: _col0
  Statistics: Num rows: 1 Data size: 8 Basic stats: 
COMPLETE Column stats: COMPLETE
  Reduce Output Operator
sort order:
Statistics: Num rows: 1 Data size: 8 Basic 
stats: COMPLETE Column stats: COMPLETE
value expressions: _col0 (type: bigint)
Map 3
Map Operator Tree:
TableScan
  alias: date_dim
  filterExpr: d_date_sk is not null (type: boolean)
  Statistics: Num rows: 73049 Data size: 81741831 Basic stats: 
COMPLETE Column stats: COMPLETE
  Filter Operator
predicate: d_date_sk is not null (type: boolean)
 

[jira] [Commented] (HIVE-7913) Simplify filter predicates for CBO

2014-12-10 Thread Mostafa Mokhtar (JIRA)

[ 
https://issues.apache.org/jira/browse/HIVE-7913?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanelfocusedCommentId=14241866#comment-14241866
 ] 

Mostafa Mokhtar commented on HIVE-7913:
---

[~jpullokkaran]
Looks like this is still an issue; some of the filters can be pushed down to the scan.

{code}
set hive.cbo.enable=true
set hive.stats.fetch.column.stats=true
set hive.exec.dynamic.partition.mode=nonstrict
set hive.tez.auto.reducer.parallelism=true
set hive.auto.convert.join.noconditionaltask.size=32000
set hive.exec.reducers.bytes.per.reducer=1
set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DummyTxnManager
set hive.support.concurrency=false
set hive.tez.exec.print.summary=true
explain  

select  substr(r_reason_desc,1,20) as r
   ,avg(ws_quantity) wq
   ,avg(wr_refunded_cash) ref
   ,avg(wr_fee) fee
 from web_sales, web_returns, web_page, customer_demographics cd1,
  customer_demographics cd2, customer_address, date_dim, reason 
 where web_sales.ws_web_page_sk = web_page.wp_web_page_sk
   and web_sales.ws_item_sk = web_returns.wr_item_sk
   and web_sales.ws_order_number = web_returns.wr_order_number
   and web_sales.ws_sold_date_sk = date_dim.d_date_sk and d_year = 1998
   and cd1.cd_demo_sk = web_returns.wr_refunded_cdemo_sk 
   and cd2.cd_demo_sk = web_returns.wr_returning_cdemo_sk
   and customer_address.ca_address_sk = web_returns.wr_refunded_addr_sk
   and reason.r_reason_sk = web_returns.wr_reason_sk
   and
   (
(
 cd1.cd_marital_status = 'M'
 and
 cd1.cd_marital_status = cd2.cd_marital_status
 and
 cd1.cd_education_status = '4 yr Degree'
 and 
 cd1.cd_education_status = cd2.cd_education_status
 and
 ws_sales_price between 100.00 and 150.00
)
   or
(
 cd1.cd_marital_status = 'D'
 and
 cd1.cd_marital_status = cd2.cd_marital_status
 and
 cd1.cd_education_status = 'Primary' 
 and
 cd1.cd_education_status = cd2.cd_education_status
 and
 ws_sales_price between 50.00 and 100.00
)
   or
(
 cd1.cd_marital_status = 'U'
 and
 cd1.cd_marital_status = cd2.cd_marital_status
 and
 cd1.cd_education_status = 'Advanced Degree'
 and
 cd1.cd_education_status = cd2.cd_education_status
 and
 ws_sales_price between 150.00 and 200.00
)
   )
   and
   (
(
 ca_country = 'United States'
 and
 ca_state in ('KY', 'GA', 'NM')
 and ws_net_profit between 100 and 200  
)
or
(
 ca_country = 'United States'
 and
 ca_state in ('MT', 'OR', 'IN')
 and ws_net_profit between 150 and 300  
)
or
(
 ca_country = 'United States'
 and
 ca_state in ('WI', 'MO', 'WV')
 and ws_net_profit between 50 and 250  
)
   )
group by r_reason_desc
order by r, wq, ref, fee
limit 100
OK
STAGE DEPENDENCIES:
  Stage-1 is a root stage
  Stage-0 depends on stages: Stage-1

STAGE PLANS:
  Stage: Stage-1
Tez
  Edges:
Map 9 - Map 1 (BROADCAST_EDGE)
Reducer 3 - Map 13 (SIMPLE_EDGE), Map 2 (SIMPLE_EDGE)
Reducer 4 - Map 9 (SIMPLE_EDGE), Reducer 3 (SIMPLE_EDGE)
Reducer 5 - Map 14 (SIMPLE_EDGE), Reducer 4 (SIMPLE_EDGE)
Reducer 6 - Map 10 (SIMPLE_EDGE), Map 11 (BROADCAST_EDGE), Map 12 
(BROADCAST_EDGE), Reducer 5 (SIMPLE_EDGE)
Reducer 7 - Reducer 6 (SIMPLE_EDGE)
Reducer 8 - Reducer 7 (SIMPLE_EDGE)
  DagName: mmokhtar_2014161818_f5fd23ba-d783-4b13-8507-7faa65851798:1
  Vertices:
Map 1 
Map Operator Tree:
TableScan
  alias: web_page
  filterExpr: wp_web_page_sk is not null (type: boolean)
  Statistics: Num rows: 4602 Data size: 2696178 Basic stats: 
COMPLETE Column stats: COMPLETE
  Filter Operator
predicate: wp_web_page_sk is not null (type: boolean)
Statistics: Num rows: 4602 Data size: 18408 Basic stats: 
COMPLETE Column stats: COMPLETE
Select Operator
  expressions: wp_web_page_sk (type: int)
  outputColumnNames: _col0
  Statistics: Num rows: 4602 Data size: 18408 Basic stats: 
COMPLETE Column stats: COMPLETE
  Reduce Output Operator
key expressions: _col0 (type: int)
sort order: +
Map-reduce partition columns: _col0 (type: int)
Statistics: Num rows: 4602 Data size: 18408 Basic 
stats: COMPLETE Column stats: COMPLETE
Execution mode: vectorized
Map 10 
Map Operator Tree:
TableScan
  alias: customer_address
  filterExpr: ((ca_country = 'United States') and ca_address_sk 
is not null) (type: boolean)
  Statistics: Num rows: 4000 Data size: 40595195284 Basic 

[jira] [Assigned] (HIVE-7913) Simplify filter predicates for CBO

2014-12-10 Thread Mostafa Mokhtar (JIRA)

 [ 
https://issues.apache.org/jira/browse/HIVE-7913?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Mostafa Mokhtar reassigned HIVE-7913:
-

Assignee: Mostafa Mokhtar  (was: Laljo John Pullokkaran)

 Simplify filter predicates for CBO
 --

 Key: HIVE-7913
 URL: https://issues.apache.org/jira/browse/HIVE-7913
 Project: Hive
  Issue Type: Bug
  Components: CBO
Affects Versions: 0.13.1
Reporter: Mostafa Mokhtar
Assignee: Mostafa Mokhtar
 Fix For: 0.14.0


 Simplify disjunctive predicates so that they can get pushed down to the scan.
 For TPC-DS query 13 we push down predicates of the following form:
 where cd_marital_status in ('M','D','U') etc.
 {code}
 select avg(ss_quantity)
,avg(ss_ext_sales_price)
,avg(ss_ext_wholesale_cost)
,sum(ss_ext_wholesale_cost)
  from store_sales
  ,store
  ,customer_demographics
  ,household_demographics
  ,customer_address
  ,date_dim
  where store.s_store_sk = store_sales.ss_store_sk
  and  store_sales.ss_sold_date_sk = date_dim.d_date_sk and date_dim.d_year = 
 2001
  and((store_sales.ss_hdemo_sk=household_demographics.hd_demo_sk
   and customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk
   and customer_demographics.cd_marital_status = 'M'
   and customer_demographics.cd_education_status = '4 yr Degree'
   and store_sales.ss_sales_price between 100.00 and 150.00
   and household_demographics.hd_dep_count = 3   
  )or
  (store_sales.ss_hdemo_sk=household_demographics.hd_demo_sk
   and customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk
   and customer_demographics.cd_marital_status = 'D'
   and customer_demographics.cd_education_status = 'Primary'
   and store_sales.ss_sales_price between 50.00 and 100.00   
   and household_demographics.hd_dep_count = 1
  ) or 
  (store_sales.ss_hdemo_sk=household_demographics.hd_demo_sk
   and customer_demographics.cd_demo_sk = ss_cdemo_sk
   and customer_demographics.cd_marital_status = 'U'
   and customer_demographics.cd_education_status = 'Advanced Degree'
   and store_sales.ss_sales_price between 150.00 and 200.00 
   and household_demographics.hd_dep_count = 1  
  ))
  and((store_sales.ss_addr_sk = customer_address.ca_address_sk
   and customer_address.ca_country = 'United States'
   and customer_address.ca_state in ('KY', 'GA', 'NM')
   and store_sales.ss_net_profit between 100 and 200  
  ) or
  (store_sales.ss_addr_sk = customer_address.ca_address_sk
   and customer_address.ca_country = 'United States'
   and customer_address.ca_state in ('MT', 'OR', 'IN')
   and store_sales.ss_net_profit between 150 and 300  
  ) or
  (store_sales.ss_addr_sk = customer_address.ca_address_sk
   and customer_address.ca_country = 'United States'
   and customer_address.ca_state in ('WI', 'MO', 'WV')
   and store_sales.ss_net_profit between 50 and 250  
  ))
 ;
 {code}
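A minimal sketch of the intended simplification, modeling only the equality/IN parts of the first disjunctive group in the query above: a column is pushable when every disjunct constrains it, and the pushed predicate is the union of its values. The helper is illustrative, not Hive code.
{code}
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

public class PushdownFromDisjunctionSketch {

    // Each disjunct is modeled as column -> set of allowed values.
    static Map<String, Set<String>> pushablePredicates(List<Map<String, Set<String>>> disjuncts) {
        Map<String, Set<String>> result = new LinkedHashMap<>();
        if (disjuncts.isEmpty()) return result;
        for (String col : disjuncts.get(0).keySet()) {
            Set<String> union = new LinkedHashSet<>();
            boolean inAllDisjuncts = true;
            for (Map<String, Set<String>> d : disjuncts) {
                Set<String> vals = d.get(col);
                if (vals == null) { inAllDisjuncts = false; break; }
                union.addAll(vals);
            }
            // Only predicates implied by every disjunct may be pushed below the OR.
            if (inAllDisjuncts) result.put(col, union);
        }
        return result;
    }

    public static void main(String[] args) {
        List<Map<String, Set<String>>> disjuncts = List.of(
            Map.of("cd_marital_status", Set.of("M"), "cd_education_status", Set.of("4 yr Degree")),
            Map.of("cd_marital_status", Set.of("D"), "cd_education_status", Set.of("Primary")),
            Map.of("cd_marital_status", Set.of("U"), "cd_education_status", Set.of("Advanced Degree")));
        // Prints something like {cd_marital_status=[M, D, U], cd_education_status=[...]},
        // i.e. the IN lists that could be evaluated at the customer_demographics scan.
        System.out.println(pushablePredicates(disjuncts));
    }
}
{code}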
 This is the plan currently generated without any predicate simplification 
 {code}
 STAGE DEPENDENCIES:
   Stage-1 is a root stage
   Stage-0 depends on stages: Stage-1
 STAGE PLANS:
   Stage: Stage-1
 Tez
   Edges:
 Map 7 - Map 8 (BROADCAST_EDGE)
 Map 8 - Map 5 (BROADCAST_EDGE), Map 6 (BROADCAST_EDGE)
 Reducer 2 - Map 1 (SIMPLE_EDGE), Map 4 (BROADCAST_EDGE), Map 7 
 (SIMPLE_EDGE)
 Reducer 3 - Reducer 2 (SIMPLE_EDGE)
   DagName: mmokhtar_20140828155050_7059c24b-501b-4683-86c0-4f3c023f0b0e:1
   Vertices:
 Map 1 
 Map Operator Tree:
 TableScan
   alias: customer_address
   Statistics: Num rows: 4000 Data size: 40595195284 Basic 
 stats: COMPLETE Column stats: NONE
   Select Operator
 expressions: ca_address_sk (type: int), ca_state (type: 
 string), ca_country (type: string)
 outputColumnNames: _col0, _col1, _col2
 Statistics: Num rows: 4000 Data size: 40595195284 
 Basic stats: COMPLETE Column stats: NONE
 Reduce Output Operator
   sort order: 
   Statistics: Num rows: 4000 Data size: 40595195284 
 Basic stats: COMPLETE Column stats: NONE
   value expressions: _col0 (type: int), _col1 (type: 
 string), _col2 (type: string)
 Execution mode: vectorized
 Map 4 
 Map Operator Tree:
 TableScan
   alias: date_dim
   filterExpr: ((d_year = 2001) and d_date_sk is not null) 
 (type: boolean)
   Statistics: Num rows: 73049 Data size: 81741831 Basic 
 stats: COMPLETE Column stats: NONE
   Filter Operator
 predicate: ((d_year = 2001) and d_date_sk is not null) 
 (type: boolean)
  

[jira] [Created] (HIVE-9069) Simplify filter predicates for CBO

2014-12-10 Thread Mostafa Mokhtar (JIRA)
Mostafa Mokhtar created HIVE-9069:
-

 Summary: Simplify filter predicates for CBO
 Key: HIVE-9069
 URL: https://issues.apache.org/jira/browse/HIVE-9069
 Project: Hive
  Issue Type: Bug
  Components: CBO
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Laljo John Pullokkaran
 Fix For: 0.14.1


Simplify disjunctive predicates so that they can get pushed down to the scan.

Looks like this is still an issue; some of the filters can be pushed down to the scan.
{code}
set hive.cbo.enable=true
set hive.stats.fetch.column.stats=true
set hive.exec.dynamic.partition.mode=nonstrict
set hive.tez.auto.reducer.parallelism=true
set hive.auto.convert.join.noconditionaltask.size=32000
set hive.exec.reducers.bytes.per.reducer=1
set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DummyTxnManager
set hive.support.concurrency=false
set hive.tez.exec.print.summary=true
explain  

select  substr(r_reason_desc,1,20) as r
   ,avg(ws_quantity) wq
   ,avg(wr_refunded_cash) ref
   ,avg(wr_fee) fee
 from web_sales, web_returns, web_page, customer_demographics cd1,
  customer_demographics cd2, customer_address, date_dim, reason 
 where web_sales.ws_web_page_sk = web_page.wp_web_page_sk
   and web_sales.ws_item_sk = web_returns.wr_item_sk
   and web_sales.ws_order_number = web_returns.wr_order_number
   and web_sales.ws_sold_date_sk = date_dim.d_date_sk and d_year = 1998
   and cd1.cd_demo_sk = web_returns.wr_refunded_cdemo_sk 
   and cd2.cd_demo_sk = web_returns.wr_returning_cdemo_sk
   and customer_address.ca_address_sk = web_returns.wr_refunded_addr_sk
   and reason.r_reason_sk = web_returns.wr_reason_sk
   and
   (
(
 cd1.cd_marital_status = 'M'
 and
 cd1.cd_marital_status = cd2.cd_marital_status
 and
 cd1.cd_education_status = '4 yr Degree'
 and 
 cd1.cd_education_status = cd2.cd_education_status
 and
 ws_sales_price between 100.00 and 150.00
)
   or
(
 cd1.cd_marital_status = 'D'
 and
 cd1.cd_marital_status = cd2.cd_marital_status
 and
 cd1.cd_education_status = 'Primary' 
 and
 cd1.cd_education_status = cd2.cd_education_status
 and
 ws_sales_price between 50.00 and 100.00
)
   or
(
 cd1.cd_marital_status = 'U'
 and
 cd1.cd_marital_status = cd2.cd_marital_status
 and
 cd1.cd_education_status = 'Advanced Degree'
 and
 cd1.cd_education_status = cd2.cd_education_status
 and
 ws_sales_price between 150.00 and 200.00
)
   )
   and
   (
(
 ca_country = 'United States'
 and
 ca_state in ('KY', 'GA', 'NM')
 and ws_net_profit between 100 and 200  
)
or
(
 ca_country = 'United States'
 and
 ca_state in ('MT', 'OR', 'IN')
 and ws_net_profit between 150 and 300  
)
or
(
 ca_country = 'United States'
 and
 ca_state in ('WI', 'MO', 'WV')
 and ws_net_profit between 50 and 250  
)
   )
group by r_reason_desc
order by r, wq, ref, fee
limit 100
OK
STAGE DEPENDENCIES:
  Stage-1 is a root stage
  Stage-0 depends on stages: Stage-1

STAGE PLANS:
  Stage: Stage-1
Tez
  Edges:
Map 9 <- Map 1 (BROADCAST_EDGE)
Reducer 3 <- Map 13 (SIMPLE_EDGE), Map 2 (SIMPLE_EDGE)
Reducer 4 <- Map 9 (SIMPLE_EDGE), Reducer 3 (SIMPLE_EDGE)
Reducer 5 <- Map 14 (SIMPLE_EDGE), Reducer 4 (SIMPLE_EDGE)
Reducer 6 <- Map 10 (SIMPLE_EDGE), Map 11 (BROADCAST_EDGE), Map 12 (BROADCAST_EDGE), Reducer 5 (SIMPLE_EDGE)
Reducer 7 <- Reducer 6 (SIMPLE_EDGE)
Reducer 8 <- Reducer 7 (SIMPLE_EDGE)
  DagName: mmokhtar_2014161818_f5fd23ba-d783-4b13-8507-7faa65851798:1
  Vertices:
Map 1 
Map Operator Tree:
TableScan
  alias: web_page
  filterExpr: wp_web_page_sk is not null (type: boolean)
  Statistics: Num rows: 4602 Data size: 2696178 Basic stats: 
COMPLETE Column stats: COMPLETE
  Filter Operator
predicate: wp_web_page_sk is not null (type: boolean)
Statistics: Num rows: 4602 Data size: 18408 Basic stats: 
COMPLETE Column stats: COMPLETE
Select Operator
  expressions: wp_web_page_sk (type: int)
  outputColumnNames: _col0
  Statistics: Num rows: 4602 Data size: 18408 Basic stats: 
COMPLETE Column stats: COMPLETE
  Reduce Output Operator
key expressions: _col0 (type: int)
sort order: +
Map-reduce partition columns: _col0 (type: int)
Statistics: Num rows: 4602 Data size: 18408 Basic 
stats: COMPLETE Column stats: COMPLETE
Execution mode: vectorized
Map 10 

[jira] [Commented] (HIVE-8805) CBO skipped due to SemanticException: Line 0:-1 Both left and right aliases encountered in JOIN 'avg_cs_ext_discount_amt'

2014-11-11 Thread Mostafa Mokhtar (JIRA)

[ 
https://issues.apache.org/jira/browse/HIVE-8805?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanelfocusedCommentId=14206201#comment-14206201
 ] 

Mostafa Mokhtar commented on HIVE-8805:
---

[~jpullokkaran] [~hagleitn]
Validation in progress...

 CBO skipped due to SemanticException: Line 0:-1 Both left and right aliases 
 encountered in JOIN 'avg_cs_ext_discount_amt'
 -

 Key: HIVE-8805
 URL: https://issues.apache.org/jira/browse/HIVE-8805
 Project: Hive
  Issue Type: Bug
  Components: CBO
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Laljo John Pullokkaran
 Fix For: 0.14.0

 Attachments: HIVE-8805.patch


 Query
 {code}
 set hive.cbo.enable=true
 set hive.stats.fetch.column.stats=true
 set hive.exec.dynamic.partition.mode=nonstrict
 set hive.tez.auto.reducer.parallelism=true
 set hive.auto.convert.join.noconditionaltask.size=32000
 set hive.exec.reducers.bytes.per.reducer=1
 set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DummyTxnManager
 set hive.support.concurrency=false
 set hive.tez.exec.print.summary=true
 explain  
 SELECT sum(cs1.cs_ext_discount_amt) as excess_discount_amount
 FROM (SELECT cs.cs_item_sk as cs_item_sk,
  cs.cs_ext_discount_amt as cs_ext_discount_amt
  FROM catalog_sales cs
  JOIN date_dim d ON (d.d_date_sk = cs.cs_sold_date_sk)
  WHERE d.d_date between '2000-01-27' and '2000-04-27') cs1
 JOIN item i ON (i.i_item_sk = cs1.cs_item_sk)
 JOIN (SELECT cs2.cs_item_sk as cs_item_sk,
   1.3 * avg(cs_ext_discount_amt) as 
 avg_cs_ext_discount_amt
FROM (SELECT cs.cs_item_sk as cs_item_sk,
 cs.cs_ext_discount_amt as 
 cs_ext_discount_amt
 FROM catalog_sales cs
 JOIN date_dim d ON (d.d_date_sk = cs.cs_sold_date_sk)
 WHERE d.d_date between '2000-01-27' and '2000-04-27') 
 cs2
 GROUP BY cs2.cs_item_sk) tmp1
 ON (i.i_item_sk = tmp1.cs_item_sk)
 WHERE i.i_manufact_id = 436 and
cs1.cs_ext_discount_amt > tmp1.avg_cs_ext_discount_amt
 {code}
 Exception
 {code}
 14/11/07 19:15:38 [main]: ERROR parse.SemanticAnalyzer: CBO failed, skipping 
 CBO. 
 org.apache.hadoop.hive.ql.parse.SemanticException: Line 0:-1 Both left and 
 right aliases encountered in JOIN 'avg_cs_ext_discount_amt'
   at 
 org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.parseJoinCondition(SemanticAnalyzer.java:2369)
   at 
 org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.parseJoinCondition(SemanticAnalyzer.java:2293)
   at 
 org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.parseJoinCondition(SemanticAnalyzer.java:2249)
   at 
 org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genJoinTree(SemanticAnalyzer.java:8010)
   at 
 org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genPlan(SemanticAnalyzer.java:9678)
   at 
 org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genPlan(SemanticAnalyzer.java:9593)
   at 
 org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genPlan(SemanticAnalyzer.java:9619)
   at 
 org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genPlan(SemanticAnalyzer.java:9593)
   at 
 org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genPlan(SemanticAnalyzer.java:9619)
   at 
 org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genPlan(SemanticAnalyzer.java:9606)
   at 
 org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.analyzeInternal(SemanticAnalyzer.java:10053)
   at 
 org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.analyze(BaseSemanticAnalyzer.java:221)
   at 
 org.apache.hadoop.hive.ql.parse.ExplainSemanticAnalyzer.analyzeInternal(ExplainSemanticAnalyzer.java:74)
   at 
 org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.analyze(BaseSemanticAnalyzer.java:221)
   at org.apache.hadoop.hive.ql.Driver.compile(Driver.java:415)
   at org.apache.hadoop.hive.ql.Driver.compile(Driver.java:303)
   at org.apache.hadoop.hive.ql.Driver.compileInternal(Driver.java:1067)
   at org.apache.hadoop.hive.ql.Driver.runInternal(Driver.java:1129)
   at org.apache.hadoop.hive.ql.Driver.run(Driver.java:1004)
   at org.apache.hadoop.hive.ql.Driver.run(Driver.java:994)
   at 
 org.apache.hadoop.hive.cli.CliDriver.processLocalCmd(CliDriver.java:247)
   at org.apache.hadoop.hive.cli.CliDriver.processCmd(CliDriver.java:199)
   at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:410)
   at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:345)
   at 
 org.apache.hadoop.hive.cli.CliDriver.processReader(CliDriver.java:443)
   at 

[jira] [Commented] (HIVE-8805) CBO skipped due to SemanticException: Line 0:-1 Both left and right aliases encountered in JOIN 'avg_cs_ext_discount_amt'

2014-11-11 Thread Mostafa Mokhtar (JIRA)

[ 
https://issues.apache.org/jira/browse/HIVE-8805?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanelfocusedCommentId=14207466#comment-14207466
 ] 

Mostafa Mokhtar commented on HIVE-8805:
---

Ran TPC-DS 30TB and all good.

 CBO skipped due to SemanticException: Line 0:-1 Both left and right aliases 
 encountered in JOIN 'avg_cs_ext_discount_amt'
 -

 Key: HIVE-8805
 URL: https://issues.apache.org/jira/browse/HIVE-8805
 Project: Hive
  Issue Type: Bug
  Components: CBO
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Laljo John Pullokkaran
 Fix For: 0.14.0

 Attachments: HIVE-8805.patch, HIVE-8805.patch


 Query
 {code}
 set hive.cbo.enable=true
 set hive.stats.fetch.column.stats=true
 set hive.exec.dynamic.partition.mode=nonstrict
 set hive.tez.auto.reducer.parallelism=true
 set hive.auto.convert.join.noconditionaltask.size=32000
 set hive.exec.reducers.bytes.per.reducer=1
 set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DummyTxnManager
 set hive.support.concurrency=false
 set hive.tez.exec.print.summary=true
 explain  
 SELECT sum(cs1.cs_ext_discount_amt) as excess_discount_amount
 FROM (SELECT cs.cs_item_sk as cs_item_sk,
  cs.cs_ext_discount_amt as cs_ext_discount_amt
  FROM catalog_sales cs
  JOIN date_dim d ON (d.d_date_sk = cs.cs_sold_date_sk)
  WHERE d.d_date between '2000-01-27' and '2000-04-27') cs1
 JOIN item i ON (i.i_item_sk = cs1.cs_item_sk)
 JOIN (SELECT cs2.cs_item_sk as cs_item_sk,
   1.3 * avg(cs_ext_discount_amt) as 
 avg_cs_ext_discount_amt
FROM (SELECT cs.cs_item_sk as cs_item_sk,
 cs.cs_ext_discount_amt as 
 cs_ext_discount_amt
 FROM catalog_sales cs
 JOIN date_dim d ON (d.d_date_sk = cs.cs_sold_date_sk)
 WHERE d.d_date between '2000-01-27' and '2000-04-27') 
 cs2
 GROUP BY cs2.cs_item_sk) tmp1
 ON (i.i_item_sk = tmp1.cs_item_sk)
 WHERE i.i_manufact_id = 436 and
cs1.cs_ext_discount_amt > tmp1.avg_cs_ext_discount_amt
 {code}
 Exception
 {code}
 14/11/07 19:15:38 [main]: ERROR parse.SemanticAnalyzer: CBO failed, skipping 
 CBO. 
 org.apache.hadoop.hive.ql.parse.SemanticException: Line 0:-1 Both left and 
 right aliases encountered in JOIN 'avg_cs_ext_discount_amt'
   at 
 org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.parseJoinCondition(SemanticAnalyzer.java:2369)
   at 
 org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.parseJoinCondition(SemanticAnalyzer.java:2293)
   at 
 org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.parseJoinCondition(SemanticAnalyzer.java:2249)
   at 
 org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genJoinTree(SemanticAnalyzer.java:8010)
   at 
 org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genPlan(SemanticAnalyzer.java:9678)
   at 
 org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genPlan(SemanticAnalyzer.java:9593)
   at 
 org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genPlan(SemanticAnalyzer.java:9619)
   at 
 org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genPlan(SemanticAnalyzer.java:9593)
   at 
 org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genPlan(SemanticAnalyzer.java:9619)
   at 
 org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genPlan(SemanticAnalyzer.java:9606)
   at 
 org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.analyzeInternal(SemanticAnalyzer.java:10053)
   at 
 org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.analyze(BaseSemanticAnalyzer.java:221)
   at 
 org.apache.hadoop.hive.ql.parse.ExplainSemanticAnalyzer.analyzeInternal(ExplainSemanticAnalyzer.java:74)
   at 
 org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.analyze(BaseSemanticAnalyzer.java:221)
   at org.apache.hadoop.hive.ql.Driver.compile(Driver.java:415)
   at org.apache.hadoop.hive.ql.Driver.compile(Driver.java:303)
   at org.apache.hadoop.hive.ql.Driver.compileInternal(Driver.java:1067)
   at org.apache.hadoop.hive.ql.Driver.runInternal(Driver.java:1129)
   at org.apache.hadoop.hive.ql.Driver.run(Driver.java:1004)
   at org.apache.hadoop.hive.ql.Driver.run(Driver.java:994)
   at 
 org.apache.hadoop.hive.cli.CliDriver.processLocalCmd(CliDriver.java:247)
   at org.apache.hadoop.hive.cli.CliDriver.processCmd(CliDriver.java:199)
   at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:410)
   at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:345)
   at 
 org.apache.hadoop.hive.cli.CliDriver.processReader(CliDriver.java:443)
   at 

[jira] [Created] (HIVE-8805) CBO skipped due to SemanticException: Line 0:-1 Both left and right aliases encountered in JOIN 'avg_cs_ext_discount_amt'

2014-11-10 Thread Mostafa Mokhtar (JIRA)
Mostafa Mokhtar created HIVE-8805:
-

 Summary: CBO skipped due to SemanticException: Line 0:-1 Both left 
and right aliases encountered in JOIN 'avg_cs_ext_discount_amt'
 Key: HIVE-8805
 URL: https://issues.apache.org/jira/browse/HIVE-8805
 Project: Hive
  Issue Type: Bug
  Components: CBO
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Laljo John Pullokkaran
 Fix For: 0.14.0


Query
{code}
set hive.cbo.enable=true
set hive.stats.fetch.column.stats=true
set hive.exec.dynamic.partition.mode=nonstrict
set hive.tez.auto.reducer.parallelism=true
set hive.auto.convert.join.noconditionaltask.size=32000
set hive.exec.reducers.bytes.per.reducer=1
set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DummyTxnManager
set hive.support.concurrency=false
set hive.tez.exec.print.summary=true
explain  
SELECT sum(cs1.cs_ext_discount_amt) as excess_discount_amount
FROM (SELECT cs.cs_item_sk as cs_item_sk,
 cs.cs_ext_discount_amt as cs_ext_discount_amt
 FROM catalog_sales cs
 JOIN date_dim d ON (d.d_date_sk = cs.cs_sold_date_sk)
 WHERE d.d_date between '2000-01-27' and '2000-04-27') cs1
JOIN item i ON (i.i_item_sk = cs1.cs_item_sk)
JOIN (SELECT cs2.cs_item_sk as cs_item_sk,
  1.3 * avg(cs_ext_discount_amt) as 
avg_cs_ext_discount_amt
   FROM (SELECT cs.cs_item_sk as cs_item_sk,
cs.cs_ext_discount_amt as 
cs_ext_discount_amt
FROM catalog_sales cs
JOIN date_dim d ON (d.d_date_sk = cs.cs_sold_date_sk)
WHERE d.d_date between '2000-01-27' and '2000-04-27') 
cs2
GROUP BY cs2.cs_item_sk) tmp1
ON (i.i_item_sk = tmp1.cs_item_sk)
WHERE i.i_manufact_id = 436 and
   cs1.cs_ext_discount_amt > tmp1.avg_cs_ext_discount_amt
{code}

Exception
{code}
14/11/07 19:15:38 [main]: ERROR parse.SemanticAnalyzer: CBO failed, skipping 
CBO. 
org.apache.hadoop.hive.ql.parse.SemanticException: Line 0:-1 Both left and 
right aliases encountered in JOIN 'avg_cs_ext_discount_amt'
at 
org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.parseJoinCondition(SemanticAnalyzer.java:2369)
at 
org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.parseJoinCondition(SemanticAnalyzer.java:2293)
at 
org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.parseJoinCondition(SemanticAnalyzer.java:2249)
at 
org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genJoinTree(SemanticAnalyzer.java:8010)
at 
org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genPlan(SemanticAnalyzer.java:9678)
at 
org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genPlan(SemanticAnalyzer.java:9593)
at 
org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genPlan(SemanticAnalyzer.java:9619)
at 
org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genPlan(SemanticAnalyzer.java:9593)
at 
org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genPlan(SemanticAnalyzer.java:9619)
at 
org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genPlan(SemanticAnalyzer.java:9606)
at 
org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.analyzeInternal(SemanticAnalyzer.java:10053)
at 
org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.analyze(BaseSemanticAnalyzer.java:221)
at 
org.apache.hadoop.hive.ql.parse.ExplainSemanticAnalyzer.analyzeInternal(ExplainSemanticAnalyzer.java:74)
at 
org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.analyze(BaseSemanticAnalyzer.java:221)
at org.apache.hadoop.hive.ql.Driver.compile(Driver.java:415)
at org.apache.hadoop.hive.ql.Driver.compile(Driver.java:303)
at org.apache.hadoop.hive.ql.Driver.compileInternal(Driver.java:1067)
at org.apache.hadoop.hive.ql.Driver.runInternal(Driver.java:1129)
at org.apache.hadoop.hive.ql.Driver.run(Driver.java:1004)
at org.apache.hadoop.hive.ql.Driver.run(Driver.java:994)
at 
org.apache.hadoop.hive.cli.CliDriver.processLocalCmd(CliDriver.java:247)
at org.apache.hadoop.hive.cli.CliDriver.processCmd(CliDriver.java:199)
at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:410)
at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:345)
at 
org.apache.hadoop.hive.cli.CliDriver.processReader(CliDriver.java:443)
at org.apache.hadoop.hive.cli.CliDriver.processFile(CliDriver.java:459)
at 
org.apache.hadoop.hive.cli.CliDriver.executeDriver(CliDriver.java:739)
at org.apache.hadoop.hive.cli.CliDriver.run(CliDriver.java:677)
at org.apache.hadoop.hive.cli.CliDriver.main(CliDriver.java:616)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at 

[jira] [Commented] (HIVE-8556) introduce overflow control and sanity check to BytesBytesMapJoin

2014-11-06 Thread Mostafa Mokhtar (JIRA)

[ 
https://issues.apache.org/jira/browse/HIVE-8556?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanelfocusedCommentId=14201002#comment-14201002
 ] 

Mostafa Mokhtar commented on HIVE-8556:
---

Looks good to me
[~prasanth_j] ?

 introduce overflow control and sanity check to BytesBytesMapJoin
 

 Key: HIVE-8556
 URL: https://issues.apache.org/jira/browse/HIVE-8556
 Project: Hive
  Issue Type: Bug
Reporter: Sergey Shelukhin
Assignee: Sergey Shelukhin
Priority: Minor
 Attachments: HIVE-8556.patch


 When stats are incorrect, a negative or very large number can be passed to the
 map.



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)


[jira] [Created] (HIVE-8765) TPC-DS Q21 : Incorrect join order makes query run slower (Not scaling selectivity by NDV)

2014-11-06 Thread Mostafa Mokhtar (JIRA)
Mostafa Mokhtar created HIVE-8765:
-

 Summary: TPC-DS Q21 : Incorrect join order makes query run slower 
(Not scaling selectivity by NDV) 
 Key: HIVE-8765
 URL: https://issues.apache.org/jira/browse/HIVE-8765
 Project: Hive
  Issue Type: Bug
  Components: CBO
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Laljo John Pullokkaran
 Fix For: 0.15.0


CBO joins with date_dim first instead of item, even though item is the more
selective join.
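
One common textbook estimate for join output size (a sketch, not necessarily the
exact formula Hive uses) is:
{code}
-- rows(A join B on key) ~= rows(A) * rows(B) / max(NDV(A.key), NDV(B.key))
-- If the filtered dimension's reduced row count and key NDV are not both reflected
-- in this estimate, a weakly filtered dimension (date_dim) can look like the cheaper
-- first join even though the item filter prunes far more inventory rows per joined row.
{code}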

Query 
{code}
select  *
 from(select w_warehouse_name
,i_item_id
,sum(case when (cast(d_date as date) < cast ('1998-04-08' as date))
then inv_quantity_on_hand 
  else 0 end) as inv_before
,sum(case when (cast(d_date as date) >= cast ('1998-04-08' as date))
  then inv_quantity_on_hand 
  else 0 end) as inv_after
   from inventory
   ,warehouse
   ,item
   ,date_dim
   where i_current_price between 0.99 and 1.49
 and item.i_item_sk  = inventory.inv_item_sk
 and inventory.inv_warehouse_sk   = warehouse.w_warehouse_sk
 and inventory.inv_date_sk= date_dim.d_date_sk
 and d_date between '1998-03-09' and '1998-05-07'
   group by w_warehouse_name, i_item_id) x
 where (case when inv_before > 0
 then inv_after / inv_before 
 else null
 end) between 2.0/3.0 and 3.0/2.0
 order by w_warehouse_name
 ,i_item_id
 limit 100
{code}

Logical Plan 
{code}
2014-11-06 16:58:32,041 DEBUG [main]: parse.SemanticAnalyzer 
(SemanticAnalyzer.java:apply(12631)) - Plan After Join Reordering:
HiveSortRel(fetch=[100]): rowcount = 1.0, cumulative cost = 
{1.627879384609158E9 rows, 2.0 cpu, 0.0 io}, id = 12521
  HiveSortRel(sort0=[$0], sort1=[$1], dir0=[ASC], dir1=[ASC]): rowcount = 1.0, 
cumulative cost = {1.627879368609158E9 rows, 1.0 cpu, 0.0 io}, id = 12519
HiveProjectRel(w_warehouse_name=[$0], i_item_id=[$1], inv_before=[$2], 
inv_after=[$3]): rowcount = 1.0, cumulative cost = {1.627879352609158E9 rows, 
0.0 cpu, 0.0 io}, id = 12517
  HiveFilterRel(condition=[between(false, when(($2, 0), /(CAST($3):DOUBLE, 
CAST($2):DOUBLE), null), /(2E0, 3E0), /(3E0, 2E0))]): rowcount = 1.0, 
cumulative cost = {1.627879352609158E9 rows, 0.0 cpu, 0.0 io}, id = 12515
HiveAggregateRel(group=[{0, 1}], agg#0=[sum($2)], agg#1=[sum($3)]): 
rowcount = 1.7688372892644288, cumulative cost = {1.627879352609158E9 rows, 0.0 
cpu, 0.0 io}, id = 12513
  HiveProjectRel($f0=[$5], $f1=[$7], $f2=[when((CAST($10):DATE, 
CAST('1998-04-08'):DATE), $2, 0)], $f3=[when(=(CAST($10):DATE, 
CAST('1998-04-08'):DATE), $2, 0)]): rowcount = 1.8477987480495097, cumulative 
cost = {1.627879352609158E9 rows, 0.0 cpu, 0.0 io}, id = 12511
HiveProjectRel(inv_item_sk=[$2], inv_warehouse_sk=[$3], 
inv_quantity_on_hand=[$4], inv_date_sk=[$5], w_warehouse_sk=[$0], 
w_warehouse_name=[$1], i_item_sk=[$8], i_item_id=[$9], i_current_price=[$10], 
d_date_sk=[$6], d_date=[$7]): rowcount = 1.8477987480495097, cumulative cost = 
{1.627879352609158E9 rows, 0.0 cpu, 0.0 io}, id = 12577
  HiveJoinRel(condition=[=($3, $0)], joinType=[inner]): rowcount = 
1.8477987480495097, cumulative cost = {1.627879352609158E9 rows, 0.0 cpu, 0.0 
io}, id = 12575
HiveProjectRel(w_warehouse_sk=[$0], w_warehouse_name=[$2]): 
rowcount = 27.0, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 12463
  
HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_3.warehouse]]): rowcount 
= 27.0, cumulative cost = {0}, id = 12287
HiveJoinRel(condition=[=($6, $0)], joinType=[inner]): rowcount 
= 1.8477987480495097, cumulative cost = {1.6278793237613592E9 rows, 0.0 cpu, 
0.0 io}, id = 12573
  HiveJoinRel(condition=[=($3, $4)], joinType=[inner]): 
rowcount = 22284.45290147709, cumulative cost = {1.627857001E9 rows, 0.0 cpu, 
0.0 io}, id = 12534
HiveProjectRel(inv_item_sk=[$0], inv_warehouse_sk=[$1], 
inv_quantity_on_hand=[$2], inv_date_sk=[$3]): rowcount = 1.627857E9, cumulative 
cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 12460
  
HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_3.inventory]]): rowcount 
= 1.627857E9, cumulative cost = {0}, id = 12284
HiveProjectRel(d_date_sk=[$0], d_date=[$2]): rowcount = 
1.0, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 12507
  HiveFilterRel(condition=[between(false, $2, '1998-03-09', 
'1998-05-07')]): rowcount = 1.0, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, 
id = 12505

HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_3.date_dim]]): rowcount 
= 73049.0, cumulative cost = {0}, id = 12286
  HiveProjectRel(i_item_sk=[$0], i_item_id=[$1], 
i_current_price=[$5]): 

[jira] [Created] (HIVE-8767) CBO : Join on inequality results in in-correct join order

2014-11-06 Thread Mostafa Mokhtar (JIRA)
Mostafa Mokhtar created HIVE-8767:
-

 Summary: CBO : Join on inequality results in in-correct join order
 Key: HIVE-8767
 URL: https://issues.apache.org/jira/browse/HIVE-8767
 Project: Hive
  Issue Type: Bug
  Components: CBO
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Laljo John Pullokkaran
 Fix For: 0.15.0


Queries with hybrid joins (an inner equi-join plus an inequality join) produce an
inefficient join order.
CBO joins the two tables involved in the inequality join first, and only then
considers the remaining joins.
The problem is that the selectivity of the other joins is not taken into
consideration.

Queries that are affected by this are Q64 and Q72 from TPC-DS
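
To make the pattern concrete, a minimal hybrid-join shape looks like the following
(an illustrative query against the TPC-DS schema, not the actual Q72 text):
{code}
select count(*)
from catalog_sales cs
join inventory inv on inv.inv_item_sk = cs.cs_item_sk      -- equi join
join date_dim d1   on d1.d_date_sk = cs.cs_sold_date_sk    -- equi join
join date_dim d2   on d2.d_date_sk = inv.inv_date_sk       -- equi join
where d2.d_date > date_add(d1.d_date, 5);                   -- the inequality predicate
{code}
The complaint above is that the optimizer orders the tables referenced by the
inequality first, before weighing the selectivity of the equi joins.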

Logical plan for Q72
{code}
2014-11-06 14:13:12,169 DEBUG [main]: parse.SemanticAnalyzer 
(SemanticAnalyzer.java:apply(12631)) - Plan After Join Reordering:
HiveSortRel(fetch=[100]): rowcount = 139827.8175849229, cumulative cost = 
{7.195499709572942E13 rows, 279655.6351698458 cpu, 0.0 io}, id = 2037
  HiveSortRel(sort0=[$3], sort1=[$0], sort2=[$1], sort3=[$2], dir0=[DESC], 
dir1=[ASC], dir2=[ASC], dir3=[ASC]): rowcount = 139827.8175849229, cumulative 
cost = {7.195497058847592E13 rows, 139827.8175849229 cpu, 0.0 io}, id = 2035
HiveProjectRel(i_item_desc=[$0], w_warehouse_name=[$1], d_week_seq=[$2], 
total_cnt=[$3]): rowcount = 139827.8175849229, cumulative cost = 
{7.195494408122242E13 rows, 0.0 cpu, 0.0 io}, id = 2033
  HiveAggregateRel(group=[{0, 1, 2}], agg#0=[count()]): rowcount = 
139827.8175849229, cumulative cost = {7.195494408122242E13 rows, 0.0 cpu, 0.0 
io}, id = 2031
HiveProjectRel($f0=[$13], $f1=[$11], $f2=[$20]): rowcount = 
106451.860966184, cumulative cost = {7.195494408122242E13 rows, 0.0 cpu, 0.0 
io}, id = 2029
  HiveFilterRel(condition=[(CAST($25):DOUBLE, +(CAST($19):DOUBLE, 
CAST(5):DOUBLE))]): rowcount = 106451.860966184, cumulative cost = 
{7.195494408122242E13 rows, 0.0 cpu, 0.0 io}, id = 2027
HiveProjectRel(cs_ship_date_sk=[$2], cs_bill_cdemo_sk=[$3], 
cs_bill_hdemo_sk=[$4], cs_item_sk=[$5], cs_quantity=[$6], cs_sold_date_sk=[$7], 
inv_item_sk=[$8], inv_warehouse_sk=[$9], inv_quantity_on_hand=[$10], 
inv_date_sk=[$11], w_warehouse_sk=[$24], w_warehouse_name=[$25], 
i_item_sk=[$0], i_item_desc=[$1], cd_demo_sk=[$20], cd_marital_status=[$21], 
hd_demo_sk=[$18], hd_buy_potential=[$19], d_date_sk=[$12], d_date=[$13], 
d_week_seq=[$14], d_year=[$15], d_date_sk0=[$16], d_week_seq0=[$17], 
d_date_sk1=[$22], d_date0=[$23]): rowcount = 319355.582898552, cumulative cost 
= {7.195494408122242E13 rows, 0.0 cpu, 0.0 io}, id = 2451
  HiveJoinRel(condition=[=($24, $9)], joinType=[inner]): rowcount = 
319355.582898552, cumulative cost = {7.195494408122242E13 rows, 0.0 cpu, 0.0 
io}, id = 2449
HiveJoinRel(condition=[=($2, $22)], joinType=[inner]): rowcount 
= 319355.582898552, cumulative cost = {7.195494376183984E13 rows, 0.0 cpu, 0.0 
io}, id = 2447
  HiveJoinRel(condition=[=($0, $5)], joinType=[inner]): 
rowcount = 319355.582898552, cumulative cost = {7.195494336943527E13 rows, 0.0 
cpu, 0.0 io}, id = 2445
HiveProjectRel(i_item_sk=[$0], i_item_desc=[$4]): rowcount 
= 462000.0, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 1997
  
HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_3.item]]): rowcount = 
462000.0, cumulative cost = {0}, id = 1645
HiveJoinRel(condition=[=($1, $18)], joinType=[inner]): 
rowcount = 319355.582898552, cumulative cost = {7.195494258807969E13 rows, 0.0 
cpu, 0.0 io}, id = 2443
  HiveJoinRel(condition=[=($2, $16)], joinType=[inner]): 
rowcount = 2235489.080289864, cumulative cost = {7.195494007819061E13 rows, 0.0 
cpu, 0.0 io}, id = 2441
HiveJoinRel(condition=[AND(=($9, $14), =($12, $15))], 
joinType=[inner]): rowcount = 5.588722700724658E7, cumulative cost = 
{7.195488419067561E13 rows, 0.0 cpu, 0.0 io}, id = 2439
  HiveJoinRel(condition=[=($5, $10)], 
joinType=[inner]): rowcount = 5.732184228903609E9, cumulative cost = 
{7.19491519333977E13 rows, 0.0 cpu, 0.0 io}, id = 2080
HiveFilterRel(condition=[($8, $4)]): rowcount = 
7.190451896736688E13, cumulative cost = {4.4632966025E10 rows, 0.0 cpu, 0.0 
io}, id = 1991
  HiveProjectRel(cs_ship_date_sk=[$0], 
cs_bill_cdemo_sk=[$1], cs_bill_hdemo_sk=[$2], cs_item_sk=[$3], 
cs_quantity=[$4], cs_sold_date_sk=[$5], inv_item_sk=[$6], 
inv_warehouse_sk=[$7], inv_quantity_on_hand=[$8], inv_date_sk=[$9]): rowcount = 
2.1571355690210062E14, cumulative cost = {4.4632966025E10 rows, 0.0 cpu, 0.0 
io}, id = 2072
HiveJoinRel(condition=[=($3, $6)], 
joinType=[inner]): rowcount = 2.1571355690210062E14, cumulative cost = 

[jira] [Created] (HIVE-8769) Physical optimizer : Incorrect CE results in a shuffle join instead of a Map join (PK/FK pattern not detected)

2014-11-06 Thread Mostafa Mokhtar (JIRA)
Mostafa Mokhtar created HIVE-8769:
-

 Summary: Physical optimizer : Incorrect CE results in a shuffle 
join instead of a Map join (PK/FK pattern not detected)
 Key: HIVE-8769
 URL: https://issues.apache.org/jira/browse/HIVE-8769
 Project: Hive
  Issue Type: Bug
  Components: Physical Optimizer
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Prasanth J
 Fix For: 0.15.0


TPC-DS Q82 is running slower than Hive 13 because the join type is not correct.

The estimate for item x inventory x date_dim is 227 million rows while the
actual is 3K rows.

Hive 13 finishes in 753 seconds.
Hive 14 finishes in 1,267 seconds.
Hive 14 + force map join finished in 431 seconds.
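
One way to force the map join (a sketch of the relevant settings; the exact
threshold used for the 431 second run is not stated above):
{code}
set hive.auto.convert.join=true;
set hive.auto.convert.join.noconditionaltask=true;
-- raise the small-table threshold so the dimension side qualifies for a broadcast map join
set hive.auto.convert.join.noconditionaltask.size=512000000;
{code}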

Query
{code}
select  i_item_id
   ,i_item_desc
   ,i_current_price
 from item, inventory, date_dim, store_sales
 where i_current_price between 30 and 30+30
 and inv_item_sk = i_item_sk
 and d_date_sk=inv_date_sk
 and d_date between '2002-05-30' and '2002-07-30'
 and i_manufact_id in (437,129,727,663)
 and inv_quantity_on_hand between 100 and 500
 and ss_item_sk = i_item_sk
 group by i_item_id,i_item_desc,i_current_price
 order by i_item_id
 limit 100
{code}

Plan 
{code}
STAGE PLANS:
  Stage: Stage-1
Tez
  Edges:
Map 7 <- Map 1 (BROADCAST_EDGE), Map 2 (BROADCAST_EDGE)
Reducer 4 <- Map 3 (SIMPLE_EDGE), Map 7 (SIMPLE_EDGE)
Reducer 5 <- Reducer 4 (SIMPLE_EDGE)
Reducer 6 <- Reducer 5 (SIMPLE_EDGE)
  DagName: mmokhtar_20141106005353_7a2eb8df-12ff-4fe9-89b4-30f1e4e3fb90:1
  Vertices:
Map 1 
Map Operator Tree:
TableScan
  alias: item
  filterExpr: ((i_current_price BETWEEN 30 AND 60 and 
(i_manufact_id) IN (437, 129, 727, 663)) and i_item_sk is not null) (type: 
boolean)
  Statistics: Num rows: 462000 Data size: 663862160 Basic 
stats: COMPLETE Column stats: COMPLETE
  Filter Operator
predicate: ((i_current_price BETWEEN 30 AND 60 and 
(i_manufact_id) IN (437, 129, 727, 663)) and i_item_sk is not null) (type: 
boolean)
Statistics: Num rows: 115500 Data size: 34185680 Basic 
stats: COMPLETE Column stats: COMPLETE
Select Operator
  expressions: i_item_sk (type: int), i_item_id (type: 
string), i_item_desc (type: string), i_current_price (type: float)
  outputColumnNames: _col0, _col1, _col2, _col3
  Statistics: Num rows: 115500 Data size: 33724832 Basic 
stats: COMPLETE Column stats: COMPLETE
  Reduce Output Operator
key expressions: _col0 (type: int)
sort order: +
Map-reduce partition columns: _col0 (type: int)
Statistics: Num rows: 115500 Data size: 33724832 Basic 
stats: COMPLETE Column stats: COMPLETE
value expressions: _col1 (type: string), _col2 (type: 
string), _col3 (type: float)
Execution mode: vectorized
Map 2 
Map Operator Tree:
TableScan
  alias: date_dim
  filterExpr: (d_date BETWEEN '2002-05-30' AND '2002-07-30' and 
d_date_sk is not null) (type: boolean)
  Statistics: Num rows: 73049 Data size: 81741831 Basic stats: 
COMPLETE Column stats: COMPLETE
  Filter Operator
predicate: (d_date BETWEEN '2002-05-30' AND '2002-07-30' 
and d_date_sk is not null) (type: boolean)
Statistics: Num rows: 36524 Data size: 3579352 Basic stats: 
COMPLETE Column stats: COMPLETE
Select Operator
  expressions: d_date_sk (type: int)
  outputColumnNames: _col0
  Statistics: Num rows: 36524 Data size: 146096 Basic 
stats: COMPLETE Column stats: COMPLETE
  Reduce Output Operator
key expressions: _col0 (type: int)
sort order: +
Map-reduce partition columns: _col0 (type: int)
Statistics: Num rows: 36524 Data size: 146096 Basic 
stats: COMPLETE Column stats: COMPLETE
  Select Operator
expressions: _col0 (type: int)
outputColumnNames: _col0
Statistics: Num rows: 36524 Data size: 146096 Basic 
stats: COMPLETE Column stats: COMPLETE
Group By Operator
  keys: _col0 (type: int)
  mode: hash
  outputColumnNames: _col0
  Statistics: Num rows: 18262 Data size: 73048 Basic 
stats: COMPLETE Column stats: COMPLETE
  Dynamic Partitioning Event 

[jira] [Created] (HIVE-8747) Estimate number of rows for table with 0 rows overflows resulting in an in-efficient plan

2014-11-05 Thread Mostafa Mokhtar (JIRA)
Mostafa Mokhtar created HIVE-8747:
-

 Summary: Estimate number of rows for table with 0 rows overflows 
resulting in an in-efficient plan 
 Key: HIVE-8747
 URL: https://issues.apache.org/jira/browse/HIVE-8747
 Project: Hive
  Issue Type: Bug
  Components: Physical Optimizer
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Prasanth J
Priority: Critical
 Fix For: 0.14.0


Query 
{code}
select count(*) 
from
  web_sales
 ,date_dim
  ,ship_mode
 where
 web_sales.ws_sold_date_sk = date_dim.d_date_sk
and web_sales.ws_ship_mode_sk = ship_mode.sm_ship_mode_sk
and d_year = 2002
and sm_carrier in ('DIAMOND','AIRBORNE')
{code}

Explain 
{code}
STAGE PLANS:
  Stage: Stage-1
Tez
  Edges:
Map 1 <- Map 4 (BROADCAST_EDGE)
Map 4 <- Map 3 (BROADCAST_EDGE)
Reducer 2 <- Map 1 (SIMPLE_EDGE)
  DagName: mmokhtar_20141105180404_59e6fb65-529f-4eaa-9446-7f34d12bffac:30
  Vertices:
Map 1
Map Operator Tree:
TableScan
  alias: ship_mode
  filterExpr: ((sm_carrier) IN ('DIAMOND', 'AIRBORNE') and 
sm_ship_mode_sk is not null) (type: boolean)
  Statistics: Num rows: 0 Data size: 45 Basic stats: PARTIAL 
Column stats: COMPLETE
  Filter Operator
predicate: ((sm_carrier) IN ('DIAMOND', 'AIRBORNE') and 
sm_ship_mode_sk is not null) (type: boolean)
Statistics: Num rows: 9223372036854775807 Data size: 
9223372036854775807 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
  expressions: sm_ship_mode_sk (type: int)
  outputColumnNames: _col0
  Statistics: Num rows: 9223372036854775807 Data size: 
9223372036854775807 Basic stats: COMPLETE Column stats: COMPLETE
  Map Join Operator
condition map:
 Inner Join 0 to 1
condition expressions:
  0
  1
keys:
  0 _col1 (type: int)
  1 _col0 (type: int)
input vertices:
  0 Map 4
Statistics: Num rows: 9223372036854775807 Data size: 0 
Basic stats: PARTIAL Column stats: COMPLETE
Select Operator
  Statistics: Num rows: 9223372036854775807 Data size: 
0 Basic stats: PARTIAL Column stats: COMPLETE
  Group By Operator
aggregations: count()
mode: hash
outputColumnNames: _col0
Statistics: Num rows: 1 Data size: 8 Basic stats: 
COMPLETE Column stats: COMPLETE
Reduce Output Operator
  sort order:
  Statistics: Num rows: 1 Data size: 8 Basic stats: 
COMPLETE Column stats: COMPLETE
  value expressions: _col0 (type: bigint)
Execution mode: vectorized
Map 3
Map Operator Tree:
TableScan
  alias: date_dim
  filterExpr: ((d_year = 2002) and d_date_sk is not null) 
(type: boolean)
  Statistics: Num rows: 73049 Data size: 81741831 Basic stats: 
COMPLETE Column stats: COMPLETE
  Filter Operator
predicate: ((d_year = 2002) and d_date_sk is not null) 
(type: boolean)
Statistics: Num rows: 652 Data size: 5216 Basic stats: 
COMPLETE Column stats: COMPLETE
Select Operator
  expressions: d_date_sk (type: int)
  outputColumnNames: _col0
  Statistics: Num rows: 652 Data size: 2608 Basic stats: 
COMPLETE Column stats: COMPLETE
  Reduce Output Operator
key expressions: _col0 (type: int)
sort order: +
Map-reduce partition columns: _col0 (type: int)
Statistics: Num rows: 652 Data size: 2608 Basic stats: 
COMPLETE Column stats: COMPLETE
Execution mode: vectorized
Map 4
Map Operator Tree:
TableScan
  alias: web_sales
  filterExpr: (ws_sold_date_sk is not null and ws_ship_mode_sk 
is not null) (type: boolean)
  Statistics: Num rows: 143966864 Data size: 19577477788 Basic 
stats: COMPLETE Column stats: COMPLETE
  Filter Operator
predicate: (ws_sold_date_sk is not null and ws_ship_mode_sk 
is not null) (type: 

[jira] [Updated] (HIVE-8747) Estimate number of rows for table with 0 rows overflows resulting in an in-efficient plan

2014-11-05 Thread Mostafa Mokhtar (JIRA)

 [ 
https://issues.apache.org/jira/browse/HIVE-8747?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Mostafa Mokhtar updated HIVE-8747:
--
Description: 
The ship_mode table has 0 rows.
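
For reference, the empty-table statistic that feeds the estimate can be checked
with standard Hive commands (not part of the original report):
{code}
describe formatted ship_mode;     -- numRows should show as 0 in the table parameters
select count(*) from ship_mode;   -- confirms the table itself is empty
{code}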

Query 
{code}
select count(*) 
from
  web_sales
 ,date_dim
  ,ship_mode
 where
 web_sales.ws_sold_date_sk = date_dim.d_date_sk
and web_sales.ws_ship_mode_sk = ship_mode.sm_ship_mode_sk
and d_year = 2002
and sm_carrier in ('DIAMOND','AIRBORNE')
{code}

Explain 
{code}
STAGE PLANS:
  Stage: Stage-1
Tez
  Edges:
Map 1 <- Map 4 (BROADCAST_EDGE)
Map 4 <- Map 3 (BROADCAST_EDGE)
Reducer 2 <- Map 1 (SIMPLE_EDGE)
  DagName: mmokhtar_20141105180404_59e6fb65-529f-4eaa-9446-7f34d12bffac:30
  Vertices:
Map 1
Map Operator Tree:
TableScan
  alias: ship_mode
  filterExpr: ((sm_carrier) IN ('DIAMOND', 'AIRBORNE') and 
sm_ship_mode_sk is not null) (type: boolean)
  Statistics: Num rows: 0 Data size: 45 Basic stats: PARTIAL 
Column stats: COMPLETE
  Filter Operator
predicate: ((sm_carrier) IN ('DIAMOND', 'AIRBORNE') and 
sm_ship_mode_sk is not null) (type: boolean)
Statistics: Num rows: 9223372036854775807 Data size: 
9223372036854775807 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
  expressions: sm_ship_mode_sk (type: int)
  outputColumnNames: _col0
  Statistics: Num rows: 9223372036854775807 Data size: 
9223372036854775807 Basic stats: COMPLETE Column stats: COMPLETE
  Map Join Operator
condition map:
 Inner Join 0 to 1
condition expressions:
  0
  1
keys:
  0 _col1 (type: int)
  1 _col0 (type: int)
input vertices:
  0 Map 4
Statistics: Num rows: 9223372036854775807 Data size: 0 
Basic stats: PARTIAL Column stats: COMPLETE
Select Operator
  Statistics: Num rows: 9223372036854775807 Data size: 
0 Basic stats: PARTIAL Column stats: COMPLETE
  Group By Operator
aggregations: count()
mode: hash
outputColumnNames: _col0
Statistics: Num rows: 1 Data size: 8 Basic stats: 
COMPLETE Column stats: COMPLETE
Reduce Output Operator
  sort order:
  Statistics: Num rows: 1 Data size: 8 Basic stats: 
COMPLETE Column stats: COMPLETE
  value expressions: _col0 (type: bigint)
Execution mode: vectorized
Map 3
Map Operator Tree:
TableScan
  alias: date_dim
  filterExpr: ((d_year = 2002) and d_date_sk is not null) 
(type: boolean)
  Statistics: Num rows: 73049 Data size: 81741831 Basic stats: 
COMPLETE Column stats: COMPLETE
  Filter Operator
predicate: ((d_year = 2002) and d_date_sk is not null) 
(type: boolean)
Statistics: Num rows: 652 Data size: 5216 Basic stats: 
COMPLETE Column stats: COMPLETE
Select Operator
  expressions: d_date_sk (type: int)
  outputColumnNames: _col0
  Statistics: Num rows: 652 Data size: 2608 Basic stats: 
COMPLETE Column stats: COMPLETE
  Reduce Output Operator
key expressions: _col0 (type: int)
sort order: +
Map-reduce partition columns: _col0 (type: int)
Statistics: Num rows: 652 Data size: 2608 Basic stats: 
COMPLETE Column stats: COMPLETE
Execution mode: vectorized
Map 4
Map Operator Tree:
TableScan
  alias: web_sales
  filterExpr: (ws_sold_date_sk is not null and ws_ship_mode_sk 
is not null) (type: boolean)
  Statistics: Num rows: 143966864 Data size: 19577477788 Basic 
stats: COMPLETE Column stats: COMPLETE
  Filter Operator
predicate: (ws_sold_date_sk is not null and ws_ship_mode_sk 
is not null) (type: boolean)
Statistics: Num rows: 143948856 Data size: 1151518824 Basic 
stats: COMPLETE Column stats: COMPLETE
Select Operator
  expressions: ws_sold_date_sk (type: int), ws_ship_mode_sk 
(type: int)
  outputColumnNames: _col0, 

[jira] [Created] (HIVE-8752) Disjunction cardinality estimation has selectivity of 1

2014-11-05 Thread Mostafa Mokhtar (JIRA)
Mostafa Mokhtar created HIVE-8752:
-

 Summary: Disjunction cardinality estimation has selectivity of 1
 Key: HIVE-8752
 URL: https://issues.apache.org/jira/browse/HIVE-8752
 Project: Hive
  Issue Type: Bug
  Components: CBO
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Laljo John Pullokkaran
Priority: Critical
 Fix For: 0.14.0


TPC-DS Q89 has the wrong join order.
store_sales should be joined with item first, then with date_dim.

The issue is that the predicate on item shows a selectivity of 1 
{code}
((i_category in ('Home','Books','Electronics') and
  i_class in ('wallpaper','parenting','musical')
 )
  or (i_category in ('Shoes','Jewelry','Men') and
  i_class in ('womens','birdal','pants') 
))
{code}

{code}
HiveProjectRel(i_item_sk=[$0], i_brand=[$8], i_class=[$10], 
i_category=[$12]): rowcount = 462000.0, cumulative cost = {0.0 rows, 0.0 cpu, 
0.0 io}, id = 4052
  HiveFilterRel(condition=[OR(AND(in($12, 'Home', 'Books', 
'Electronics'), in($10, 'wallpaper', 'parenting', 'musical')), AND(in($12, 
'Shoes', 'Jewelry', 'Men'), in($10, 'womens', 'birdal', 'pants')))]): rowcount 
= 462000.0, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 4050

HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_3.item]]): rowcount = 
462000.0, cumulative cost = {0}, id = 3818
{code}
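
For comparison, a common textbook estimate for a disjunction under an independence
assumption (a sketch, not necessarily what Hive should implement) is:
{code}
-- sel(p1 OR p2) ~= sel(p1) + sel(p2) - sel(p1) * sel(p2)
-- e.g. if each (i_category IN (...) AND i_class IN (...)) branch kept ~1% of item
-- rows, the disjunction would keep roughly 2%, not the 100% implied by the unchanged
-- rowcount of 462000 in the plan above.
{code}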

Query
{code}

select  *
from(
select i_category, i_class, i_brand,
   s_store_name, s_company_name,
   d_moy,
   sum(ss_sales_price) sum_sales,
   avg(sum(ss_sales_price)) over
 (partition by i_category, i_brand, s_store_name, s_company_name)
 avg_monthly_sales
from item, store_sales, date_dim, store
where store_sales.ss_item_sk = item.i_item_sk and
  store_sales.ss_sold_date_sk = date_dim.d_date_sk and
  store_sales.ss_store_sk = store.s_store_sk and
  d_year in (2000) and
((i_category in ('Home','Books','Electronics') and
  i_class in ('wallpaper','parenting','musical')
 )
  or (i_category in ('Shoes','Jewelry','Men') and
  i_class in ('womens','birdal','pants') 
))
group by i_category, i_class, i_brand,
 s_store_name, s_company_name, d_moy) tmp1
where case when (avg_monthly_sales <> 0) then (abs(sum_sales - 
avg_monthly_sales) / avg_monthly_sales) else null end > 0.1
order by sum_sales - avg_monthly_sales, s_store_name
limit 100
{code}

The result of the wrong join order is that the query runs in 335 seconds 
compared to 124 seconds with the correct join order.

Removing the disjunction in the item filter produces the correct plan
{code}
 i_category in ('Home','Books','Electronics') and
  i_class in ('wallpaper','parenting','musical')
{code}



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)


[jira] [Created] (HIVE-8727) Dag summary has incorrect row counts and duration per vertex

2014-11-04 Thread Mostafa Mokhtar (JIRA)
Mostafa Mokhtar created HIVE-8727:
-

 Summary: Dag summary has incorrect row counts and duration per 
vertex
 Key: HIVE-8727
 URL: https://issues.apache.org/jira/browse/HIVE-8727
 Project: Hive
  Issue Type: Bug
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Prasanth J
 Fix For: 0.14.0


During the code review for HIVE-8495 some code was reworked, which broke some of
the INPUT/OUTPUT counters and durations.

A patch which fixes that is attached.



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)


[jira] [Updated] (HIVE-8727) Dag summary has incorrect row counts and duration per vertex

2014-11-04 Thread Mostafa Mokhtar (JIRA)

 [ 
https://issues.apache.org/jira/browse/HIVE-8727?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Mostafa Mokhtar updated HIVE-8727:
--
Attachment: HIVE-8727.1.patch

 Dag summary has incorrect row counts and duration per vertex
 

 Key: HIVE-8727
 URL: https://issues.apache.org/jira/browse/HIVE-8727
 Project: Hive
  Issue Type: Bug
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Prasanth J
 Fix For: 0.14.0

 Attachments: HIVE-8727.1.patch


 During the code review for HIVE-8495 some code was reworked, which broke some of
 the INPUT/OUTPUT counters and durations.
 A patch which fixes that is attached.



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)


[jira] [Created] (HIVE-8731) TPC-DS Q49 : More rows are returned than the limit set in the query

2014-11-04 Thread Mostafa Mokhtar (JIRA)
Mostafa Mokhtar created HIVE-8731:
-

 Summary: TPC-DS Q49 : More rows are returned than the limit set in 
the query 
 Key: HIVE-8731
 URL: https://issues.apache.org/jira/browse/HIVE-8731
 Project: Hive
  Issue Type: Bug
  Components: Vectorization
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Matt McCline
Priority: Critical
 Fix For: 0.14.0


TPC-DS query 49 returns more rows than the limit set in the query.
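
A minimal illustration of the ORDER BY / LIMIT over UNION ALL pattern involved
(a toy query against the TPC-DS schema, not the Q49 text or a confirmed workaround):
{code}
-- Wrapping the UNION ALL in a subquery makes the final ORDER BY and LIMIT
-- unambiguously apply to the combined result rather than to the last branch.
select channel, item
from (
  select 'web'   as channel, ws_item_sk as item from web_sales
  union all
  select 'store' as channel, ss_item_sk as item from store_sales
) u
order by channel, item
limit 100;
{code}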

Query 
{code}
set hive.cbo.enable=true;
set hive.stats.fetch.column.stats=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.tez.auto.reducer.parallelism=true;
set hive.auto.convert.join.noconditionaltask.size=128000;
set hive.exec.reducers.bytes.per.reducer=1;
set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DummyTxnManager;
set hive.support.concurrency=false;
set hive.tez.exec.print.summary=true;

explain  

select  
 'web' as channel
 ,web.item
 ,web.return_ratio
 ,web.return_rank
 ,web.currency_rank
 from (
select 
 item
,return_ratio
,currency_ratio
,rank() over (order by return_ratio) as return_rank
,rank() over (order by currency_ratio) as currency_rank
from
(   select ws.ws_item_sk as item
,(cast(sum(coalesce(wr.wr_return_quantity,0)) as decimal(15,4))/
cast(sum(coalesce(ws.ws_quantity,0)) as decimal(15,4) )) as 
return_ratio
,(cast(sum(coalesce(wr.wr_return_amt,0)) as decimal(15,4))/
cast(sum(coalesce(ws.ws_net_paid,0)) as decimal(15,4) )) as 
currency_ratio
from 
 web_sales ws left outer join web_returns wr 
on (ws.ws_order_number = wr.wr_order_number and 
ws.ws_item_sk = wr.wr_item_sk)
 ,date_dim
where 
wr.wr_return_amt > 1
and ws.ws_net_profit > 1
 and ws.ws_net_paid > 0
 and ws.ws_quantity > 0
 and ws.ws_sold_date_sk = date_dim.d_date_sk
 and d_year = 2000
 and d_moy = 12
group by ws.ws_item_sk
) in_web
 ) web
 where 
 (
 web.return_rank <= 10
 or
 web.currency_rank <= 10
 )
 union all
 select 
 'catalog' as channel
 ,catalog.item
 ,catalog.return_ratio
 ,catalog.return_rank
 ,catalog.currency_rank
 from (
select 
 item
,return_ratio
,currency_ratio
,rank() over (order by return_ratio) as return_rank
,rank() over (order by currency_ratio) as currency_rank
from
(   select 
cs.cs_item_sk as item
,(cast(sum(coalesce(cr.cr_return_quantity,0)) as decimal(15,4))/
cast(sum(coalesce(cs.cs_quantity,0)) as decimal(15,4) )) as 
return_ratio
,(cast(sum(coalesce(cr.cr_return_amount,0)) as decimal(15,4))/
cast(sum(coalesce(cs.cs_net_paid,0)) as decimal(15,4) )) as 
currency_ratio
from 
catalog_sales cs left outer join catalog_returns cr
on (cs.cs_order_number = cr.cr_order_number and 
cs.cs_item_sk = cr.cr_item_sk)
,date_dim
where 
cr.cr_return_amount > 1
and cs.cs_net_profit > 1
 and cs.cs_net_paid > 0
 and cs.cs_quantity > 0
 and cs_sold_date_sk = d_date_sk
 and d_year = 2000
 and d_moy = 12
 group by cs.cs_item_sk
) in_cat
 ) catalog
 where 
 (
 catalog.return_rank <= 10
 or
 catalog.currency_rank <= 10
 )
 union all
 select 
 'store' as channel
 ,store.item
 ,store.return_ratio
 ,store.return_rank
 ,store.currency_rank
 from (
select 
 item
,return_ratio
,currency_ratio
,rank() over (order by return_ratio) as return_rank
,rank() over (order by currency_ratio) as currency_rank
from
(   select sts.ss_item_sk as item
,(cast(sum(coalesce(sr.sr_return_quantity,0)) as 
decimal(15,4))/cast(sum(coalesce(sts.ss_quantity,0)) as decimal(15,4) )) as 
return_ratio
,(cast(sum(coalesce(sr.sr_return_amt,0)) as 
decimal(15,4))/cast(sum(coalesce(sts.ss_net_paid,0)) as decimal(15,4) )) as 
currency_ratio
from 
store_sales sts left outer join store_returns sr
on (sts.ss_ticket_number = sr.sr_ticket_number and 
sts.ss_item_sk = sr.sr_item_sk)
,date_dim
where 
sr.sr_return_amt > 1
and sts.ss_net_profit > 1
  

[jira] [Updated] (HIVE-8731) TPC-DS Q49 : Semantic analyzer order by is not honored when used after union all

2014-11-04 Thread Mostafa Mokhtar (JIRA)

 [ 
https://issues.apache.org/jira/browse/HIVE-8731?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Mostafa Mokhtar updated HIVE-8731:
--
Assignee: Gunther Hagleitner  (was: Matt McCline)

 TPC-DS Q49 : Semantic analyzer order by is not honored when used after union 
 all 
 -

 Key: HIVE-8731
 URL: https://issues.apache.org/jira/browse/HIVE-8731
 Project: Hive
  Issue Type: Bug
  Components: Vectorization
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Gunther Hagleitner
Priority: Critical
 Fix For: 0.14.0


 TPC-DS query 49 returns more rows than the limit set in the query.
 Query 
 {code}
 set hive.cbo.enable=true;
 set hive.stats.fetch.column.stats=true;
 set hive.exec.dynamic.partition.mode=nonstrict;
 set hive.tez.auto.reducer.parallelism=true;
 set hive.auto.convert.join.noconditionaltask.size=128000;
 set hive.exec.reducers.bytes.per.reducer=1;
 set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DummyTxnManager;
 set hive.support.concurrency=false;
 set hive.tez.exec.print.summary=true;
 explain  
 select  
  'web' as channel
  ,web.item
  ,web.return_ratio
  ,web.return_rank
  ,web.currency_rank
  from (
   select 
item
   ,return_ratio
   ,currency_ratio
   ,rank() over (order by return_ratio) as return_rank
   ,rank() over (order by currency_ratio) as currency_rank
   from
   (   select ws.ws_item_sk as item
   ,(cast(sum(coalesce(wr.wr_return_quantity,0)) as decimal(15,4))/
   cast(sum(coalesce(ws.ws_quantity,0)) as decimal(15,4) )) as 
 return_ratio
   ,(cast(sum(coalesce(wr.wr_return_amt,0)) as decimal(15,4))/
   cast(sum(coalesce(ws.ws_net_paid,0)) as decimal(15,4) )) as 
 currency_ratio
   from 
web_sales ws left outer join web_returns wr 
   on (ws.ws_order_number = wr.wr_order_number and 
   ws.ws_item_sk = wr.wr_item_sk)
  ,date_dim
   where 
   wr.wr_return_amt > 1
   and ws.ws_net_profit > 1
  and ws.ws_net_paid > 0
  and ws.ws_quantity > 0
  and ws.ws_sold_date_sk = date_dim.d_date_sk
  and d_year = 2000
  and d_moy = 12
   group by ws.ws_item_sk
   ) in_web
  ) web
  where 
  (
  web.return_rank <= 10
  or
  web.currency_rank <= 10
  )
  union all
  select 
  'catalog' as channel
  ,catalog.item
  ,catalog.return_ratio
  ,catalog.return_rank
  ,catalog.currency_rank
  from (
   select 
item
   ,return_ratio
   ,currency_ratio
   ,rank() over (order by return_ratio) as return_rank
   ,rank() over (order by currency_ratio) as currency_rank
   from
   (   select 
   cs.cs_item_sk as item
   ,(cast(sum(coalesce(cr.cr_return_quantity,0)) as decimal(15,4))/
   cast(sum(coalesce(cs.cs_quantity,0)) as decimal(15,4) )) as 
 return_ratio
   ,(cast(sum(coalesce(cr.cr_return_amount,0)) as decimal(15,4))/
   cast(sum(coalesce(cs.cs_net_paid,0)) as decimal(15,4) )) as 
 currency_ratio
   from 
   catalog_sales cs left outer join catalog_returns cr
   on (cs.cs_order_number = cr.cr_order_number and 
   cs.cs_item_sk = cr.cr_item_sk)
 ,date_dim
   where 
   cr.cr_return_amount > 1
   and cs.cs_net_profit > 1
  and cs.cs_net_paid > 0
  and cs.cs_quantity > 0
  and cs_sold_date_sk = d_date_sk
  and d_year = 2000
  and d_moy = 12
  group by cs.cs_item_sk
   ) in_cat
  ) catalog
  where 
  (
  catalog.return_rank <= 10
  or
  catalog.currency_rank <= 10
  )
  union all
  select 
  'store' as channel
  ,store.item
  ,store.return_ratio
  ,store.return_rank
  ,store.currency_rank
  from (
   select 
item
   ,return_ratio
   ,currency_ratio
   ,rank() over (order by return_ratio) as return_rank
   ,rank() over (order by currency_ratio) as currency_rank
   from
   (   select sts.ss_item_sk as item
   ,(cast(sum(coalesce(sr.sr_return_quantity,0)) as 
 decimal(15,4))/cast(sum(coalesce(sts.ss_quantity,0)) as decimal(15,4) )) as 
 return_ratio
   ,(cast(sum(coalesce(sr.sr_return_amt,0)) as 
 decimal(15,4))/cast(sum(coalesce(sts.ss_net_paid,0)) as decimal(15,4) )) as 
 currency_ratio
   from 
   store_sales sts left outer join store_returns 

[jira] [Updated] (HIVE-8731) TPC-DS Q49 : Semantic analyzer order by is not honored when used after union all

2014-11-04 Thread Mostafa Mokhtar (JIRA)

 [ 
https://issues.apache.org/jira/browse/HIVE-8731?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Mostafa Mokhtar updated HIVE-8731:
--
Summary: TPC-DS Q49 : Semantic analyzer order by is not honored when used 
after union all   (was: TPC-DS Q49 : More rows are returned than the limit set 
in the query )

 TPC-DS Q49 : Semantic analyzer order by is not honored when used after union 
 all 
 -

 Key: HIVE-8731
 URL: https://issues.apache.org/jira/browse/HIVE-8731
 Project: Hive
  Issue Type: Bug
  Components: Vectorization
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Matt McCline
Priority: Critical
 Fix For: 0.14.0


 TPC-DS query 49 returns more rows than the limit set in the query.
 Query 
 {code}
 set hive.cbo.enable=true;
 set hive.stats.fetch.column.stats=true;
 set hive.exec.dynamic.partition.mode=nonstrict;
 set hive.tez.auto.reducer.parallelism=true;
 set hive.auto.convert.join.noconditionaltask.size=128000;
 set hive.exec.reducers.bytes.per.reducer=1;
 set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DummyTxnManager;
 set hive.support.concurrency=false;
 set hive.tez.exec.print.summary=true;
 explain  
 select  
  'web' as channel
  ,web.item
  ,web.return_ratio
  ,web.return_rank
  ,web.currency_rank
  from (
   select 
item
   ,return_ratio
   ,currency_ratio
   ,rank() over (order by return_ratio) as return_rank
   ,rank() over (order by currency_ratio) as currency_rank
   from
   (   select ws.ws_item_sk as item
   ,(cast(sum(coalesce(wr.wr_return_quantity,0)) as decimal(15,4))/
   cast(sum(coalesce(ws.ws_quantity,0)) as decimal(15,4) )) as 
 return_ratio
   ,(cast(sum(coalesce(wr.wr_return_amt,0)) as decimal(15,4))/
   cast(sum(coalesce(ws.ws_net_paid,0)) as decimal(15,4) )) as 
 currency_ratio
   from 
web_sales ws left outer join web_returns wr 
   on (ws.ws_order_number = wr.wr_order_number and 
   ws.ws_item_sk = wr.wr_item_sk)
  ,date_dim
   where 
   wr.wr_return_amt > 1
   and ws.ws_net_profit > 1
  and ws.ws_net_paid > 0
  and ws.ws_quantity > 0
  and ws.ws_sold_date_sk = date_dim.d_date_sk
  and d_year = 2000
  and d_moy = 12
   group by ws.ws_item_sk
   ) in_web
  ) web
  where 
  (
  web.return_rank <= 10
  or
  web.currency_rank <= 10
  )
  union all
  select 
  'catalog' as channel
  ,catalog.item
  ,catalog.return_ratio
  ,catalog.return_rank
  ,catalog.currency_rank
  from (
   select 
item
   ,return_ratio
   ,currency_ratio
   ,rank() over (order by return_ratio) as return_rank
   ,rank() over (order by currency_ratio) as currency_rank
   from
   (   select 
   cs.cs_item_sk as item
   ,(cast(sum(coalesce(cr.cr_return_quantity,0)) as decimal(15,4))/
   cast(sum(coalesce(cs.cs_quantity,0)) as decimal(15,4) )) as 
 return_ratio
   ,(cast(sum(coalesce(cr.cr_return_amount,0)) as decimal(15,4))/
   cast(sum(coalesce(cs.cs_net_paid,0)) as decimal(15,4) )) as 
 currency_ratio
   from 
   catalog_sales cs left outer join catalog_returns cr
   on (cs.cs_order_number = cr.cr_order_number and 
   cs.cs_item_sk = cr.cr_item_sk)
 ,date_dim
   where 
   cr.cr_return_amount > 1
   and cs.cs_net_profit > 1
  and cs.cs_net_paid > 0
  and cs.cs_quantity > 0
  and cs_sold_date_sk = d_date_sk
  and d_year = 2000
  and d_moy = 12
  group by cs.cs_item_sk
   ) in_cat
  ) catalog
  where 
  (
  catalog.return_rank <= 10
  or
  catalog.currency_rank <= 10
  )
  union all
  select 
  'store' as channel
  ,store.item
  ,store.return_ratio
  ,store.return_rank
  ,store.currency_rank
  from (
   select 
item
   ,return_ratio
   ,currency_ratio
   ,rank() over (order by return_ratio) as return_rank
   ,rank() over (order by currency_ratio) as currency_rank
   from
   (   select sts.ss_item_sk as item
   ,(cast(sum(coalesce(sr.sr_return_quantity,0)) as 
 decimal(15,4))/cast(sum(coalesce(sts.ss_quantity,0)) as decimal(15,4) )) as 
 return_ratio
   ,(cast(sum(coalesce(sr.sr_return_amt,0)) as 
 decimal(15,4))/cast(sum(coalesce(sts.ss_net_paid,0)) as 

[jira] [Updated] (HIVE-8671) Overflow in estimate row count and data size with fetch column stats

2014-10-31 Thread Mostafa Mokhtar (JIRA)

 [ 
https://issues.apache.org/jira/browse/HIVE-8671?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Mostafa Mokhtar updated HIVE-8671:
--
Attachment: HIVE-8671.1.patch

 Overflow in estimate row count and data size with fetch column stats
 

 Key: HIVE-8671
 URL: https://issues.apache.org/jira/browse/HIVE-8671
 Project: Hive
  Issue Type: Bug
  Components: Physical Optimizer
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Prasanth J
Priority: Critical
 Fix For: 0.14.0

 Attachments: HIVE-8671.1.patch


 Overflow in row counts and data size for several TPC-DS queries.
 Interestingly, the operators which have an overflow end up running with a small
 parallelism.
 For instance, Reducer 2 has an overflow but it only runs with a parallelism of 2.
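 For context, auto reducer parallelism is roughly driven by the estimated data size
 (a sketch of the relationship, not the exact Tez logic):
 {code}
 -- desired reducers ~= estimated data size / hive.exec.reducers.bytes.per.reducer,
 -- capped by hive.exec.reducers.max; an overflowed Long.MAX_VALUE size estimate would
 -- therefore be expected to push parallelism to the cap, which makes the observed
 -- parallelism of 2 alongside the overflow surprising.
 {code}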
 {code}
Reducer 2 
 Reduce Operator Tree:
   Group By Operator
 aggregations: sum(VALUE._col0)
 keys: KEY._col0 (type: string), KEY._col1 (type: string), 
 KEY._col2 (type: string), KEY._col3 (type: string), KEY._col4 (type: float)
 mode: mergepartial
 outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
 Statistics: Num rows: 9223372036854775807 Data size: 
 9223372036854775341 Basic stats: COMPLETE Column stats: COMPLETE
 Reduce Output Operator
   key expressions: _col3 (type: string), _col3 (type: string)
   sort order: ++
   Map-reduce partition columns: _col3 (type: string)
   Statistics: Num rows: 9223372036854775807 Data size: 
 9223372036854775341 Basic stats: COMPLETE Column stats: COMPLETE
   value expressions: _col0 (type: string), _col1 (type: 
 string), _col2 (type: string), _col3 (type: string), _col4 (type: float), 
 _col5 (type: double)
 Execution mode: vectorized
 {code}
 {code}
 VERTEX         TOTAL_TASKS  DURATION_SECONDS  CPU_TIME_MILLIS  INPUT_RECORDS  OUTPUT_RECORDS
 Map 1                   62             26.41        1,779,510    211,978,502      60,628,390
 Map 5                    1              4.28            6,950        138,098         138,098
 Map 6                    1              2.44            3,910             31              31
 Reducer 2                2             22.69           61,320     60,628,390          69,182
 Reducer 3                1              2.63            3,910         69,182             100
 Reducer 4                1              1.01            1,180            100             100
 {code}
 Query
 {code}
 explain  
 select  i_item_desc 
   ,i_category 
   ,i_class 
   ,i_current_price
   ,i_item_id
   ,sum(ws_ext_sales_price) as itemrevenue 
   ,sum(ws_ext_sales_price)*100/sum(sum(ws_ext_sales_price)) over
   (partition by i_class) as revenueratio
 from  
   web_sales
   ,item 
   ,date_dim
 where 
   web_sales.ws_item_sk = item.i_item_sk 
   and item.i_category in ('Jewelry', 'Sports', 'Books')
   and web_sales.ws_sold_date_sk = date_dim.d_date_sk
   and date_dim.d_date between '2001-01-12' and '2001-02-11'
 group by 
   i_item_id
 ,i_item_desc 
 ,i_category
 ,i_class
 ,i_current_price
 order by 
   i_category
 ,i_class
 ,i_item_id
 ,i_item_desc
 ,revenueratio
 limit 100
 {code}
 Explain 
 {code}
 STAGE PLANS:
   Stage: Stage-1
 Tez
   Edges:
 Map 1 <- Map 5 (BROADCAST_EDGE), Map 6 (BROADCAST_EDGE)
 Reducer 2 <- Map 1 (SIMPLE_EDGE)
 Reducer 3 <- Reducer 2 (SIMPLE_EDGE)
 Reducer 4 <- Reducer 3 (SIMPLE_EDGE)
   DagName: mmokhtar_20141019164343_854cb757-01bd-40cb-843e-9ada7c5e6f38:1
   Vertices:
 Map 1 
 Map Operator Tree:
 TableScan
   alias: web_sales
   filterExpr: ws_item_sk is not null (type: boolean)
   Statistics: Num rows: 21594638446 Data size: 2850189889652 
 Basic stats: COMPLETE Column stats: COMPLETE
   Filter Operator
 predicate: ws_item_sk is not null (type: boolean)
 Statistics: Num rows: 21594638446 Data size: 172746300152 
 Basic stats: COMPLETE Column stats: COMPLETE
 Select Operator
   expressions: ws_item_sk (type: int), ws_ext_sales_price 
 (type: float), ws_sold_date_sk (type: int)
   outputColumnNames: _col0, _col1, _col2
   Statistics: Num rows: 21594638446 Data size: 
 172746300152 Basic stats: COMPLETE Column stats: COMPLETE
   Map Join Operator
 

[jira] [Commented] (HIVE-8671) Overflow in estimate row count and data size with fetch column stats

2014-10-31 Thread Mostafa Mokhtar (JIRA)

[ 
https://issues.apache.org/jira/browse/HIVE-8671?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanelfocusedCommentId=14191964#comment-14191964
 ] 

Mostafa Mokhtar commented on HIVE-8671:
---

[~sershe]
The attached patch handles the overflow in the physical planning, but not why the 
overflow happens in the first place.

 Overflow in estimate row count and data size with fetch column stats
 

 Key: HIVE-8671
 URL: https://issues.apache.org/jira/browse/HIVE-8671
 Project: Hive
  Issue Type: Bug
  Components: Physical Optimizer
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Prasanth J
Priority: Critical
 Fix For: 0.14.0

 Attachments: HIVE-8671.1.patch


 Overflow in row counts and data size for several TPC-DS queries.
 Interestingly the operators which have overflow end up running with a small 
 parallelism.
 For instance Reducer 2 has an overflow but it only runs with parallelism of 2.
 {code}
Reducer 2 
 Reduce Operator Tree:
   Group By Operator
 aggregations: sum(VALUE._col0)
 keys: KEY._col0 (type: string), KEY._col1 (type: string), 
 KEY._col2 (type: string), KEY._col3 (type: string), KEY._col4 (type: float)
 mode: mergepartial
 outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
 Statistics: Num rows: 9223372036854775807 Data size: 
 9223372036854775341 Basic stats: COMPLETE Column stats: COMPLETE
 Reduce Output Operator
   key expressions: _col3 (type: string), _col3 (type: string)
   sort order: ++
   Map-reduce partition columns: _col3 (type: string)
   Statistics: Num rows: 9223372036854775807 Data size: 
 9223372036854775341 Basic stats: COMPLETE Column stats: COMPLETE
   value expressions: _col0 (type: string), _col1 (type: 
 string), _col2 (type: string), _col3 (type: string), _col4 (type: float), 
 _col5 (type: double)
 Execution mode: vectorized
 {code}
 {code}
 VERTEX        TOTAL_TASKS  DURATION_SECONDS  CPU_TIME_MILLIS  INPUT_RECORDS  OUTPUT_RECORDS
 Map 1                  62             26.41        1,779,510    211,978,502      60,628,390
 Map 5                   1              4.28            6,950        138,098         138,098
 Map 6                   1              2.44            3,910             31              31
 Reducer 2               2             22.69           61,320     60,628,390          69,182
 Reducer 3               1              2.63            3,910         69,182             100
 Reducer 4               1              1.01            1,180            100             100
 {code}
 Query
 {code}
 explain  
 select  i_item_desc 
   ,i_category 
   ,i_class 
   ,i_current_price
   ,i_item_id
   ,sum(ws_ext_sales_price) as itemrevenue 
   ,sum(ws_ext_sales_price)*100/sum(sum(ws_ext_sales_price)) over
   (partition by i_class) as revenueratio
 from  
   web_sales
   ,item 
   ,date_dim
 where 
   web_sales.ws_item_sk = item.i_item_sk 
   and item.i_category in ('Jewelry', 'Sports', 'Books')
   and web_sales.ws_sold_date_sk = date_dim.d_date_sk
   and date_dim.d_date between '2001-01-12' and '2001-02-11'
 group by 
   i_item_id
 ,i_item_desc 
 ,i_category
 ,i_class
 ,i_current_price
 order by 
   i_category
 ,i_class
 ,i_item_id
 ,i_item_desc
 ,revenueratio
 limit 100
 {code}
 Explain 
 {code}
 STAGE PLANS:
   Stage: Stage-1
 Tez
   Edges:
 Map 1 <- Map 5 (BROADCAST_EDGE), Map 6 (BROADCAST_EDGE)
 Reducer 2 <- Map 1 (SIMPLE_EDGE)
 Reducer 3 <- Reducer 2 (SIMPLE_EDGE)
 Reducer 4 <- Reducer 3 (SIMPLE_EDGE)
   DagName: mmokhtar_20141019164343_854cb757-01bd-40cb-843e-9ada7c5e6f38:1
   Vertices:
 Map 1 
 Map Operator Tree:
 TableScan
   alias: web_sales
   filterExpr: ws_item_sk is not null (type: boolean)
   Statistics: Num rows: 21594638446 Data size: 2850189889652 
 Basic stats: COMPLETE Column stats: COMPLETE
   Filter Operator
 predicate: ws_item_sk is not null (type: boolean)
 Statistics: Num rows: 21594638446 Data size: 172746300152 
 Basic stats: COMPLETE Column stats: COMPLETE
 Select Operator
   expressions: ws_item_sk (type: int), ws_ext_sales_price 
 (type: float), ws_sold_date_sk (type: int)
   outputColumnNames: _col0, _col1, _col2
   Statistics: Num 

[jira] [Updated] (HIVE-8671) Overflow in estimate row count and data size with fetch column stats

2014-10-31 Thread Mostafa Mokhtar (JIRA)

 [ 
https://issues.apache.org/jira/browse/HIVE-8671?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Mostafa Mokhtar updated HIVE-8671:
--
Attachment: HIVE-8671.2.patch

[~sershe]

 Overflow in estimate row count and data size with fetch column stats
 

 Key: HIVE-8671
 URL: https://issues.apache.org/jira/browse/HIVE-8671
 Project: Hive
  Issue Type: Bug
  Components: Physical Optimizer
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Prasanth J
Priority: Critical
 Fix For: 0.14.0

 Attachments: HIVE-8671.1.patch, HIVE-8671.2.patch


 Overflow in row counts and data size for several TPC-DS queries.
 Interestingly the operators which have overflow end up running with a small 
 parallelism.
 For instance Reducer 2 has an overflow but it only runs with parallelism of 2.
 {code}
Reducer 2 
 Reduce Operator Tree:
   Group By Operator
 aggregations: sum(VALUE._col0)
 keys: KEY._col0 (type: string), KEY._col1 (type: string), 
 KEY._col2 (type: string), KEY._col3 (type: string), KEY._col4 (type: float)
 mode: mergepartial
 outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
 Statistics: Num rows: 9223372036854775807 Data size: 
 9223372036854775341 Basic stats: COMPLETE Column stats: COMPLETE
 Reduce Output Operator
   key expressions: _col3 (type: string), _col3 (type: string)
   sort order: ++
   Map-reduce partition columns: _col3 (type: string)
   Statistics: Num rows: 9223372036854775807 Data size: 
 9223372036854775341 Basic stats: COMPLETE Column stats: COMPLETE
   value expressions: _col0 (type: string), _col1 (type: 
 string), _col2 (type: string), _col3 (type: string), _col4 (type: float), 
 _col5 (type: double)
 Execution mode: vectorized
 {code}
 {code}
 VERTEX        TOTAL_TASKS  DURATION_SECONDS  CPU_TIME_MILLIS  INPUT_RECORDS  OUTPUT_RECORDS
 Map 1                  62             26.41        1,779,510    211,978,502      60,628,390
 Map 5                   1              4.28            6,950        138,098         138,098
 Map 6                   1              2.44            3,910             31              31
 Reducer 2               2             22.69           61,320     60,628,390          69,182
 Reducer 3               1              2.63            3,910         69,182             100
 Reducer 4               1              1.01            1,180            100             100
 {code}
 Query
 {code}
 explain  
 select  i_item_desc 
   ,i_category 
   ,i_class 
   ,i_current_price
   ,i_item_id
   ,sum(ws_ext_sales_price) as itemrevenue 
   ,sum(ws_ext_sales_price)*100/sum(sum(ws_ext_sales_price)) over
   (partition by i_class) as revenueratio
 from  
   web_sales
   ,item 
   ,date_dim
 where 
   web_sales.ws_item_sk = item.i_item_sk 
   and item.i_category in ('Jewelry', 'Sports', 'Books')
   and web_sales.ws_sold_date_sk = date_dim.d_date_sk
   and date_dim.d_date between '2001-01-12' and '2001-02-11'
 group by 
   i_item_id
 ,i_item_desc 
 ,i_category
 ,i_class
 ,i_current_price
 order by 
   i_category
 ,i_class
 ,i_item_id
 ,i_item_desc
 ,revenueratio
 limit 100
 {code}
 Explain 
 {code}
 STAGE PLANS:
   Stage: Stage-1
 Tez
   Edges:
 Map 1 <- Map 5 (BROADCAST_EDGE), Map 6 (BROADCAST_EDGE)
 Reducer 2 <- Map 1 (SIMPLE_EDGE)
 Reducer 3 <- Reducer 2 (SIMPLE_EDGE)
 Reducer 4 <- Reducer 3 (SIMPLE_EDGE)
   DagName: mmokhtar_20141019164343_854cb757-01bd-40cb-843e-9ada7c5e6f38:1
   Vertices:
 Map 1 
 Map Operator Tree:
 TableScan
   alias: web_sales
   filterExpr: ws_item_sk is not null (type: boolean)
   Statistics: Num rows: 21594638446 Data size: 2850189889652 
 Basic stats: COMPLETE Column stats: COMPLETE
   Filter Operator
 predicate: ws_item_sk is not null (type: boolean)
 Statistics: Num rows: 21594638446 Data size: 172746300152 
 Basic stats: COMPLETE Column stats: COMPLETE
 Select Operator
   expressions: ws_item_sk (type: int), ws_ext_sales_price 
 (type: float), ws_sold_date_sk (type: int)
   outputColumnNames: _col0, _col1, _col2
   Statistics: Num rows: 21594638446 Data size: 
 172746300152 Basic stats: COMPLETE Column stats: COMPLETE
   Map Join 

[jira] [Commented] (HIVE-8526) Hive : CBO incorrect join order in TPC-DS Q45 as self join selectivity has incorrect CE

2014-10-31 Thread Mostafa Mokhtar (JIRA)

[ 
https://issues.apache.org/jira/browse/HIVE-8526?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanelfocusedCommentId=14192259#comment-14192259
 ] 

Mostafa Mokhtar commented on HIVE-8526:
---

[~rhbutani] [~hagleitn] [~jpullokkaran]

Issue resolved in latest build 
{code}
STAGE PLANS:
  Stage: Stage-1
Tez
  Edges:
Map 1 <- Map 8 (BROADCAST_EDGE)
Map 6 <- Map 5 (BROADCAST_EDGE), Map 9 (BROADCAST_EDGE)
Map 9 <- Map 7 (BROADCAST_EDGE)
Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 6 (SIMPLE_EDGE)
Reducer 3 <- Reducer 2 (SIMPLE_EDGE)
Reducer 4 <- Reducer 3 (SIMPLE_EDGE)
  DagName: mmokhtar_20141031145858_4f12c0f7-13ef-46e8-9535-81b02cc8a937:1
  Vertices:
Map 1
Map Operator Tree:
TableScan
  alias: customer_address
  filterExpr: ca_address_sk is not null (type: boolean)
  Statistics: Num rows: 4000 Data size: 40595195284 Basic 
stats: COMPLETE Column stats: COMPLETE
  Filter Operator
predicate: ca_address_sk is not null (type: boolean)
Statistics: Num rows: 4000 Data size: 764000 Basic 
stats: COMPLETE Column stats: COMPLETE
Select Operator
  expressions: ca_address_sk (type: int), ca_county (type: 
string), ca_zip (type: string)
  outputColumnNames: _col0, _col1, _col2
  Statistics: Num rows: 4000 Data size: 764000 
Basic stats: COMPLETE Column stats: COMPLETE
  Map Join Operator
condition map:
 Inner Join 0 to 1
condition expressions:
  0 {_col0}
  1 {_col1} {_col2}
keys:
  0 _col1 (type: int)
  1 _col0 (type: int)
outputColumnNames: _col0, _col3, _col4
input vertices:
  0 Map 8
Statistics: Num rows: 8000 Data size: 1496000 
Basic stats: COMPLETE Column stats: COMPLETE
Reduce Output Operator
  key expressions: _col0 (type: int)
  sort order: +
  Map-reduce partition columns: _col0 (type: int)
  Statistics: Num rows: 8000 Data size: 1496000 
Basic stats: COMPLETE Column stats: COMPLETE
  value expressions: _col3 (type: string), _col4 (type: 
string)
Execution mode: vectorized
Map 5
Map Operator Tree:
TableScan
  alias: date_dim
  filterExpr: (((d_qoy = 2) and (d_year = 2000)) and d_date_sk 
is not null) (type: boolean)
  Statistics: Num rows: 73049 Data size: 81741831 Basic stats: 
COMPLETE Column stats: COMPLETE
  Filter Operator
predicate: (((d_qoy = 2) and (d_year = 2000)) and d_date_sk 
is not null) (type: boolean)
Statistics: Num rows: 635 Data size: 7620 Basic stats: 
COMPLETE Column stats: COMPLETE
Select Operator
  expressions: d_date_sk (type: int)
  outputColumnNames: _col0
  Statistics: Num rows: 635 Data size: 2540 Basic stats: 
COMPLETE Column stats: COMPLETE
  Reduce Output Operator
key expressions: _col0 (type: int)
sort order: +
Map-reduce partition columns: _col0 (type: int)
Statistics: Num rows: 635 Data size: 2540 Basic stats: 
COMPLETE Column stats: COMPLETE
  Select Operator
expressions: _col0 (type: int)
outputColumnNames: _col0
Statistics: Num rows: 635 Data size: 2540 Basic stats: 
COMPLETE Column stats: COMPLETE
Group By Operator
  keys: _col0 (type: int)
  mode: hash
  outputColumnNames: _col0
  Statistics: Num rows: 317 Data size: 1268 Basic 
stats: COMPLETE Column stats: COMPLETE
  Dynamic Partitioning Event Operator
Target Input: web_sales
Partition key expr: ws_sold_date_sk
Statistics: Num rows: 317 Data size: 1268 Basic 
stats: COMPLETE Column stats: COMPLETE
Target column: ws_sold_date_sk
Target Vertex: Map 6
Execution mode: vectorized
Map 6
Map Operator Tree:
TableScan
  alias: web_sales
   

[jira] [Created] (HIVE-8671) Overflow in estimate row count and data size with fetch column stats

2014-10-30 Thread Mostafa Mokhtar (JIRA)
Mostafa Mokhtar created HIVE-8671:
-

 Summary: Overflow in estimate row count and data size with fetch 
column stats
 Key: HIVE-8671
 URL: https://issues.apache.org/jira/browse/HIVE-8671
 Project: Hive
  Issue Type: Bug
  Components: Physical Optimizer
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Prasanth J
Priority: Critical
 Fix For: 0.14.0


Overflow in row counts and data size for several TPC-DS queries.
Interestingly the operators which have overflow end up running with a small 
parallelism.

For instance Reducer 2 has an overflow but it only runs with parallelism of 2.
{code}
   Reducer 2 
Reduce Operator Tree:
  Group By Operator
aggregations: sum(VALUE._col0)
keys: KEY._col0 (type: string), KEY._col1 (type: string), 
KEY._col2 (type: string), KEY._col3 (type: string), KEY._col4 (type: float)
mode: mergepartial
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
Statistics: Num rows: 9223372036854775807 Data size: 
9223372036854775341 Basic stats: COMPLETE Column stats: COMPLETE
Reduce Output Operator
  key expressions: _col3 (type: string), _col3 (type: string)
  sort order: ++
  Map-reduce partition columns: _col3 (type: string)
  Statistics: Num rows: 9223372036854775807 Data size: 
9223372036854775341 Basic stats: COMPLETE Column stats: COMPLETE
  value expressions: _col0 (type: string), _col1 (type: 
string), _col2 (type: string), _col3 (type: string), _col4 (type: float), _col5 
(type: double)
Execution mode: vectorized
{code}
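
For reference, 9223372036854775807 above is Long.MAX_VALUE, i.e. the row-count 
estimate has already hit the top of the long range. Below is a minimal sketch of 
saturating estimate arithmetic that avoids wrapping when counts are multiplied 
(plain Java, illustrative only; this is not Hive's actual StatsUtils code and the 
blow-up factor is a made-up input).
{code}
public class SaturatingStats {

  // Multiply two non-negative estimates; saturate at Long.MAX_VALUE instead of wrapping.
  static long safeMultiply(long a, long b) {
    try {
      return Math.multiplyExact(a, b);
    } catch (ArithmeticException overflow) {
      return Long.MAX_VALUE;
    }
  }

  public static void main(String[] args) {
    long webSalesRows = 21594638446L;   // web_sales row count from the plan below
    long blowUpFactor = 1000000000L;    // hypothetical expansion from a bad join estimate
    System.out.println(safeMultiply(webSalesRows, blowUpFactor));  // prints 9223372036854775807
  }
}
{code}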

{code}
VERTEX        TOTAL_TASKS  FAILED_TASKS  KILLED_TASKS  DURATION_SECONDS  CPU_TIME_MILLIS  GC_TIME_MILLIS  INPUT_RECORDS  OUTPUT_RECORDS
Map 1                  62             0             0             26.41        1,779,510          22,242    211,978,502      60,628,390
Map 5                   1             0             0              4.28            6,950              85        138,098         138,098
Map 6                   1             0             0              2.44            3,910              28             31              31
Reducer 2               2             0             0             22.69           61,320           1,724     60,628,390          69,182
Reducer 3               1             0             0              2.63            3,910              19         69,182             100
Reducer 4               1             0             0              1.01            1,180              33            100             100
{code}

Query
{code}
explain  
select  i_item_desc 
  ,i_category 
  ,i_class 
  ,i_current_price
  ,i_item_id
  ,sum(ws_ext_sales_price) as itemrevenue 
  ,sum(ws_ext_sales_price)*100/sum(sum(ws_ext_sales_price)) over
  (partition by i_class) as revenueratio
from
web_sales
,item 
,date_dim
where 
web_sales.ws_item_sk = item.i_item_sk 
and item.i_category in ('Jewelry', 'Sports', 'Books')
and web_sales.ws_sold_date_sk = date_dim.d_date_sk
and date_dim.d_date between '2001-01-12' and '2001-02-11'
group by 
i_item_id
,i_item_desc 
,i_category
,i_class
,i_current_price
order by 
i_category
,i_class
,i_item_id
,i_item_desc
,revenueratio
limit 100
{code}

Explain 
{code}
STAGE PLANS:
  Stage: Stage-1
Tez
  Edges:
Map 1 <- Map 5 (BROADCAST_EDGE), Map 6 (BROADCAST_EDGE)
Reducer 2 <- Map 1 (SIMPLE_EDGE)
Reducer 3 <- Reducer 2 (SIMPLE_EDGE)
Reducer 4 <- Reducer 3 (SIMPLE_EDGE)
  DagName: mmokhtar_20141019164343_854cb757-01bd-40cb-843e-9ada7c5e6f38:1
  Vertices:
Map 1 
Map Operator Tree:
TableScan
  alias: web_sales
  filterExpr: ws_item_sk is not null (type: boolean)
  Statistics: Num rows: 21594638446 Data size: 2850189889652 
Basic stats: COMPLETE Column stats: COMPLETE
  Filter Operator
predicate: ws_item_sk is not null (type: boolean)
Statistics: Num rows: 21594638446 Data size: 172746300152 
Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
  expressions: ws_item_sk (type: int), ws_ext_sales_price 
(type: float), ws_sold_date_sk (type: int)
  outputColumnNames: _col0, _col1, _col2
  Statistics: Num rows: 21594638446 Data size: 172746300152 
Basic stats: COMPLETE Column stats: COMPLETE
  Map Join Operator
condition map:
   

[jira] [Updated] (HIVE-8671) Overflow in estimate row count and data size with fetch column stats

2014-10-30 Thread Mostafa Mokhtar (JIRA)

 [ 
https://issues.apache.org/jira/browse/HIVE-8671?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Mostafa Mokhtar updated HIVE-8671:
--
Description: 
Overflow in row counts and data size for several TPC-DS queries.
Interestingly the operators which have overflow end up running with a small 
parallelism.

For instance Reducer 2 has an overflow but it only runs with parallelism of 2.
{code}
   Reducer 2 
Reduce Operator Tree:
  Group By Operator
aggregations: sum(VALUE._col0)
keys: KEY._col0 (type: string), KEY._col1 (type: string), 
KEY._col2 (type: string), KEY._col3 (type: string), KEY._col4 (type: float)
mode: mergepartial
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
Statistics: Num rows: 9223372036854775807 Data size: 
9223372036854775341 Basic stats: COMPLETE Column stats: COMPLETE
Reduce Output Operator
  key expressions: _col3 (type: string), _col3 (type: string)
  sort order: ++
  Map-reduce partition columns: _col3 (type: string)
  Statistics: Num rows: 9223372036854775807 Data size: 
9223372036854775341 Basic stats: COMPLETE Column stats: COMPLETE
  value expressions: _col0 (type: string), _col1 (type: 
string), _col2 (type: string), _col3 (type: string), _col4 (type: float), _col5 
(type: double)
Execution mode: vectorized
{code}

{code}
VERTEX        TOTAL_TASKS  DURATION_SECONDS  CPU_TIME_MILLIS  INPUT_RECORDS  OUTPUT_RECORDS
Map 1                  62             26.41        1,779,510    211,978,502      60,628,390
Map 5                   1              4.28            6,950        138,098         138,098
Map 6                   1              2.44            3,910             31              31
Reducer 2               2             22.69           61,320     60,628,390          69,182
Reducer 3               1              2.63            3,910         69,182             100
Reducer 4               1              1.01            1,180            100             100
{code}

Query
{code}
explain  
select  i_item_desc 
  ,i_category 
  ,i_class 
  ,i_current_price
  ,i_item_id
  ,sum(ws_ext_sales_price) as itemrevenue 
  ,sum(ws_ext_sales_price)*100/sum(sum(ws_ext_sales_price)) over
  (partition by i_class) as revenueratio
from
web_sales
,item 
,date_dim
where 
web_sales.ws_item_sk = item.i_item_sk 
and item.i_category in ('Jewelry', 'Sports', 'Books')
and web_sales.ws_sold_date_sk = date_dim.d_date_sk
and date_dim.d_date between '2001-01-12' and '2001-02-11'
group by 
i_item_id
,i_item_desc 
,i_category
,i_class
,i_current_price
order by 
i_category
,i_class
,i_item_id
,i_item_desc
,revenueratio
limit 100
{code}

Explain 
{code}
STAGE PLANS:
  Stage: Stage-1
Tez
  Edges:
Map 1 <- Map 5 (BROADCAST_EDGE), Map 6 (BROADCAST_EDGE)
Reducer 2 <- Map 1 (SIMPLE_EDGE)
Reducer 3 <- Reducer 2 (SIMPLE_EDGE)
Reducer 4 <- Reducer 3 (SIMPLE_EDGE)
  DagName: mmokhtar_20141019164343_854cb757-01bd-40cb-843e-9ada7c5e6f38:1
  Vertices:
Map 1 
Map Operator Tree:
TableScan
  alias: web_sales
  filterExpr: ws_item_sk is not null (type: boolean)
  Statistics: Num rows: 21594638446 Data size: 2850189889652 
Basic stats: COMPLETE Column stats: COMPLETE
  Filter Operator
predicate: ws_item_sk is not null (type: boolean)
Statistics: Num rows: 21594638446 Data size: 172746300152 
Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
  expressions: ws_item_sk (type: int), ws_ext_sales_price 
(type: float), ws_sold_date_sk (type: int)
  outputColumnNames: _col0, _col1, _col2
  Statistics: Num rows: 21594638446 Data size: 172746300152 
Basic stats: COMPLETE Column stats: COMPLETE
  Map Join Operator
condition map:
 Inner Join 0 to 1
condition expressions:
  0 {_col0} {_col1}
  1 
keys:
  0 _col2 (type: int)
  1 _col0 (type: int)
outputColumnNames: _col0, _col1
input vertices:
  1 Map 6
Statistics: Num rows: 24145061366 Data size: 
193160490928 Basic stats: COMPLETE Column stats: COMPLETE
Map Join Operator
  

[jira] [Created] (HIVE-8677) TPC-DS Q51 : fails with init not supported exception in GenericUDAFStreamingEvaluator.init

2014-10-30 Thread Mostafa Mokhtar (JIRA)
Mostafa Mokhtar created HIVE-8677:
-

 Summary: TPC-DS Q51 : fails with init not supported exception in 
GenericUDAFStreamingEvaluator.init
 Key: HIVE-8677
 URL: https://issues.apache.org/jira/browse/HIVE-8677
 Project: Hive
  Issue Type: Bug
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Gunther Hagleitner
Priority: Critical
 Fix For: 0.14.0


TPC-DS Q51 fails with the exception below 
{code}
, TaskAttempt 3 failed, info=[Error: Failure while running 
task:java.lang.RuntimeException: java.lang.RuntimeException: Reduce operator 
initialization failed
at 
org.apache.hadoop.hive.ql.exec.tez.TezProcessor.initializeAndRunProcessor(TezProcessor.java:186)
at 
org.apache.hadoop.hive.ql.exec.tez.TezProcessor.run(TezProcessor.java:138)
at 
org.apache.tez.runtime.LogicalIOProcessorRuntimeTask.run(LogicalIOProcessorRuntimeTask.java:324)
at 
org.apache.tez.runtime.task.TezTaskRunner$TaskRunnerCallable$1.run(TezTaskRunner.java:176)
at 
org.apache.tez.runtime.task.TezTaskRunner$TaskRunnerCallable$1.run(TezTaskRunner.java:168)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at 
org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1548)
at 
org.apache.tez.runtime.task.TezTaskRunner$TaskRunnerCallable.call(TezTaskRunner.java:168)
at 
org.apache.tez.runtime.task.TezTaskRunner$TaskRunnerCallable.call(TezTaskRunner.java:163)
at java.util.concurrent.FutureTask.run(FutureTask.java:262)
at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:744)
Caused by: java.lang.RuntimeException: Reduce operator initialization failed
at 
org.apache.hadoop.hive.ql.exec.tez.ReduceRecordProcessor.init(ReduceRecordProcessor.java:146)
at 
org.apache.hadoop.hive.ql.exec.tez.TezProcessor.initializeAndRunProcessor(TezProcessor.java:162)
... 13 more
Caused by: org.apache.hadoop.hive.ql.metadata.HiveException: : init not 
supported
at 
org.apache.hadoop.hive.ql.udf.generic.GenericUDAFStreamingEvaluator.init(GenericUDAFStreamingEvaluator.java:70)
at 
org.apache.hadoop.hive.ql.plan.PTFDeserializer.setupWdwFnEvaluator(PTFDeserializer.java:209)
at 
org.apache.hadoop.hive.ql.plan.PTFDeserializer.initializeWindowing(PTFDeserializer.java:130)
at 
org.apache.hadoop.hive.ql.plan.PTFDeserializer.initializePTFChain(PTFDeserializer.java:94)
at 
org.apache.hadoop.hive.ql.exec.PTFOperator.reconstructQueryDef(PTFOperator.java:144)
at 
org.apache.hadoop.hive.ql.exec.PTFOperator.initializeOp(PTFOperator.java:74)
at org.apache.hadoop.hive.ql.exec.Operator.initialize(Operator.java:385)
at org.apache.hadoop.hive.ql.exec.Operator.initialize(Operator.java:469)
at 
org.apache.hadoop.hive.ql.exec.Operator.initializeChildren(Operator.java:425)
at 
org.apache.hadoop.hive.ql.exec.ExtractOperator.initializeOp(ExtractOperator.java:40)
at org.apache.hadoop.hive.ql.exec.Operator.initialize(Operator.java:385)
at 
org.apache.hadoop.hive.ql.exec.tez.ReduceRecordProcessor.init(ReduceRecordProcessor.java:116)
... 14 more
{code}

Query
{code}
set hive.cbo.enable=true;
set hive.stats.fetch.column.stats=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.tez.auto.reducer.parallelism=true;
set hive.tez.exec.print.summary=true;
set hive.auto.convert.join.noconditionaltask.size=128000;
set hive.exec.reducers.bytes.per.reducer=1;
set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DummyTxnManager;
set hive.support.concurrency=false;
 
WITH web_v1 as (
select
  ws_item_sk item_sk, d_date, sum(ws_sales_price),
  sum(sum(ws_sales_price))
  over (partition by ws_item_sk order by d_date rows between unbounded 
preceding and current row) cume_sales
from web_sales
,date_dim
where ws_sold_date_sk=d_date_sk
  and d_month_seq between 1193 and 1193+11
  and ws_item_sk is not NULL
group by ws_item_sk, d_date),
store_v1 as (
select
  ss_item_sk item_sk, d_date, sum(ss_sales_price),
  sum(sum(ss_sales_price))
  over (partition by ss_item_sk order by d_date rows between unbounded 
preceding and current row) cume_sales
from store_sales
,date_dim
where ss_sold_date_sk=d_date_sk
  and d_month_seq between 1193 and 1193+11
  and ss_item_sk is not NULL
group by ss_item_sk, d_date)
 select  *
from (select item_sk
 ,d_date
 ,web_sales
 ,store_sales
 ,max(web_sales)
 over (partition by item_sk order by d_date rows between unbounded 
preceding and current row) web_cumulative
 ,max(store_sales)
 over 

[jira] [Commented] (HIVE-8671) Overflow in estimate row count and data size with fetch column stats

2014-10-30 Thread Mostafa Mokhtar (JIRA)

[ 
https://issues.apache.org/jira/browse/HIVE-8671?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanelfocusedCommentId=14191274#comment-14191274
 ] 

Mostafa Mokhtar commented on HIVE-8671:
---

This is where the bug is :

Since we hit an overflow before, the data size is set to Long.MAX_VALUE; then when 
we add 1 to that it overflows and the number of reducers ends up being 1.

{code}
  public static int estimateReducers(long totalInputFileSize, long 
bytesPerReducer,
  int maxReducers, boolean powersOfTwo) {

int reducers = (int) ((totalInputFileSize + bytesPerReducer - 1) / 
bytesPerReducer);
reducers = Math.max(1, reducers);
reducers = Math.min(maxReducers, reducers);
{code}

 Overflow in estimate row count and data size with fetch column stats
 

 Key: HIVE-8671
 URL: https://issues.apache.org/jira/browse/HIVE-8671
 Project: Hive
  Issue Type: Bug
  Components: Physical Optimizer
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Prasanth J
Priority: Critical
 Fix For: 0.14.0


 Overflow in row counts and data size for several TPC-DS queries.
 Interestingly the operators which have overflow end up running with a small 
 parallelism.
 For instance Reducer 2 has an overflow but it only runs with parallelism of 2.
 {code}
Reducer 2 
 Reduce Operator Tree:
   Group By Operator
 aggregations: sum(VALUE._col0)
 keys: KEY._col0 (type: string), KEY._col1 (type: string), 
 KEY._col2 (type: string), KEY._col3 (type: string), KEY._col4 (type: float)
 mode: mergepartial
 outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
 Statistics: Num rows: 9223372036854775807 Data size: 
 9223372036854775341 Basic stats: COMPLETE Column stats: COMPLETE
 Reduce Output Operator
   key expressions: _col3 (type: string), _col3 (type: string)
   sort order: ++
   Map-reduce partition columns: _col3 (type: string)
   Statistics: Num rows: 9223372036854775807 Data size: 
 9223372036854775341 Basic stats: COMPLETE Column stats: COMPLETE
   value expressions: _col0 (type: string), _col1 (type: 
 string), _col2 (type: string), _col3 (type: string), _col4 (type: float), 
 _col5 (type: double)
 Execution mode: vectorized
 {code}
 {code}
 VERTEX        TOTAL_TASKS  DURATION_SECONDS  CPU_TIME_MILLIS  INPUT_RECORDS  OUTPUT_RECORDS
 Map 1                  62             26.41        1,779,510    211,978,502      60,628,390
 Map 5                   1              4.28            6,950        138,098         138,098
 Map 6                   1              2.44            3,910             31              31
 Reducer 2               2             22.69           61,320     60,628,390          69,182
 Reducer 3               1              2.63            3,910         69,182             100
 Reducer 4               1              1.01            1,180            100             100
 {code}
 Query
 {code}
 explain  
 select  i_item_desc 
   ,i_category 
   ,i_class 
   ,i_current_price
   ,i_item_id
   ,sum(ws_ext_sales_price) as itemrevenue 
   ,sum(ws_ext_sales_price)*100/sum(sum(ws_ext_sales_price)) over
   (partition by i_class) as revenueratio
 from  
   web_sales
   ,item 
   ,date_dim
 where 
   web_sales.ws_item_sk = item.i_item_sk 
   and item.i_category in ('Jewelry', 'Sports', 'Books')
   and web_sales.ws_sold_date_sk = date_dim.d_date_sk
   and date_dim.d_date between '2001-01-12' and '2001-02-11'
 group by 
   i_item_id
 ,i_item_desc 
 ,i_category
 ,i_class
 ,i_current_price
 order by 
   i_category
 ,i_class
 ,i_item_id
 ,i_item_desc
 ,revenueratio
 limit 100
 {code}
 Explain 
 {code}
 STAGE PLANS:
   Stage: Stage-1
 Tez
   Edges:
 Map 1 <- Map 5 (BROADCAST_EDGE), Map 6 (BROADCAST_EDGE)
 Reducer 2 <- Map 1 (SIMPLE_EDGE)
 Reducer 3 <- Reducer 2 (SIMPLE_EDGE)
 Reducer 4 <- Reducer 3 (SIMPLE_EDGE)
   DagName: mmokhtar_20141019164343_854cb757-01bd-40cb-843e-9ada7c5e6f38:1
   Vertices:
 Map 1 
 Map Operator Tree:
 TableScan
   alias: web_sales
   filterExpr: ws_item_sk is not null (type: boolean)
   Statistics: Num rows: 21594638446 Data size: 2850189889652 
 Basic stats: COMPLETE Column stats: COMPLETE
   Filter Operator
 predicate: ws_item_sk is not null (type: boolean)
 Statistics: Num rows: 21594638446 Data size: 

[jira] [Commented] (HIVE-8671) Overflow in estimate row count and data size with fetch column stats

2014-10-30 Thread Mostafa Mokhtar (JIRA)

[ 
https://issues.apache.org/jira/browse/HIVE-8671?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanelfocusedCommentId=14191285#comment-14191285
 ] 

Mostafa Mokhtar commented on HIVE-8671:
---

For the code below the query had the following inputs:
totalInputFileSize = 9223372036854775341
bytesPerReducer = 1

9223372036854775341 + 1 -> Overflow.
{code}
  public static int estimateReducers(long totalInputFileSize, long 
bytesPerReducer,
  int maxReducers, boolean powersOfTwo) {

int reducers = (int) ((totalInputFileSize + bytesPerReducer - 1) / 
bytesPerReducer);
reducers = Math.max(1, reducers);
reducers = Math.min(maxReducers, reducers);
{code}

I recommend changing to 
{code}
int reducers = (int) (Math.max(totalInputFileSize, bytesPerReducer) / bytesPerReducer);
{code}
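
To make the failure mode concrete, here is a minimal standalone sketch (plain Java; 
the maxReducers value is an illustrative assumption and this is not the actual Hive 
patch) that reproduces the truncation with the inputs above and shows a guarded 
variant that clamps in long arithmetic before narrowing to int:
{code}
public class ReducerEstimateOverflowDemo {

  // Guarded variant: keep the arithmetic in long, clamp to [1, maxReducers],
  // and only then narrow to int, so the cast can no longer truncate.
  static int estimateReducersSafe(long totalInputFileSize, long bytesPerReducer, int maxReducers) {
    long needed = (totalInputFileSize + bytesPerReducer - 1) / bytesPerReducer;
    if (needed < 0) {                          // the long addition wrapped around
      needed = Long.MAX_VALUE / bytesPerReducer;
    }
    return (int) Math.min(maxReducers, Math.max(1, needed));
  }

  public static void main(String[] args) {
    long totalInputFileSize = 9223372036854775341L;  // data size from the plan above
    long bytesPerReducer = 1L;                       // value quoted above
    int maxReducers = 1009;                          // illustrative ceiling

    // Current formula: the (int) cast happens before clamping, so the huge
    // quotient truncates to a negative int and the estimate collapses to 1.
    int reducers = (int) ((totalInputFileSize + bytesPerReducer - 1) / bytesPerReducer);
    System.out.println("after (int) cast : " + reducers);   // -467
    reducers = Math.max(1, reducers);
    reducers = Math.min(maxReducers, reducers);
    System.out.println("current estimate : " + reducers);   // 1

    System.out.println("guarded estimate : "
        + estimateReducersSafe(totalInputFileSize, bytesPerReducer, maxReducers)); // 1009
  }
}
{code}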

 Overflow in estimate row count and data size with fetch column stats
 

 Key: HIVE-8671
 URL: https://issues.apache.org/jira/browse/HIVE-8671
 Project: Hive
  Issue Type: Bug
  Components: Physical Optimizer
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Prasanth J
Priority: Critical
 Fix For: 0.14.0


 Overflow in row counts and data size for several TPC-DS queries.
 Interestingly the operators which have overflow end up running with a small 
 parallelism.
 For instance Reducer 2 has an overflow but it only runs with parallelism of 2.
 {code}
Reducer 2 
 Reduce Operator Tree:
   Group By Operator
 aggregations: sum(VALUE._col0)
 keys: KEY._col0 (type: string), KEY._col1 (type: string), 
 KEY._col2 (type: string), KEY._col3 (type: string), KEY._col4 (type: float)
 mode: mergepartial
 outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
 Statistics: Num rows: 9223372036854775807 Data size: 
 9223372036854775341 Basic stats: COMPLETE Column stats: COMPLETE
 Reduce Output Operator
   key expressions: _col3 (type: string), _col3 (type: string)
   sort order: ++
   Map-reduce partition columns: _col3 (type: string)
   Statistics: Num rows: 9223372036854775807 Data size: 
 9223372036854775341 Basic stats: COMPLETE Column stats: COMPLETE
   value expressions: _col0 (type: string), _col1 (type: 
 string), _col2 (type: string), _col3 (type: string), _col4 (type: float), 
 _col5 (type: double)
 Execution mode: vectorized
 {code}
 {code}
 VERTEX        TOTAL_TASKS  DURATION_SECONDS  CPU_TIME_MILLIS  INPUT_RECORDS  OUTPUT_RECORDS
 Map 1                  62             26.41        1,779,510    211,978,502      60,628,390
 Map 5                   1              4.28            6,950        138,098         138,098
 Map 6                   1              2.44            3,910             31              31
 Reducer 2               2             22.69           61,320     60,628,390          69,182
 Reducer 3               1              2.63            3,910         69,182             100
 Reducer 4               1              1.01            1,180            100             100
 {code}
 Query
 {code}
 explain  
 select  i_item_desc 
   ,i_category 
   ,i_class 
   ,i_current_price
   ,i_item_id
   ,sum(ws_ext_sales_price) as itemrevenue 
   ,sum(ws_ext_sales_price)*100/sum(sum(ws_ext_sales_price)) over
   (partition by i_class) as revenueratio
 from  
   web_sales
   ,item 
   ,date_dim
 where 
   web_sales.ws_item_sk = item.i_item_sk 
   and item.i_category in ('Jewelry', 'Sports', 'Books')
   and web_sales.ws_sold_date_sk = date_dim.d_date_sk
   and date_dim.d_date between '2001-01-12' and '2001-02-11'
 group by 
   i_item_id
 ,i_item_desc 
 ,i_category
 ,i_class
 ,i_current_price
 order by 
   i_category
 ,i_class
 ,i_item_id
 ,i_item_desc
 ,revenueratio
 limit 100
 {code}
 Explain 
 {code}
 STAGE PLANS:
   Stage: Stage-1
 Tez
   Edges:
 Map 1 <- Map 5 (BROADCAST_EDGE), Map 6 (BROADCAST_EDGE)
 Reducer 2 <- Map 1 (SIMPLE_EDGE)
 Reducer 3 <- Reducer 2 (SIMPLE_EDGE)
 Reducer 4 <- Reducer 3 (SIMPLE_EDGE)
   DagName: mmokhtar_20141019164343_854cb757-01bd-40cb-843e-9ada7c5e6f38:1
   Vertices:
 Map 1 
 Map Operator Tree:
 TableScan
   alias: web_sales
   filterExpr: ws_item_sk is not null (type: boolean)
   Statistics: Num rows: 21594638446 Data size: 2850189889652 
 Basic stats: COMPLETE Column stats: COMPLETE
   Filter Operator
  

[jira] [Updated] (HIVE-8495) Add progress bar for Hive on Tez queries

2014-10-29 Thread Mostafa Mokhtar (JIRA)

 [ 
https://issues.apache.org/jira/browse/HIVE-8495?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Mostafa Mokhtar updated HIVE-8495:
--
Attachment: HIVE-8495.4.patch

 Add progress bar for Hive on Tez queries
 

 Key: HIVE-8495
 URL: https://issues.apache.org/jira/browse/HIVE-8495
 Project: Hive
  Issue Type: Bug
  Components: CLI
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Mostafa Mokhtar
 Fix For: 0.14.0

 Attachments: HIVE-8495.1.patch, HIVE-8495.2.patch, HIVE-8495.3.patch, 
 HIVE-8495.4.patch, Screen Shot 2014-10-16 at 9.35.26 PM.png, Screen Shot 
 2014-10-22 at 11.48.57 AM.png


 Build a Progress bar to provide overall progress on running tasks.
 Progress is calculated as : 
  (Completed tasks) / (Total number of tasks)
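
As a rough illustration of that formula (plain Java; the class and method names are 
made up and this is not the actual Hive/Tez monitoring code), overall progress can 
be computed by summing completed and total task counts across all vertices of the DAG:
{code}
import java.util.Map;

public class OverallProgressSketch {

  // vertexTasks maps a vertex name to {completedTasks, totalTasks}.
  static double overallProgress(Map<String, int[]> vertexTasks) {
    long completed = 0, total = 0;
    for (int[] t : vertexTasks.values()) {
      completed += t[0];
      total += t[1];
    }
    return total == 0 ? 0.0 : (double) completed / total;
  }

  public static void main(String[] args) {
    Map<String, int[]> dag = Map.of(
        "Map 1",     new int[] {62, 62},
        "Reducer 2", new int[] {1, 2});
    System.out.printf("%.0f%%%n", overallProgress(dag) * 100);  // prints 98%
  }
}
{code}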





[jira] [Updated] (HIVE-8495) Add progress bar for Hive on Tez queries

2014-10-29 Thread Mostafa Mokhtar (JIRA)

 [ 
https://issues.apache.org/jira/browse/HIVE-8495?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Mostafa Mokhtar updated HIVE-8495:
--
Attachment: HIVE-8495.5.patch

[~prasanth_j]

 Add progress bar for Hive on Tez queries
 

 Key: HIVE-8495
 URL: https://issues.apache.org/jira/browse/HIVE-8495
 Project: Hive
  Issue Type: Bug
  Components: CLI
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Mostafa Mokhtar
 Fix For: 0.14.0

 Attachments: HIVE-8495.1.patch, HIVE-8495.2.patch, HIVE-8495.3.patch, 
 HIVE-8495.4.patch, HIVE-8495.5.patch, Screen Shot 2014-10-16 at 9.35.26 
 PM.png, Screen Shot 2014-10-22 at 11.48.57 AM.png


 Build a Progress bar to provide overall progress on running tasks.
 Progress is calculated as : 
  (Completed tasks) / (Total number of tasks)





[jira] [Updated] (HIVE-8495) Add progress bar for Hive on Tez queries

2014-10-29 Thread Mostafa Mokhtar (JIRA)

 [ 
https://issues.apache.org/jira/browse/HIVE-8495?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Mostafa Mokhtar updated HIVE-8495:
--
Attachment: HIVE-8495.5.patch

 Add progress bar for Hive on Tez queries
 

 Key: HIVE-8495
 URL: https://issues.apache.org/jira/browse/HIVE-8495
 Project: Hive
  Issue Type: Bug
  Components: CLI
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Mostafa Mokhtar
 Fix For: 0.14.0

 Attachments: HIVE-8495.1.patch, HIVE-8495.2.patch, HIVE-8495.3.patch, 
 HIVE-8495.4.patch, HIVE-8495.5.patch, Screen Shot 2014-10-16 at 9.35.26 
 PM.png, Screen Shot 2014-10-22 at 11.48.57 AM.png


 Build a Progress bar to provide overall progress on running tasks.
 Progress is calculated as : 
  (Completed tasks) / (Total number of tasks)





[jira] [Updated] (HIVE-8495) Add progress bar for Hive on Tez queries

2014-10-29 Thread Mostafa Mokhtar (JIRA)

 [ 
https://issues.apache.org/jira/browse/HIVE-8495?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Mostafa Mokhtar updated HIVE-8495:
--
Attachment: (was: HIVE-8495.5.patch)

 Add progress bar for Hive on Tez queries
 

 Key: HIVE-8495
 URL: https://issues.apache.org/jira/browse/HIVE-8495
 Project: Hive
  Issue Type: Bug
  Components: CLI
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Mostafa Mokhtar
 Fix For: 0.14.0

 Attachments: HIVE-8495.1.patch, HIVE-8495.2.patch, HIVE-8495.3.patch, 
 HIVE-8495.4.patch, HIVE-8495.5.patch, Screen Shot 2014-10-16 at 9.35.26 
 PM.png, Screen Shot 2014-10-22 at 11.48.57 AM.png


 Build a Progress bar to provide overall progress on running tasks.
 Progress is calculated as : 
  (Completed tasks) / (Total number of tasks)





[jira] [Updated] (HIVE-8495) Add progress bar for Hive on Tez queries

2014-10-28 Thread Mostafa Mokhtar (JIRA)

 [ 
https://issues.apache.org/jira/browse/HIVE-8495?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Mostafa Mokhtar updated HIVE-8495:
--
Attachment: HIVE-8495.2.patch

 Add progress bar for Hive on Tez queries
 

 Key: HIVE-8495
 URL: https://issues.apache.org/jira/browse/HIVE-8495
 Project: Hive
  Issue Type: Bug
  Components: CLI
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Mostafa Mokhtar
 Fix For: 0.14.0

 Attachments: HIVE-8495.1.patch, HIVE-8495.2.patch, Screen Shot 
 2014-10-16 at 9.35.26 PM.png, Screen Shot 2014-10-22 at 11.48.57 AM.png


 Build a Progress bar to provide overall progress on running tasks.
 Progress is calculated as : 
  (Completed tasks) / (Total number of tasks)





[jira] [Updated] (HIVE-8495) Add progress bar for Hive on Tez queries

2014-10-28 Thread Mostafa Mokhtar (JIRA)

 [ 
https://issues.apache.org/jira/browse/HIVE-8495?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Mostafa Mokhtar updated HIVE-8495:
--
Attachment: (was: HIVE-8495.2.patch)

 Add progress bar for Hive on Tez queries
 

 Key: HIVE-8495
 URL: https://issues.apache.org/jira/browse/HIVE-8495
 Project: Hive
  Issue Type: Bug
  Components: CLI
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Mostafa Mokhtar
 Fix For: 0.14.0

 Attachments: HIVE-8495.1.patch, Screen Shot 2014-10-16 at 9.35.26 
 PM.png, Screen Shot 2014-10-22 at 11.48.57 AM.png


 Build a Progress bar to provide overall progress on running tasks.
 Progress is calculated as : 
  (Completed tasks) / (Total number of tasks)





[jira] [Updated] (HIVE-8495) Add progress bar for Hive on Tez queries

2014-10-28 Thread Mostafa Mokhtar (JIRA)

 [ 
https://issues.apache.org/jira/browse/HIVE-8495?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Mostafa Mokhtar updated HIVE-8495:
--
Attachment: HIVE-8495.2.patch

 Add progress bar for Hive on Tez queries
 

 Key: HIVE-8495
 URL: https://issues.apache.org/jira/browse/HIVE-8495
 Project: Hive
  Issue Type: Bug
  Components: CLI
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Mostafa Mokhtar
 Fix For: 0.14.0

 Attachments: HIVE-8495.1.patch, HIVE-8495.2.patch, Screen Shot 
 2014-10-16 at 9.35.26 PM.png, Screen Shot 2014-10-22 at 11.48.57 AM.png


 Build a Progress bar to provide overall progress on running tasks.
 Progress is calculated as : 
  (Completed tasks) / (Total number of tasks)





[jira] [Updated] (HIVE-8495) Add progress bar for Hive on Tez queries

2014-10-28 Thread Mostafa Mokhtar (JIRA)

 [ 
https://issues.apache.org/jira/browse/HIVE-8495?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Mostafa Mokhtar updated HIVE-8495:
--
Attachment: HIVE-8495.3.patch

 Add progress bar for Hive on Tez queries
 

 Key: HIVE-8495
 URL: https://issues.apache.org/jira/browse/HIVE-8495
 Project: Hive
  Issue Type: Bug
  Components: CLI
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Mostafa Mokhtar
 Fix For: 0.14.0

 Attachments: HIVE-8495.1.patch, HIVE-8495.2.patch, HIVE-8495.3.patch, 
 Screen Shot 2014-10-16 at 9.35.26 PM.png, Screen Shot 2014-10-22 at 11.48.57 
 AM.png


 Build a Progress bar to provide overall progress on running tasks.
 Progress is calculated as : 
  (Completed tasks) / (Total number of tasks)





[jira] [Updated] (HIVE-8495) Add progress bar for Hive on Tez queries

2014-10-28 Thread Mostafa Mokhtar (JIRA)

 [ 
https://issues.apache.org/jira/browse/HIVE-8495?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Mostafa Mokhtar updated HIVE-8495:
--
Attachment: HIVE-8495.3.patch

 Add progress bar for Hive on Tez queries
 

 Key: HIVE-8495
 URL: https://issues.apache.org/jira/browse/HIVE-8495
 Project: Hive
  Issue Type: Bug
  Components: CLI
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Mostafa Mokhtar
 Fix For: 0.14.0

 Attachments: HIVE-8495.1.patch, HIVE-8495.2.patch, HIVE-8495.3.patch, 
 Screen Shot 2014-10-16 at 9.35.26 PM.png, Screen Shot 2014-10-22 at 11.48.57 
 AM.png


 Build a Progress bar to provide overall progress on running tasks.
 Progress is calculated as : 
  (Completed tasks) / (Total number of tasks)





[jira] [Updated] (HIVE-8495) Add progress bar for Hive on Tez queries

2014-10-28 Thread Mostafa Mokhtar (JIRA)

 [ 
https://issues.apache.org/jira/browse/HIVE-8495?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Mostafa Mokhtar updated HIVE-8495:
--
Attachment: (was: HIVE-8495.3.patch)

 Add progress bar for Hive on Tez queries
 

 Key: HIVE-8495
 URL: https://issues.apache.org/jira/browse/HIVE-8495
 Project: Hive
  Issue Type: Bug
  Components: CLI
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Mostafa Mokhtar
 Fix For: 0.14.0

 Attachments: HIVE-8495.1.patch, HIVE-8495.2.patch, HIVE-8495.3.patch, 
 Screen Shot 2014-10-16 at 9.35.26 PM.png, Screen Shot 2014-10-22 at 11.48.57 
 AM.png


 Build a Progress bar to provide overall progress on running tasks.
 Progress is calculated as : 
  (Completed tasks) / (Total number of tasks)





[jira] [Commented] (HIVE-8526) Hive : CBO incorrect join order in TPC-DS Q45 as self join selectivity has incorrect CE

2014-10-27 Thread Mostafa Mokhtar (JIRA)

[ 
https://issues.apache.org/jira/browse/HIVE-8526?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanelfocusedCommentId=14186374#comment-14186374
 ] 

Mostafa Mokhtar commented on HIVE-8526:
---

[~rhbutani]
With the patch applied, the wrong join order is still generated

{code}
2014-10-28 01:08:59,970 DEBUG [main]: parse.SemanticAnalyzer 
(SemanticAnalyzer.java:apply(12601)) - Plan After Join Reordering:
HiveSortRel(fetch=[100]): rowcount = 319777.48643073987, cumulative cost = 
{2.4340839576240274E8 rows, 639555.9728614797 cpu, 0.0 io}, id = 616
  HiveSortRel(sort0=[$0], sort1=[$1], dir0=[ASC], dir1=[ASC]): rowcount = 
319777.48643073987, cumulative cost = {1.9476877928923708E8 rows, 
319778.48643073987 cpu, 0.0 io}, id = 614
HiveProjectRel(ca_zip=[$0], ca_county=[$1], _o__c2=[$2]): rowcount = 
319777.48643073987, cumulative cost = {1.4612916281607142E8 rows, 1.0 cpu, 0.0 
io}, id = 612
  HiveAggregateRel(group=[{0, 1}], agg#0=[sum($2)]): rowcount = 
319777.48643073987, cumulative cost = {1.4612916281607142E8 rows, 1.0 cpu, 0.0 
io}, id = 610
HiveProjectRel($f0=[$2], $f1=[$1], $f2=[$0]): rowcount = 
8.94102670242874, cumulative cost = {1.4612916281607142E8 rows, 1.0 cpu, 0.0 
io}, id = 608
  HiveProjectRel(ws_sales_price=[$2], ca_county=[$7], ca_zip=[$8]): 
rowcount = 8.94102670242874, cumulative cost = {1.4612916281607142E8 rows, 1.0 
cpu, 0.0 io}, id = 606
SemiJoinRel(condition=[=($13, $14)], joinType=[inner]): rowcount = 
8.94102670242874, cumulative cost = {1.4612916281607142E8 rows, 1.0 cpu, 0.0 
io}, id = 604
  HiveProjectRel(ws_item_sk=[$5], ws_bill_customer_sk=[$6], 
ws_sales_price=[$7], ws_sold_date_sk=[$8], c_customer_sk=[$0], 
c_current_addr_sk=[$1], ca_address_sk=[$2], ca_county=[$3], ca_zip=[$4], 
d_date_sk=[$9], d_year=[$10], d_qoy=[$11], i_item_sk=[$12], i_item_id=[$13]): 
rowcount = 429169.2817165796, cumulative cost = {1.4612916181607142E8 rows, 0.0 
cpu, 0.0 io}, id = 703
HiveJoinRel(condition=[=($6, $0)], joinType=[inner]): rowcount 
= 429169.2817165796, cumulative cost = {1.4612916181607142E8 rows, 0.0 cpu, 0.0 
io}, id = 701
  HiveJoinRel(condition=[=($1, $2)], joinType=[inner]): 
rowcount = 160.0, cumulative cost = {240.0 rows, 0.0 cpu, 0.0 io}, id = 
634
HiveProjectRel(c_customer_sk=[$0], c_current_addr_sk=[$4]): 
rowcount = 160.0, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 537
  
HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_200_orig.customer]]): 
rowcount = 160.0, cumulative cost = {0}, id = 360
HiveProjectRel(ca_address_sk=[$0], ca_county=[$7], 
ca_zip=[$9]): rowcount = 80.0, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 
io}, id = 584
  
HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_200_orig.customer_address]]):
 rowcount = 80.0, cumulative cost = {0}, id = 356
  HiveJoinRel(condition=[=($0, $7)], joinType=[inner]): 
rowcount = 257083.68571428573, cumulative cost = {1.4427207813035715E8 rows, 
0.0 cpu, 0.0 io}, id = 699
HiveJoinRel(condition=[=($3, $4)], joinType=[inner]): 
rowcount = 257083.68571428573, cumulative cost = {1.439669964287E8 rows, 
0.0 cpu, 0.0 io}, id = 637
  HiveProjectRel(ws_item_sk=[$2], ws_bill_customer_sk=[$3], 
ws_sales_price=[$20], ws_sold_date_sk=[$33]): rowcount = 1.43966864E8, 
cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 534

HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_200_orig.web_sales]]): 
rowcount = 1.43966864E8, cumulative cost = {0}, id = 358
  HiveProjectRel(d_date_sk=[$0], d_year=[$6], d_qoy=[$10]): 
rowcount = 130.44464285714287, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, 
id = 590
HiveFilterRel(condition=[AND(=($10, 2), =($6, 2000))]): 
rowcount = 130.44464285714287, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, 
id = 588
  
HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_200_orig.date_dim]]): 
rowcount = 73049.0, cumulative cost = {0}, id = 359
HiveProjectRel(i_item_sk=[$0], i_item_id=[$1]): rowcount = 
48000.0, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 594
  
HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_200_orig.item]]): rowcount = 
48000.0, cumulative cost = {0}, id = 357
  HiveProjectRel(i_item_id=[$1]): rowcount = 1.0, cumulative cost = 
{0.0 rows, 0.0 cpu, 0.0 io}, id = 602
HiveProjectRel(i_item_sk=[$0], i_item_id=[$1]): rowcount = 1.0, 
cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 600
  HiveFilterRel(condition=[in($0, 2, 3, 5, 7, 11, 13, 17, 19, 
23, 29)]): rowcount = 1.0, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 
598


[jira] [Updated] (HIVE-8495) Add progress bar for Hive on Tez queries

2014-10-25 Thread Mostafa Mokhtar (JIRA)

 [ 
https://issues.apache.org/jira/browse/HIVE-8495?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Mostafa Mokhtar updated HIVE-8495:
--
Attachment: HIVE-8495.2.patch

 Add progress bar for Hive on Tez queries
 

 Key: HIVE-8495
 URL: https://issues.apache.org/jira/browse/HIVE-8495
 Project: Hive
  Issue Type: Bug
  Components: CLI
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Mostafa Mokhtar
 Fix For: 0.14.0

 Attachments: HIVE-8495.1.patch, HIVE-8495.2.patch, Screen Shot 
 2014-10-16 at 9.35.26 PM.png, Screen Shot 2014-10-22 at 11.48.57 AM.png


 Build a Progress bar to provide overall progress on running tasks.
 Progress is calculated as : 
  (Completed tasks) / (Total number of tasks)





[jira] [Updated] (HIVE-8495) Add progress bar for Hive on Tez queries

2014-10-25 Thread Mostafa Mokhtar (JIRA)

 [ 
https://issues.apache.org/jira/browse/HIVE-8495?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Mostafa Mokhtar updated HIVE-8495:
--
Attachment: (was: HIVE-8495.2.patch)

 Add progress bar for Hive on Tez queries
 

 Key: HIVE-8495
 URL: https://issues.apache.org/jira/browse/HIVE-8495
 Project: Hive
  Issue Type: Bug
  Components: CLI
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Mostafa Mokhtar
 Fix For: 0.14.0

 Attachments: HIVE-8495.1.patch, Screen Shot 2014-10-16 at 9.35.26 
 PM.png, Screen Shot 2014-10-22 at 11.48.57 AM.png


 Build a Progress bar to provide overall progress on running tasks.
 Progress is calculated as : 
  (Completed tasks) / (Total number of tasks)





[jira] [Commented] (HIVE-8517) When joining on partition column NDV gets overridden by StatsUtils.getColStatisticsFromExpression

2014-10-23 Thread Mostafa Mokhtar (JIRA)

[ 
https://issues.apache.org/jira/browse/HIVE-8517?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanelfocusedCommentId=14181561#comment-14181561
 ] 

Mostafa Mokhtar commented on HIVE-8517:
---

[~vikram.dixit] [~hagleitn]
Failure is unrelated.



 When joining on partition column NDV gets overridden by 
 StatsUtils.getColStatisticsFromExpression
 -

 Key: HIVE-8517
 URL: https://issues.apache.org/jira/browse/HIVE-8517
 Project: Hive
  Issue Type: Bug
  Components: Physical Optimizer
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Mostafa Mokhtar
Priority: Critical
 Fix For: 0.14.0

 Attachments: HIVE-8517.1.patch, HIVE-8517.2.patch, HIVE-8517.3.patch


 When joining on partition column number of partitions is used as NDV which 
 gets overridden by StatsUtils.getColStatisticsFromExpression and the number 
 of partitions used as NDV is replaced by number of rows which results in the 
 same behavior as explained in 
 https://issues.apache.org/jira/browse/HIVE-8196. Joining on partition 
 columns with fetch column stats enabled results in a very small CE (cardinality 
 estimate), which negatively affects query performance. 
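
To see why an inflated NDV collapses the estimate, consider the textbook equi-join 
cardinality formula |R join S| ~= |R| * |S| / max(ndv_R(key), ndv_S(key)). A minimal 
sketch (plain Java; the store_sales row count is taken from the plan below, while the 
partition count and qualifying date_dim row count are illustrative assumptions):
{code}
public class JoinCardinalitySketch {

  // Textbook equi-join estimate: |R join S| ~= |R| * |S| / max(ndv of the key on each side).
  static long estimateJoinRows(long rowsLeft, long rowsRight, long ndvLeft, long ndvRight) {
    return (long) ((double) rowsLeft / Math.max(ndvLeft, ndvRight) * rowsRight);
  }

  public static void main(String[] args) {
    long storeSalesRows = 82510879939L;  // store_sales row count from the plan below
    long dateDimRows    = 36524L;        // illustrative qualifying date_dim rows
    long partitionNdv   = 1823L;         // hypothetical: NDV = number of partitions

    // NDV kept as the number of partitions: the estimate stays in the billions.
    System.out.println(estimateJoinRows(storeSalesRows, dateDimRows, partitionNdv, dateDimRows));
    // NDV overridden by the row count: the estimate collapses to the dimension size.
    System.out.println(estimateJoinRows(storeSalesRows, dateDimRows, storeSalesRows, dateDimRows));
  }
}
{code}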
 This is the call stack.
 {code}
 StatsUtils.getColStatisticsFromExpression(HiveConf, Statistics, ExprNodeDesc) 
 line: 1001  
 StatsRulesProcFactory$ReduceSinkStatsRule.process(Node, StackNode, 
 NodeProcessorCtx, Object...) line: 1479  
 DefaultRuleDispatcher.dispatch(Node, StackNode, Object...) line: 90 
 PreOrderWalker(DefaultGraphWalker).dispatchAndReturn(Node, StackNode) line: 
 94  
 PreOrderWalker(DefaultGraphWalker).dispatch(Node, StackNode) line: 78   
 PreOrderWalker.walk(Node) line: 54
 PreOrderWalker.walk(Node) line: 59
 PreOrderWalker.walk(Node) line: 59
 PreOrderWalker(DefaultGraphWalker).startWalking(CollectionNode, 
 HashMapNode,Object) line: 109 
 AnnotateWithStatistics.transform(ParseContext) line: 78   
 TezCompiler.runStatsAnnotation(OptimizeTezProcContext) line: 248  
 TezCompiler.optimizeOperatorPlan(ParseContext, SetReadEntity, 
 SetWriteEntity) line: 120   
 TezCompiler(TaskCompiler).compile(ParseContext, ListTaskSerializable, 
 HashSetReadEntity, HashSetWriteEntity) line: 99 
 SemanticAnalyzer.analyzeInternal(ASTNode) line: 10037 
 SemanticAnalyzer(BaseSemanticAnalyzer).analyze(ASTNode, Context) line: 221
 ExplainSemanticAnalyzer.analyzeInternal(ASTNode) line: 74 
 ExplainSemanticAnalyzer(BaseSemanticAnalyzer).analyze(ASTNode, Context) line: 
 221 
 Driver.compile(String, boolean) line: 415 
 {code}
 Query
 {code}
 select
   ss_item_sk item_sk, d_date, sum(ss_sales_price),
   sum(sum(ss_sales_price))
   over (partition by ss_item_sk order by d_date rows between unbounded 
 preceding and current row) cume_sales
 from store_sales
 ,date_dim
 where ss_sold_date_sk=d_date_sk
   and d_month_seq between 1193 and 1193+11
   and ss_item_sk is not NULL
 group by ss_item_sk, d_date
 {code}
 Plan 
 Notice in the Map Join operator that the number of rows drops from 82,510,879,939 
 to 36,524 after the join.
 {code}
 OK
 STAGE DEPENDENCIES:
   Stage-1 is a root stage
   Stage-0 depends on stages: Stage-1
 STAGE PLANS:
   Stage: Stage-1
 Tez
   Edges:
 Map 1 <- Map 4 (BROADCAST_EDGE)
 Reducer 2 <- Map 1 (SIMPLE_EDGE)
 Reducer 3 <- Reducer 2 (SIMPLE_EDGE)
   DagName: mmokhtar_20141019131818_086d663a-5621-456c-bf25-8ccb7112ee3b:6
   Vertices:
 Map 1
 Map Operator Tree:
 TableScan
   alias: store_sales
   filterExpr: ss_item_sk is not null (type: boolean)
   Statistics: Num rows: 82510879939 Data size: 6873789738208 
 Basic stats: COMPLETE Column stats: COMPLETE
   Filter Operator
 predicate: ss_item_sk is not null (type: boolean)
 Statistics: Num rows: 82510879939 Data size: 652315818272 
 Basic stats: COMPLETE Column stats: COMPLETE
 Map Join Operator
   condition map:
Inner Join 0 to 1
   condition expressions:
 0 {ss_item_sk} {ss_sales_price} {ss_sold_date_sk}
 1 {d_date_sk} {d_date} {d_month_seq}
   keys:
 0 ss_sold_date_sk (type: int)
 1 d_date_sk (type: int)
   outputColumnNames: _col1, _col12, _col22, _col26, 
 _col28, _col29
   input vertices:
 1 Map 4
   Statistics: Num rows: 36524 Data size: 4163736 Basic 
 stats: COMPLETE Column stats: COMPLETE
   Filter Operator
  

[jira] [Updated] (HIVE-8495) Add progress bar for Hive on Tez queries

2014-10-22 Thread Mostafa Mokhtar (JIRA)

 [ 
https://issues.apache.org/jira/browse/HIVE-8495?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Mostafa Mokhtar updated HIVE-8495:
--
Attachment: Screen Shot 2014-10-22 at 11.48.57 AM.png

[~hagleitn] [~gopalv]

For queries with a large number of vertices I can aggregate the vertices by 
status, as shown in the image; this provides a less cluttered view of query 
progress.
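
A minimal sketch of that aggregation (plain Java; the status names and classes are 
illustrative, not the actual Hive/Tez monitoring API), collapsing per-vertex progress 
into one count per status so that a DAG with dozens of vertices prints as a single line:
{code}
import java.util.EnumMap;
import java.util.Map;

public class VertexStatusSummary {

  enum Status { INITED, RUNNING, SUCCEEDED, FAILED, KILLED }

  // Count how many vertices are in each status and render them on one line.
  static String summarize(Map<String, Status> vertexStatus) {
    Map<Status, Integer> counts = new EnumMap<>(Status.class);
    for (Status s : vertexStatus.values()) {
      counts.merge(s, 1, Integer::sum);
    }
    StringBuilder line = new StringBuilder();
    counts.forEach((status, n) -> line.append(status).append(": ").append(n).append("  "));
    return line.toString().trim();
  }

  public static void main(String[] args) {
    Map<String, Status> dag = Map.of(
        "Map 1", Status.SUCCEEDED,
        "Map 5", Status.SUCCEEDED,
        "Reducer 2", Status.RUNNING,
        "Reducer 3", Status.INITED);
    System.out.println(summarize(dag));  // e.g. "INITED: 1  RUNNING: 1  SUCCEEDED: 2"
  }
}
{code}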

 Add progress bar for Hive on Tez queries
 

 Key: HIVE-8495
 URL: https://issues.apache.org/jira/browse/HIVE-8495
 Project: Hive
  Issue Type: Bug
  Components: CLI
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Mostafa Mokhtar
 Fix For: 0.14.0

 Attachments: HIVE-8495.1.patch, Screen Shot 2014-10-16 at 9.35.26 
 PM.png, Screen Shot 2014-10-22 at 11.48.57 AM.png


 Build a Progress bar to provide overall progress on running tasks.
 Progress is calculated as : 
  (Completed tasks) / (Total number of tasks)





[jira] [Updated] (HIVE-8517) When joining on partition column NDV gets overridden by StatsUtils.getColStatisticsFromExpression

2014-10-21 Thread Mostafa Mokhtar (JIRA)

 [ 
https://issues.apache.org/jira/browse/HIVE-8517?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Mostafa Mokhtar updated HIVE-8517:
--
Attachment: HIVE-8517.3.patch

Added the average column length to the partitioned column stats, as it was missing.

 When joining on partition column NDV gets overridden by 
 StatsUtils.getColStatisticsFromExpression
 -

 Key: HIVE-8517
 URL: https://issues.apache.org/jira/browse/HIVE-8517
 Project: Hive
  Issue Type: Bug
  Components: Physical Optimizer
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Mostafa Mokhtar
Priority: Critical
 Fix For: 0.14.0

 Attachments: HIVE-8517.1.patch, HIVE-8517.2.patch, HIVE-8517.3.patch


 When joining on partition column number of partitions is used as NDV which 
 gets overridden by StatsUtils.getColStatisticsFromExpression and the number 
 of partitions used as NDV is replaced by number of rows which results in the 
 same behavior as explained in 
 https://issues.apache.org/jira/browse/HIVE-8196. Joining on partition 
 columns with fetch column stats enabled results in a very small CE (cardinality 
 estimate), which negatively affects query performance. 
 This is the call stack.
 {code}
 StatsUtils.getColStatisticsFromExpression(HiveConf, Statistics, ExprNodeDesc) 
 line: 1001  
 StatsRulesProcFactory$ReduceSinkStatsRule.process(Node, Stack<Node>, 
 NodeProcessorCtx, Object...) line: 1479
 DefaultRuleDispatcher.dispatch(Node, Stack<Node>, Object...) line: 90
 PreOrderWalker(DefaultGraphWalker).dispatchAndReturn(Node, Stack<Node>) line: 
 94
 PreOrderWalker(DefaultGraphWalker).dispatch(Node, Stack<Node>) line: 78
 PreOrderWalker.walk(Node) line: 54
 PreOrderWalker.walk(Node) line: 59
 PreOrderWalker.walk(Node) line: 59
 PreOrderWalker(DefaultGraphWalker).startWalking(Collection<Node>, 
 HashMap<Node,Object>) line: 109
 AnnotateWithStatistics.transform(ParseContext) line: 78
 TezCompiler.runStatsAnnotation(OptimizeTezProcContext) line: 248
 TezCompiler.optimizeOperatorPlan(ParseContext, Set<ReadEntity>, 
 Set<WriteEntity>) line: 120
 TezCompiler(TaskCompiler).compile(ParseContext, List<Task<Serializable>>, 
 HashSet<ReadEntity>, HashSet<WriteEntity>) line: 99
 SemanticAnalyzer.analyzeInternal(ASTNode) line: 10037 
 SemanticAnalyzer(BaseSemanticAnalyzer).analyze(ASTNode, Context) line: 221
 ExplainSemanticAnalyzer.analyzeInternal(ASTNode) line: 74 
 ExplainSemanticAnalyzer(BaseSemanticAnalyzer).analyze(ASTNode, Context) line: 
 221 
 Driver.compile(String, boolean) line: 415 
 {code}
 Query
 {code}
 select
   ss_item_sk item_sk, d_date, sum(ss_sales_price),
   sum(sum(ss_sales_price))
   over (partition by ss_item_sk order by d_date rows between unbounded 
 preceding and current row) cume_sales
 from store_sales
 ,date_dim
 where ss_sold_date_sk=d_date_sk
   and d_month_seq between 1193 and 1193+11
   and ss_item_sk is not NULL
 group by ss_item_sk, d_date
 {code}
 Plan 
 Notice in the Map Join operator that the number of rows drops from 
 82,510,879,939 to 36,524 after the join.
 {code}
 OK
 STAGE DEPENDENCIES:
   Stage-1 is a root stage
   Stage-0 depends on stages: Stage-1
 STAGE PLANS:
   Stage: Stage-1
 Tez
   Edges:
 Map 1 - Map 4 (BROADCAST_EDGE)
 Reducer 2 - Map 1 (SIMPLE_EDGE)
 Reducer 3 - Reducer 2 (SIMPLE_EDGE)
   DagName: mmokhtar_20141019131818_086d663a-5621-456c-bf25-8ccb7112ee3b:6
   Vertices:
 Map 1
 Map Operator Tree:
 TableScan
   alias: store_sales
   filterExpr: ss_item_sk is not null (type: boolean)
   Statistics: Num rows: 82510879939 Data size: 6873789738208 
 Basic stats: COMPLETE Column stats: COMPLETE
   Filter Operator
 predicate: ss_item_sk is not null (type: boolean)
 Statistics: Num rows: 82510879939 Data size: 652315818272 
 Basic stats: COMPLETE Column stats: COMPLETE
 Map Join Operator
   condition map:
Inner Join 0 to 1
   condition expressions:
 0 {ss_item_sk} {ss_sales_price} {ss_sold_date_sk}
 1 {d_date_sk} {d_date} {d_month_seq}
   keys:
 0 ss_sold_date_sk (type: int)
 1 d_date_sk (type: int)
   outputColumnNames: _col1, _col12, _col22, _col26, 
 _col28, _col29
   input vertices:
 1 Map 4
   Statistics: Num rows: 36524 Data size: 4163736 Basic 
 stats: COMPLETE Column stats: COMPLETE
   Filter Operator
   

[jira] [Updated] (HIVE-8517) When joining on partition column NDV gets overridden by StatsUtils.getColStatisticsFromExpression

2014-10-21 Thread Mostafa Mokhtar (JIRA)

 [ 
https://issues.apache.org/jira/browse/HIVE-8517?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Mostafa Mokhtar updated HIVE-8517:
--
Status: Patch Available  (was: In Progress)

New patch that calculates the average column length, which was missing from the 
partitioned column statistics.

 When joining on partition column NDV gets overridden by 
 StatsUtils.getColStatisticsFromExpression
 -

 Key: HIVE-8517
 URL: https://issues.apache.org/jira/browse/HIVE-8517
 Project: Hive
  Issue Type: Bug
  Components: Physical Optimizer
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Mostafa Mokhtar
Priority: Critical
 Fix For: 0.14.0

 Attachments: HIVE-8517.1.patch, HIVE-8517.2.patch, HIVE-8517.3.patch


 When joining on a partition column, the number of partitions is used as the 
 NDV, but StatsUtils.getColStatisticsFromExpression overrides it and replaces 
 it with the number of rows, which results in the same behavior as explained 
 in https://issues.apache.org/jira/browse/HIVE-8196. Joining on partition 
 columns with fetch column stats enabled therefore results in a very small 
 cardinality estimate (CE), which negatively affects query performance.
 This is the call stack.
 {code}
 StatsUtils.getColStatisticsFromExpression(HiveConf, Statistics, ExprNodeDesc) 
 line: 1001  
 StatsRulesProcFactory$ReduceSinkStatsRule.process(Node, Stack<Node>, 
 NodeProcessorCtx, Object...) line: 1479
 DefaultRuleDispatcher.dispatch(Node, Stack<Node>, Object...) line: 90
 PreOrderWalker(DefaultGraphWalker).dispatchAndReturn(Node, Stack<Node>) line: 
 94
 PreOrderWalker(DefaultGraphWalker).dispatch(Node, Stack<Node>) line: 78
 PreOrderWalker.walk(Node) line: 54
 PreOrderWalker.walk(Node) line: 59
 PreOrderWalker.walk(Node) line: 59
 PreOrderWalker(DefaultGraphWalker).startWalking(Collection<Node>, 
 HashMap<Node,Object>) line: 109
 AnnotateWithStatistics.transform(ParseContext) line: 78
 TezCompiler.runStatsAnnotation(OptimizeTezProcContext) line: 248
 TezCompiler.optimizeOperatorPlan(ParseContext, Set<ReadEntity>, 
 Set<WriteEntity>) line: 120
 TezCompiler(TaskCompiler).compile(ParseContext, List<Task<Serializable>>, 
 HashSet<ReadEntity>, HashSet<WriteEntity>) line: 99
 SemanticAnalyzer.analyzeInternal(ASTNode) line: 10037 
 SemanticAnalyzer(BaseSemanticAnalyzer).analyze(ASTNode, Context) line: 221
 ExplainSemanticAnalyzer.analyzeInternal(ASTNode) line: 74 
 ExplainSemanticAnalyzer(BaseSemanticAnalyzer).analyze(ASTNode, Context) line: 
 221 
 Driver.compile(String, boolean) line: 415 
 {code}
 Query
 {code}
 select
   ss_item_sk item_sk, d_date, sum(ss_sales_price),
   sum(sum(ss_sales_price))
   over (partition by ss_item_sk order by d_date rows between unbounded 
 preceding and current row) cume_sales
 from store_sales
 ,date_dim
 where ss_sold_date_sk=d_date_sk
   and d_month_seq between 1193 and 1193+11
   and ss_item_sk is not NULL
 group by ss_item_sk, d_date
 {code}
 Plan 
 Notice in the Map Join operator that the number of rows drops from 
 82,510,879,939 to 36,524 after the join.
 {code}
 OK
 STAGE DEPENDENCIES:
   Stage-1 is a root stage
   Stage-0 depends on stages: Stage-1
 STAGE PLANS:
   Stage: Stage-1
 Tez
   Edges:
 Map 1 - Map 4 (BROADCAST_EDGE)
 Reducer 2 - Map 1 (SIMPLE_EDGE)
 Reducer 3 - Reducer 2 (SIMPLE_EDGE)
   DagName: mmokhtar_20141019131818_086d663a-5621-456c-bf25-8ccb7112ee3b:6
   Vertices:
 Map 1
 Map Operator Tree:
 TableScan
   alias: store_sales
   filterExpr: ss_item_sk is not null (type: boolean)
   Statistics: Num rows: 82510879939 Data size: 6873789738208 
 Basic stats: COMPLETE Column stats: COMPLETE
   Filter Operator
 predicate: ss_item_sk is not null (type: boolean)
 Statistics: Num rows: 82510879939 Data size: 652315818272 
 Basic stats: COMPLETE Column stats: COMPLETE
 Map Join Operator
   condition map:
Inner Join 0 to 1
   condition expressions:
 0 {ss_item_sk} {ss_sales_price} {ss_sold_date_sk}
 1 {d_date_sk} {d_date} {d_month_seq}
   keys:
 0 ss_sold_date_sk (type: int)
 1 d_date_sk (type: int)
   outputColumnNames: _col1, _col12, _col22, _col26, 
 _col28, _col29
   input vertices:
 1 Map 4
   Statistics: Num rows: 36524 Data size: 4163736 Basic 
 stats: COMPLETE Column stats: COMPLETE

[jira] [Updated] (HIVE-8517) When joining on partition column NDV gets overridden by StatsUtils.getColStatisticsFromExpression

2014-10-21 Thread Mostafa Mokhtar (JIRA)

 [ 
https://issues.apache.org/jira/browse/HIVE-8517?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Mostafa Mokhtar updated HIVE-8517:
--
Status: In Progress  (was: Patch Available)

 When joining on partition column NDV gets overridden by 
 StatsUtils.getColStatisticsFromExpression
 -

 Key: HIVE-8517
 URL: https://issues.apache.org/jira/browse/HIVE-8517
 Project: Hive
  Issue Type: Bug
  Components: Physical Optimizer
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Mostafa Mokhtar
Priority: Critical
 Fix For: 0.14.0

 Attachments: HIVE-8517.1.patch, HIVE-8517.2.patch, HIVE-8517.3.patch


 When joining on a partition column, the number of partitions is used as the 
 NDV, but StatsUtils.getColStatisticsFromExpression overrides it and replaces 
 it with the number of rows, which results in the same behavior as explained 
 in https://issues.apache.org/jira/browse/HIVE-8196. Joining on partition 
 columns with fetch column stats enabled therefore results in a very small 
 cardinality estimate (CE), which negatively affects query performance.
 This is the call stack.
 {code}
 StatsUtils.getColStatisticsFromExpression(HiveConf, Statistics, ExprNodeDesc) 
 line: 1001  
 StatsRulesProcFactory$ReduceSinkStatsRule.process(Node, Stack<Node>, 
 NodeProcessorCtx, Object...) line: 1479
 DefaultRuleDispatcher.dispatch(Node, Stack<Node>, Object...) line: 90
 PreOrderWalker(DefaultGraphWalker).dispatchAndReturn(Node, Stack<Node>) line: 
 94
 PreOrderWalker(DefaultGraphWalker).dispatch(Node, Stack<Node>) line: 78
 PreOrderWalker.walk(Node) line: 54
 PreOrderWalker.walk(Node) line: 59
 PreOrderWalker.walk(Node) line: 59
 PreOrderWalker(DefaultGraphWalker).startWalking(Collection<Node>, 
 HashMap<Node,Object>) line: 109
 AnnotateWithStatistics.transform(ParseContext) line: 78
 TezCompiler.runStatsAnnotation(OptimizeTezProcContext) line: 248
 TezCompiler.optimizeOperatorPlan(ParseContext, Set<ReadEntity>, 
 Set<WriteEntity>) line: 120
 TezCompiler(TaskCompiler).compile(ParseContext, List<Task<Serializable>>, 
 HashSet<ReadEntity>, HashSet<WriteEntity>) line: 99
 SemanticAnalyzer.analyzeInternal(ASTNode) line: 10037 
 SemanticAnalyzer(BaseSemanticAnalyzer).analyze(ASTNode, Context) line: 221
 ExplainSemanticAnalyzer.analyzeInternal(ASTNode) line: 74 
 ExplainSemanticAnalyzer(BaseSemanticAnalyzer).analyze(ASTNode, Context) line: 
 221 
 Driver.compile(String, boolean) line: 415 
 {code}
 Query
 {code}
 select
   ss_item_sk item_sk, d_date, sum(ss_sales_price),
   sum(sum(ss_sales_price))
   over (partition by ss_item_sk order by d_date rows between unbounded 
 preceding and current row) cume_sales
 from store_sales
 ,date_dim
 where ss_sold_date_sk=d_date_sk
   and d_month_seq between 1193 and 1193+11
   and ss_item_sk is not NULL
 group by ss_item_sk, d_date
 {code}
 Plan 
 Notice in the Map Join operator that the number of rows drops from 
 82,510,879,939 to 36,524 after the join.
 {code}
 OK
 STAGE DEPENDENCIES:
   Stage-1 is a root stage
   Stage-0 depends on stages: Stage-1
 STAGE PLANS:
   Stage: Stage-1
 Tez
   Edges:
 Map 1 - Map 4 (BROADCAST_EDGE)
 Reducer 2 - Map 1 (SIMPLE_EDGE)
 Reducer 3 - Reducer 2 (SIMPLE_EDGE)
   DagName: mmokhtar_20141019131818_086d663a-5621-456c-bf25-8ccb7112ee3b:6
   Vertices:
 Map 1
 Map Operator Tree:
 TableScan
   alias: store_sales
   filterExpr: ss_item_sk is not null (type: boolean)
   Statistics: Num rows: 82510879939 Data size: 6873789738208 
 Basic stats: COMPLETE Column stats: COMPLETE
   Filter Operator
 predicate: ss_item_sk is not null (type: boolean)
 Statistics: Num rows: 82510879939 Data size: 652315818272 
 Basic stats: COMPLETE Column stats: COMPLETE
 Map Join Operator
   condition map:
Inner Join 0 to 1
   condition expressions:
 0 {ss_item_sk} {ss_sales_price} {ss_sold_date_sk}
 1 {d_date_sk} {d_date} {d_month_seq}
   keys:
 0 ss_sold_date_sk (type: int)
 1 d_date_sk (type: int)
   outputColumnNames: _col1, _col12, _col22, _col26, 
 _col28, _col29
   input vertices:
 1 Map 4
   Statistics: Num rows: 36524 Data size: 4163736 Basic 
 stats: COMPLETE Column stats: COMPLETE
   Filter Operator
 predicate: (((_col22 = _col26) and _col29 BETWEEN 
 1193 

[jira] [Created] (HIVE-8524) When table is renamed stats are lost as changes are not propagated to metastore tables TAB_COL_STATS and PART_COL_STATS

2014-10-20 Thread Mostafa Mokhtar (JIRA)
Mostafa Mokhtar created HIVE-8524:
-

 Summary: When table is renamed stats are lost as changes are not 
propagated to metastore tables TAB_COL_STATS and PART_COL_STATS 
 Key: HIVE-8524
 URL: https://issues.apache.org/jira/browse/HIVE-8524
 Project: Hive
  Issue Type: Bug
  Components: Metastore
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Gunther Hagleitner
 Fix For: 0.14.0


When a Hive table is renamed, the name is not updated in TAB_COL_STATS and 
PART_COL_STATS.

Repro:
1) Create table t1
2) Insert rows
3) Analyze table t1 compute statistics for columns;
4) set hive.stats.fetch.column.stats=true;
5) Explain select * from t1 where c1 < x;
6) ALTER TABLE t1 RENAME TO t2;
7) Explain select * from t2 where c1 < x; /* stats will be missing */
8) Query the metastore tables to validate (see the sketch below)
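
A hedged sketch of that validation step, run directly against the metastore RDBMS (the JDBC URL, driver, and credentials below are placeholders; TAB_COL_STATS with its TABLE_NAME/DB_NAME columns is the table discussed below):

{code}
// Sketch only: count column-stats rows under the old and new table names.
// Connection details are placeholders for your metastore database.
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;

public class MetastoreStatsCheck {
  public static void main(String[] args) throws Exception {
    try (Connection conn = DriverManager.getConnection(
             "jdbc:mysql://metastore-host/hive", "hive", "password");
         PreparedStatement ps = conn.prepareStatement(
             "SELECT TABLE_NAME, COUNT(*) FROM TAB_COL_STATS "
                 + "WHERE DB_NAME = ? AND TABLE_NAME IN (?, ?) GROUP BY TABLE_NAME")) {
      ps.setString(1, "default");
      ps.setString(2, "t1"); // old name: rows still here after the rename show the bug
      ps.setString(3, "t2"); // new name: rows should have moved here
      try (ResultSet rs = ps.executeQuery()) {
        while (rs.next()) {
          System.out.println(rs.getString(1) + " -> " + rs.getLong(2) + " stats rows");
        }
      }
    }
  }
}
{code}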

According to the documentation, the metastore should be updated:
{code}
This statement lets you change the name of a table to a different name.
As of version 0.6, a rename on a managed table moves its HDFS location as well. 
(Older Hive versions just renamed the table in the metastore without moving the 
HDFS location.)
{code}

Another related issue is that the schema of the stats tables is not consistent 
with TBLS and DBS: those two tables are normalized, while TAB_COL_STATS and 
PART_COL_STATS have TABLE_NAME and DB_NAME denormalized in them.





[jira] [Created] (HIVE-8526) Hive : CBO incorrect join order in TPC-DS Q45 as self join selectivity has incorrect CE

2014-10-20 Thread Mostafa Mokhtar (JIRA)
Mostafa Mokhtar created HIVE-8526:
-

 Summary: Hive : CBO incorrect join order in TPC-DS Q45 as self 
join selectivity has incorrect CE
 Key: HIVE-8526
 URL: https://issues.apache.org/jira/browse/HIVE-8526
 Project: Hive
  Issue Type: Bug
  Components: CBO
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Harish Butani
 Fix For: 0.14.0


The join order has the item table joined last, whereas it should be joined first.

Query 
{code}
select  ca_zip, ca_county, sum(ws_sales_price)
 from
web_sales
JOIN customer ON web_sales.ws_bill_customer_sk = customer.c_customer_sk
JOIN customer_address ON customer.c_current_addr_sk = 
customer_address.ca_address_sk 
JOIN date_dim ON web_sales.ws_sold_date_sk = date_dim.d_date_sk
JOIN item ON web_sales.ws_item_sk = item.i_item_sk 
 where
( item.i_item_id in (select i_item_id
 from item i2
 where i2.i_item_sk in (2, 3, 5, 7, 11, 13, 17, 19, 
23, 29)
 )
)
and d_qoy = 2 and d_year = 2000
 group by ca_zip, ca_county
 order by ca_zip, ca_county
 limit 100
{code}
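
As a back-of-the-envelope check of why item belongs early in the join order (a sketch using the row counts from the plan below; it assumes, for simplicity, that i_item_id maps roughly 1:1 to i_item_sk, so treat the numbers as order-of-magnitude only):

{code}
// Rough selectivity estimate for the IN list on item in TPC-DS Q45.
public class Q45SelectivitySketch {
  public static void main(String[] args) {
    double itemRows = 462_000.0;           // item rowcount from the plan below
    double webSalesRows = 2.1594638446E10; // web_sales rowcount from the plan below
    double qualifyingItems = 10.0;         // literals in the IN list above

    double itemSelectivity = qualifyingItems / itemRows; // ~2.2e-5
    double webSalesAfterItemJoin = webSalesRows * itemSelectivity;

    System.out.printf("item selectivity ~ %.2e%n", itemSelectivity);
    System.out.printf("web_sales rows after joining item first ~ %.0f%n",
        webSalesAfterItemJoin);
    // ~5e5 rows instead of ~2.2e10, so joining item (and the i_item_id
    // semi-join) first shrinks the input dramatically.
  }
}
{code}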

Plan
{code}
2014-10-20 18:43:16,521 DEBUG [main]: parse.SemanticAnalyzer 
(SemanticAnalyzer.java:apply(12330)) - HiveSortRel(fetch=[100]): rowcount = 
1.710158597922807E7, cumulative cost = {7.169080587598123E10 rows, 
3.420317295845614E7 cpu, 0.0 io}, id = 579
  HiveSortRel(sort0=[$0], sort1=[$1], dir0=[ASC], dir1=[ASC]): rowcount = 
1.710158597922807E7, cumulative cost = {6.827294821015483E10 rows, 
1.710158697922807E7 cpu, 0.0 io}, id = 577
HiveProjectRel(ca_zip=[$0], ca_county=[$1], _o__c2=[$2]): rowcount = 
1.710158597922807E7, cumulative cost = {6.485509054432843E10 rows, 1.0 cpu, 0.0 
io}, id = 575
  HiveAggregateRel(group=[{0, 1}], agg#0=[sum($2)]): rowcount = 
1.710158597922807E7, cumulative cost = {6.485509054432843E10 rows, 1.0 cpu, 0.0 
io}, id = 573
HiveProjectRel($f0=[$2], $f1=[$1], $f2=[$0]): rowcount = 
6.0197670310147226E7, cumulative cost = {6.485509054432843E10 rows, 1.0 cpu, 
0.0 io}, id = 571
  HiveProjectRel(ws_sales_price=[$2], ca_county=[$7], ca_zip=[$8]): 
rowcount = 6.0197670310147226E7, cumulative cost = {6.485509054432843E10 rows, 
1.0 cpu, 0.0 io}, id = 569
HiveFilterRel(condition=[AND(=($11, 2), =($10, 2000))]): rowcount = 
6.0197670310147226E7, cumulative cost = {6.485509054432843E10 rows, 1.0 cpu, 
0.0 io}, id = 567
  SemiJoinRel(condition=[=($13, $14)], joinType=[inner]): rowcount 
= 3.371069537368245E10, cumulative cost = {6.485509054432843E10 rows, 1.0 cpu, 
0.0 io}, id = 565
HiveProjectRel(ws_item_sk=[$0], ws_bill_customer_sk=[$1], 
ws_sales_price=[$2], ws_sold_date_sk=[$3], c_customer_sk=[$9], 
c_current_addr_sk=[$10], ca_address_sk=[$11], ca_county=[$12], ca_zip=[$13], 
d_date_sk=[$6], d_year=[$7], d_qoy=[$8], i_item_sk=[$4], i_item_id=[$5]): 
rowcount = 3.371069537368245E10, cumulative cost = {6.485509054332843E10 rows, 
0.0 cpu, 0.0 io}, id = 669
  HiveJoinRel(condition=[=($1, $9)], joinType=[inner]): 
rowcount = 3.371069537368245E10, cumulative cost = {6.485509054332843E10 rows, 
0.0 cpu, 0.0 io}, id = 667
HiveJoinRel(condition=[=($3, $6)], joinType=[inner]): 
rowcount = 2.1594638446E10, cumulative cost = {4.3189811941E10 rows, 0.0 cpu, 
0.0 io}, id = 664
  HiveJoinRel(condition=[=($0, $4)], joinType=[inner]): 
rowcount = 2.1594638446E10, cumulative cost = {2.1595100446E10 rows, 0.0 cpu, 
0.0 io}, id = 601
HiveProjectRel(ws_item_sk=[$2], 
ws_bill_customer_sk=[$3], ws_sales_price=[$20], ws_sold_date_sk=[$33]): 
rowcount = 2.1594638446E10, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 
497
  
HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_3.web_sales]]): rowcount 
= 2.1594638446E10, cumulative cost = {0}, id = 341
HiveProjectRel(i_item_sk=[$0], i_item_id=[$1]): 
rowcount = 462000.0, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 555
  
HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_3.item]]): rowcount = 
462000.0, cumulative cost = {0}, id = 340
  HiveProjectRel(d_date_sk=[$0], d_year=[$6], d_qoy=[$10]): 
rowcount = 73049.0, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 551

HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_3.date_dim]]): rowcount 
= 73049.0, cumulative cost = {0}, id = 342
HiveJoinRel(condition=[=($1, $2)], joinType=[inner]): 
rowcount = 7.064015632843196E7, cumulative cost = {1.2E8 rows, 0.0 cpu, 0.0 
io}, id = 598
  HiveProjectRel(c_customer_sk=[$0], 
c_current_addr_sk=[$4]): rowcount = 8.0E7, cumulative cost = {0.0 rows, 0.0 
cpu, 0.0 io}, id = 500
   

[jira] [Updated] (HIVE-8526) Hive : CBO incorrect join order in TPC-DS Q45 as self join selectivity has incorrect CE

2014-10-20 Thread Mostafa Mokhtar (JIRA)

 [ 
https://issues.apache.org/jira/browse/HIVE-8526?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Mostafa Mokhtar updated HIVE-8526:
--
Description: 
The join order has the item table joined last, whereas it should be joined first.

Query 
{code}
select  ca_zip, ca_county, sum(ws_sales_price)
 from
web_sales
JOIN customer ON web_sales.ws_bill_customer_sk = customer.c_customer_sk
JOIN customer_address ON customer.c_current_addr_sk = 
customer_address.ca_address_sk 
JOIN date_dim ON web_sales.ws_sold_date_sk = date_dim.d_date_sk
JOIN item ON web_sales.ws_item_sk = item.i_item_sk 
 where
( item.i_item_id in (select i_item_id
 from item i2
 where i2.i_item_sk in (2, 3, 5, 7, 11, 13, 17, 19, 
23, 29)
 )
)
and d_qoy = 2 and d_year = 2000
 group by ca_zip, ca_county
 order by ca_zip, ca_county
 limit 100
{code}

Plan
{code}
2014-10-20 18:43:16,521 DEBUG [main]: parse.SemanticAnalyzer 
(SemanticAnalyzer.java:apply(12330)) - HiveSortRel(fetch=[100]): rowcount = 
1.710158597922807E7, cumulative cost = {7.169080587598123E10 rows, 
3.420317295845614E7 cpu, 0.0 io}, id = 579
  HiveSortRel(sort0=[$0], sort1=[$1], dir0=[ASC], dir1=[ASC]): rowcount = 
1.710158597922807E7, cumulative cost = {6.827294821015483E10 rows, 
1.710158697922807E7 cpu, 0.0 io}, id = 577
HiveProjectRel(ca_zip=[$0], ca_county=[$1], _o__c2=[$2]): rowcount = 
1.710158597922807E7, cumulative cost = {6.485509054432843E10 rows, 1.0 cpu, 0.0 
io}, id = 575
  HiveAggregateRel(group=[{0, 1}], agg#0=[sum($2)]): rowcount = 
1.710158597922807E7, cumulative cost = {6.485509054432843E10 rows, 1.0 cpu, 0.0 
io}, id = 573
HiveProjectRel($f0=[$2], $f1=[$1], $f2=[$0]): rowcount = 
6.0197670310147226E7, cumulative cost = {6.485509054432843E10 rows, 1.0 cpu, 
0.0 io}, id = 571
  HiveProjectRel(ws_sales_price=[$2], ca_county=[$7], ca_zip=[$8]): 
rowcount = 6.0197670310147226E7, cumulative cost = {6.485509054432843E10 rows, 
1.0 cpu, 0.0 io}, id = 569
HiveFilterRel(condition=[AND(=($11, 2), =($10, 2000))]): rowcount = 
6.0197670310147226E7, cumulative cost = {6.485509054432843E10 rows, 1.0 cpu, 
0.0 io}, id = 567
  SemiJoinRel(condition=[=($13, $14)], joinType=[inner]): rowcount 
= 3.371069537368245E10, cumulative cost = {6.485509054432843E10 rows, 1.0 cpu, 
0.0 io}, id = 565
HiveProjectRel(ws_item_sk=[$0], ws_bill_customer_sk=[$1], 
ws_sales_price=[$2], ws_sold_date_sk=[$3], c_customer_sk=[$9], 
c_current_addr_sk=[$10], ca_address_sk=[$11], ca_county=[$12], ca_zip=[$13], 
d_date_sk=[$6], d_year=[$7], d_qoy=[$8], i_item_sk=[$4], i_item_id=[$5]): 
rowcount = 3.371069537368245E10, cumulative cost = {6.485509054332843E10 rows, 
0.0 cpu, 0.0 io}, id = 669
  HiveJoinRel(condition=[=($1, $9)], joinType=[inner]): 
rowcount = 3.371069537368245E10, cumulative cost = {6.485509054332843E10 rows, 
0.0 cpu, 0.0 io}, id = 667
HiveJoinRel(condition=[=($3, $6)], joinType=[inner]): 
rowcount = 2.1594638446E10, cumulative cost = {4.3189811941E10 rows, 0.0 cpu, 
0.0 io}, id = 664
  HiveJoinRel(condition=[=($0, $4)], joinType=[inner]): 
rowcount = 2.1594638446E10, cumulative cost = {2.1595100446E10 rows, 0.0 cpu, 
0.0 io}, id = 601
HiveProjectRel(ws_item_sk=[$2], 
ws_bill_customer_sk=[$3], ws_sales_price=[$20], ws_sold_date_sk=[$33]): 
rowcount = 2.1594638446E10, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 
497
  
HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_3.web_sales]]): rowcount 
= 2.1594638446E10, cumulative cost = {0}, id = 341
HiveProjectRel(i_item_sk=[$0], i_item_id=[$1]): 
rowcount = 462000.0, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 555
  
HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_3.item]]): rowcount = 
462000.0, cumulative cost = {0}, id = 340
  HiveProjectRel(d_date_sk=[$0], d_year=[$6], d_qoy=[$10]): 
rowcount = 73049.0, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 551

HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_3.date_dim]]): rowcount 
= 73049.0, cumulative cost = {0}, id = 342
HiveJoinRel(condition=[=($1, $2)], joinType=[inner]): 
rowcount = 7.064015632843196E7, cumulative cost = {1.2E8 rows, 0.0 cpu, 0.0 
io}, id = 598
  HiveProjectRel(c_customer_sk=[$0], 
c_current_addr_sk=[$4]): rowcount = 8.0E7, cumulative cost = {0.0 rows, 0.0 
cpu, 0.0 io}, id = 500

HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_3.customer]]): rowcount 
= 8.0E7, cumulative cost = {0}, id = 343
  HiveProjectRel(ca_address_sk=[$0], ca_county=[$7], 
ca_zip=[$9]): rowcount = 4.0E7, cumulative cost = {0.0 rows, 0.0 cpu, 

[jira] [Updated] (HIVE-8517) When joining on partition column NDV gets overridden by StatsUtils.getColStatisticsFromExpression

2014-10-20 Thread Mostafa Mokhtar (JIRA)

 [ 
https://issues.apache.org/jira/browse/HIVE-8517?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Mostafa Mokhtar updated HIVE-8517:
--
Attachment: HIVE-8517.2.patch

 When joining on partition column NDV gets overridden by 
 StatsUtils.getColStatisticsFromExpression
 -

 Key: HIVE-8517
 URL: https://issues.apache.org/jira/browse/HIVE-8517
 Project: Hive
  Issue Type: Bug
  Components: Physical Optimizer
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Mostafa Mokhtar
Priority: Critical
 Fix For: 0.14.0

 Attachments: HIVE-8517.1.patch, HIVE-8517.2.patch


 When joining on a partition column, the number of partitions is used as the 
 NDV, but StatsUtils.getColStatisticsFromExpression overrides it and replaces 
 it with the number of rows, which results in the same behavior as explained 
 in https://issues.apache.org/jira/browse/HIVE-8196. Joining on partition 
 columns with fetch column stats enabled therefore results in a very small 
 cardinality estimate (CE), which negatively affects query performance.
 This is the call stack.
 {code}
 StatsUtils.getColStatisticsFromExpression(HiveConf, Statistics, ExprNodeDesc) 
 line: 1001  
 StatsRulesProcFactory$ReduceSinkStatsRule.process(Node, Stack<Node>, 
 NodeProcessorCtx, Object...) line: 1479
 DefaultRuleDispatcher.dispatch(Node, Stack<Node>, Object...) line: 90
 PreOrderWalker(DefaultGraphWalker).dispatchAndReturn(Node, Stack<Node>) line: 
 94
 PreOrderWalker(DefaultGraphWalker).dispatch(Node, Stack<Node>) line: 78
 PreOrderWalker.walk(Node) line: 54
 PreOrderWalker.walk(Node) line: 59
 PreOrderWalker.walk(Node) line: 59
 PreOrderWalker(DefaultGraphWalker).startWalking(Collection<Node>, 
 HashMap<Node,Object>) line: 109
 AnnotateWithStatistics.transform(ParseContext) line: 78
 TezCompiler.runStatsAnnotation(OptimizeTezProcContext) line: 248
 TezCompiler.optimizeOperatorPlan(ParseContext, Set<ReadEntity>, 
 Set<WriteEntity>) line: 120
 TezCompiler(TaskCompiler).compile(ParseContext, List<Task<Serializable>>, 
 HashSet<ReadEntity>, HashSet<WriteEntity>) line: 99
 SemanticAnalyzer.analyzeInternal(ASTNode) line: 10037 
 SemanticAnalyzer(BaseSemanticAnalyzer).analyze(ASTNode, Context) line: 221
 ExplainSemanticAnalyzer.analyzeInternal(ASTNode) line: 74 
 ExplainSemanticAnalyzer(BaseSemanticAnalyzer).analyze(ASTNode, Context) line: 
 221 
 Driver.compile(String, boolean) line: 415 
 {code}
 Query
 {code}
 select
   ss_item_sk item_sk, d_date, sum(ss_sales_price),
   sum(sum(ss_sales_price))
   over (partition by ss_item_sk order by d_date rows between unbounded 
 preceding and current row) cume_sales
 from store_sales
 ,date_dim
 where ss_sold_date_sk=d_date_sk
   and d_month_seq between 1193 and 1193+11
   and ss_item_sk is not NULL
 group by ss_item_sk, d_date
 {code}
 Plan 
 Notice in the Map Join operator that the number of rows drops from 
 82,510,879,939 to 36,524 after the join.
 {code}
 OK
 STAGE DEPENDENCIES:
   Stage-1 is a root stage
   Stage-0 depends on stages: Stage-1
 STAGE PLANS:
   Stage: Stage-1
 Tez
   Edges:
 Map 1 - Map 4 (BROADCAST_EDGE)
 Reducer 2 - Map 1 (SIMPLE_EDGE)
 Reducer 3 - Reducer 2 (SIMPLE_EDGE)
   DagName: mmokhtar_20141019131818_086d663a-5621-456c-bf25-8ccb7112ee3b:6
   Vertices:
 Map 1
 Map Operator Tree:
 TableScan
   alias: store_sales
   filterExpr: ss_item_sk is not null (type: boolean)
   Statistics: Num rows: 82510879939 Data size: 6873789738208 
 Basic stats: COMPLETE Column stats: COMPLETE
   Filter Operator
 predicate: ss_item_sk is not null (type: boolean)
 Statistics: Num rows: 82510879939 Data size: 652315818272 
 Basic stats: COMPLETE Column stats: COMPLETE
 Map Join Operator
   condition map:
Inner Join 0 to 1
   condition expressions:
 0 {ss_item_sk} {ss_sales_price} {ss_sold_date_sk}
 1 {d_date_sk} {d_date} {d_month_seq}
   keys:
 0 ss_sold_date_sk (type: int)
 1 d_date_sk (type: int)
   outputColumnNames: _col1, _col12, _col22, _col26, 
 _col28, _col29
   input vertices:
 1 Map 4
   Statistics: Num rows: 36524 Data size: 4163736 Basic 
 stats: COMPLETE Column stats: COMPLETE
   Filter Operator
 predicate: (((_col22 = _col26) and _col29 BETWEEN 
 1193 AND 1204) and _col1 is not null) 

[jira] [Updated] (HIVE-8517) When joining on partition column NDV gets overridden by StatsUtils.getColStatisticsFromExpression

2014-10-20 Thread Mostafa Mokhtar (JIRA)

 [ 
https://issues.apache.org/jira/browse/HIVE-8517?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Mostafa Mokhtar updated HIVE-8517:
--
Status: Patch Available  (was: In Progress)

 When joining on partition column NDV gets overridden by 
 StatsUtils.getColStatisticsFromExpression
 -

 Key: HIVE-8517
 URL: https://issues.apache.org/jira/browse/HIVE-8517
 Project: Hive
  Issue Type: Bug
  Components: Physical Optimizer
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Mostafa Mokhtar
Priority: Critical
 Fix For: 0.14.0

 Attachments: HIVE-8517.1.patch, HIVE-8517.2.patch


 When joining on a partition column, the number of partitions is used as the 
 NDV, but StatsUtils.getColStatisticsFromExpression overrides it and replaces 
 it with the number of rows, which results in the same behavior as explained 
 in https://issues.apache.org/jira/browse/HIVE-8196. Joining on partition 
 columns with fetch column stats enabled therefore results in a very small 
 cardinality estimate (CE), which negatively affects query performance.
 This is the call stack.
 {code}
 StatsUtils.getColStatisticsFromExpression(HiveConf, Statistics, ExprNodeDesc) 
 line: 1001  
 StatsRulesProcFactory$ReduceSinkStatsRule.process(Node, Stack<Node>, 
 NodeProcessorCtx, Object...) line: 1479
 DefaultRuleDispatcher.dispatch(Node, Stack<Node>, Object...) line: 90
 PreOrderWalker(DefaultGraphWalker).dispatchAndReturn(Node, Stack<Node>) line: 
 94
 PreOrderWalker(DefaultGraphWalker).dispatch(Node, Stack<Node>) line: 78
 PreOrderWalker.walk(Node) line: 54
 PreOrderWalker.walk(Node) line: 59
 PreOrderWalker.walk(Node) line: 59
 PreOrderWalker(DefaultGraphWalker).startWalking(Collection<Node>, 
 HashMap<Node,Object>) line: 109
 AnnotateWithStatistics.transform(ParseContext) line: 78
 TezCompiler.runStatsAnnotation(OptimizeTezProcContext) line: 248
 TezCompiler.optimizeOperatorPlan(ParseContext, Set<ReadEntity>, 
 Set<WriteEntity>) line: 120
 TezCompiler(TaskCompiler).compile(ParseContext, List<Task<Serializable>>, 
 HashSet<ReadEntity>, HashSet<WriteEntity>) line: 99
 SemanticAnalyzer.analyzeInternal(ASTNode) line: 10037 
 SemanticAnalyzer(BaseSemanticAnalyzer).analyze(ASTNode, Context) line: 221
 ExplainSemanticAnalyzer.analyzeInternal(ASTNode) line: 74 
 ExplainSemanticAnalyzer(BaseSemanticAnalyzer).analyze(ASTNode, Context) line: 
 221 
 Driver.compile(String, boolean) line: 415 
 {code}
 Query
 {code}
 select
   ss_item_sk item_sk, d_date, sum(ss_sales_price),
   sum(sum(ss_sales_price))
   over (partition by ss_item_sk order by d_date rows between unbounded 
 preceding and current row) cume_sales
 from store_sales
 ,date_dim
 where ss_sold_date_sk=d_date_sk
   and d_month_seq between 1193 and 1193+11
   and ss_item_sk is not NULL
 group by ss_item_sk, d_date
 {code}
 Plan 
 Notice in the Map Join operator that the number of rows drops from 
 82,510,879,939 to 36,524 after the join.
 {code}
 OK
 STAGE DEPENDENCIES:
   Stage-1 is a root stage
   Stage-0 depends on stages: Stage-1
 STAGE PLANS:
   Stage: Stage-1
 Tez
   Edges:
 Map 1 - Map 4 (BROADCAST_EDGE)
 Reducer 2 - Map 1 (SIMPLE_EDGE)
 Reducer 3 - Reducer 2 (SIMPLE_EDGE)
   DagName: mmokhtar_20141019131818_086d663a-5621-456c-bf25-8ccb7112ee3b:6
   Vertices:
 Map 1
 Map Operator Tree:
 TableScan
   alias: store_sales
   filterExpr: ss_item_sk is not null (type: boolean)
   Statistics: Num rows: 82510879939 Data size: 6873789738208 
 Basic stats: COMPLETE Column stats: COMPLETE
   Filter Operator
 predicate: ss_item_sk is not null (type: boolean)
 Statistics: Num rows: 82510879939 Data size: 652315818272 
 Basic stats: COMPLETE Column stats: COMPLETE
 Map Join Operator
   condition map:
Inner Join 0 to 1
   condition expressions:
 0 {ss_item_sk} {ss_sales_price} {ss_sold_date_sk}
 1 {d_date_sk} {d_date} {d_month_seq}
   keys:
 0 ss_sold_date_sk (type: int)
 1 d_date_sk (type: int)
   outputColumnNames: _col1, _col12, _col22, _col26, 
 _col28, _col29
   input vertices:
 1 Map 4
   Statistics: Num rows: 36524 Data size: 4163736 Basic 
 stats: COMPLETE Column stats: COMPLETE
   Filter Operator
 predicate: (((_col22 = _col26) and _col29 BETWEEN 
 1193 AND 1204) and _col1 

[jira] [Created] (HIVE-8517) When joining on partition column NDV gets overridden by StatsUtils.getColStatisticsFromExpression

2014-10-19 Thread Mostafa Mokhtar (JIRA)
Mostafa Mokhtar created HIVE-8517:
-

 Summary: When joining on partition column NDV gets overridden by 
StatsUtils.getColStatisticsFromExpression
 Key: HIVE-8517
 URL: https://issues.apache.org/jira/browse/HIVE-8517
 Project: Hive
  Issue Type: Bug
  Components: Physical Optimizer
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
 Fix For: 0.14.0


When joining on a partition column, the number of partitions is used as the NDV, 
but StatsUtils.getColStatisticsFromExpression overrides it and replaces it with 
the number of rows, which results in the same behavior as explained in 
https://issues.apache.org/jira/browse/HIVE-8196. Joining on partition columns 
with fetch column stats enabled therefore results in a very small cardinality 
estimate (CE), which negatively affects query performance.

This is the call stack.
{code}
StatsUtils.getColStatisticsFromExpression(HiveConf, Statistics, ExprNodeDesc) 
line: 1001
StatsRulesProcFactory$ReduceSinkStatsRule.process(Node, Stack<Node>, 
NodeProcessorCtx, Object...) line: 1479
DefaultRuleDispatcher.dispatch(Node, Stack<Node>, Object...) line: 90
PreOrderWalker(DefaultGraphWalker).dispatchAndReturn(Node, Stack<Node>) line: 
94
PreOrderWalker(DefaultGraphWalker).dispatch(Node, Stack<Node>) line: 78
PreOrderWalker.walk(Node) line: 54
PreOrderWalker.walk(Node) line: 59
PreOrderWalker.walk(Node) line: 59
PreOrderWalker(DefaultGraphWalker).startWalking(Collection<Node>, 
HashMap<Node,Object>) line: 109
AnnotateWithStatistics.transform(ParseContext) line: 78
TezCompiler.runStatsAnnotation(OptimizeTezProcContext) line: 248
TezCompiler.optimizeOperatorPlan(ParseContext, Set<ReadEntity>, 
Set<WriteEntity>) line: 120
TezCompiler(TaskCompiler).compile(ParseContext, List<Task<Serializable>>, 
HashSet<ReadEntity>, HashSet<WriteEntity>) line: 99
SemanticAnalyzer.analyzeInternal(ASTNode) line: 10037   
SemanticAnalyzer(BaseSemanticAnalyzer).analyze(ASTNode, Context) line: 221  
ExplainSemanticAnalyzer.analyzeInternal(ASTNode) line: 74   
ExplainSemanticAnalyzer(BaseSemanticAnalyzer).analyze(ASTNode, Context) line: 
221   
Driver.compile(String, boolean) line: 415   
{code}
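
To make the effect of the override concrete, here is a hedged sketch using the textbook join-cardinality formula |R| * |S| / max(NDV(R.key), NDV(S.key)); the formula and the partition count are illustrative assumptions, not the exact StatsRulesProcFactory logic:

{code}
// Illustrative sketch of how overriding a partition column's NDV with the
// row count collapses the join cardinality estimate.
public class NdvOverrideSketch {

  static long joinCardinality(long rows0, long ndv0, long rows1, long ndv1) {
    return (long) ((double) rows0 * rows1 / Math.max(Math.max(ndv0, ndv1), 1));
  }

  public static void main(String[] args) {
    long storeSalesRows = 82_510_879_939L; // from the plan below
    long dateDimRows = 73_049L;            // date_dim rows, used as d_date_sk NDV
    long partitionCount = 1_824L;          // hypothetical NDV of ss_sold_date_sk

    // With the partition count kept as the NDV, the estimate stays large:
    System.out.println(joinCardinality(storeSalesRows, partitionCount,
        dateDimRows, dateDimRows));
    // With the NDV overridden by the row count, the estimate collapses to
    // roughly the dimension-table size, in line with the tiny 36,524-row
    // figure in the plan below:
    System.out.println(joinCardinality(storeSalesRows, storeSalesRows,
        dateDimRows, dateDimRows));
  }
}
{code}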

Query
{code}

select
  ss_item_sk item_sk, d_date, sum(ss_sales_price),
  sum(sum(ss_sales_price))
  over (partition by ss_item_sk order by d_date rows between unbounded 
preceding and current row) cume_sales
from store_sales
,date_dim
where ss_sold_date_sk=d_date_sk
  and d_month_seq between 1193 and 1193+11
  and ss_item_sk is not NULL
group by ss_item_sk, d_date
{code}

Plan 
Notice in the Map Join operator that the number of rows drops from 
82,510,879,939 to 36,524 after the join.
{code}
OK
STAGE DEPENDENCIES:
  Stage-1 is a root stage
  Stage-0 depends on stages: Stage-1

STAGE PLANS:
  Stage: Stage-1
Tez
  Edges:
Map 1 - Map 4 (BROADCAST_EDGE)
Reducer 2 - Map 1 (SIMPLE_EDGE)
Reducer 3 - Reducer 2 (SIMPLE_EDGE)
  DagName: mmokhtar_20141019131818_086d663a-5621-456c-bf25-8ccb7112ee3b:6
  Vertices:
Map 1
Map Operator Tree:
TableScan
  alias: store_sales
  filterExpr: ss_item_sk is not null (type: boolean)
  Statistics: Num rows: 82510879939 Data size: 6873789738208 
Basic stats: COMPLETE Column stats: COMPLETE
  Filter Operator
predicate: ss_item_sk is not null (type: boolean)
Statistics: Num rows: 82510879939 Data size: 652315818272 
Basic stats: COMPLETE Column stats: COMPLETE
Map Join Operator
  condition map:
   Inner Join 0 to 1
  condition expressions:
0 {ss_item_sk} {ss_sales_price} {ss_sold_date_sk}
1 {d_date_sk} {d_date} {d_month_seq}
  keys:
0 ss_sold_date_sk (type: int)
1 d_date_sk (type: int)
  outputColumnNames: _col1, _col12, _col22, _col26, _col28, 
_col29
  input vertices:
1 Map 4
  Statistics: Num rows: 36524 Data size: 4163736 Basic 
stats: COMPLETE Column stats: COMPLETE
  Filter Operator
predicate: (((_col22 = _col26) and _col29 BETWEEN 1193 
AND 1204) and _col1 is not null) (type: boolean)
Statistics: Num rows: 9131 Data size: 1040934 Basic 
stats: COMPLETE Column stats: COMPLETE
Select Operator
  expressions: _col1 (type: int), _col28 (type: 
string), _col12 (type: float)
  outputColumnNames: _col1, _col28, _col12
  Statistics: Num rows: 9131 Data size: 1040934 Basic 
stats: COMPLETE 

[jira] [Updated] (HIVE-8517) When joining on partition column NDV gets overridden by StatsUtils.getColStatisticsFromExpression

2014-10-19 Thread Mostafa Mokhtar (JIRA)

 [ 
https://issues.apache.org/jira/browse/HIVE-8517?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Mostafa Mokhtar updated HIVE-8517:
--
Assignee: Mostafa Mokhtar

 When joining on partition column NDV gets overridden by 
 StatsUtils.getColStatisticsFromExpression
 -

 Key: HIVE-8517
 URL: https://issues.apache.org/jira/browse/HIVE-8517
 Project: Hive
  Issue Type: Bug
  Components: Physical Optimizer
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Mostafa Mokhtar
 Fix For: 0.14.0


 When joining on a partition column, the number of partitions is used as the 
 NDV, but StatsUtils.getColStatisticsFromExpression overrides it and replaces 
 it with the number of rows, which results in the same behavior as explained 
 in https://issues.apache.org/jira/browse/HIVE-8196. Joining on partition 
 columns with fetch column stats enabled therefore results in a very small 
 cardinality estimate (CE), which negatively affects query performance.
 This is the call stack.
 {code}
 StatsUtils.getColStatisticsFromExpression(HiveConf, Statistics, ExprNodeDesc) 
 line: 1001  
 StatsRulesProcFactory$ReduceSinkStatsRule.process(Node, Stack<Node>, 
 NodeProcessorCtx, Object...) line: 1479
 DefaultRuleDispatcher.dispatch(Node, Stack<Node>, Object...) line: 90
 PreOrderWalker(DefaultGraphWalker).dispatchAndReturn(Node, Stack<Node>) line: 
 94
 PreOrderWalker(DefaultGraphWalker).dispatch(Node, Stack<Node>) line: 78
 PreOrderWalker.walk(Node) line: 54
 PreOrderWalker.walk(Node) line: 59
 PreOrderWalker.walk(Node) line: 59
 PreOrderWalker(DefaultGraphWalker).startWalking(Collection<Node>, 
 HashMap<Node,Object>) line: 109
 AnnotateWithStatistics.transform(ParseContext) line: 78
 TezCompiler.runStatsAnnotation(OptimizeTezProcContext) line: 248
 TezCompiler.optimizeOperatorPlan(ParseContext, Set<ReadEntity>, 
 Set<WriteEntity>) line: 120
 TezCompiler(TaskCompiler).compile(ParseContext, List<Task<Serializable>>, 
 HashSet<ReadEntity>, HashSet<WriteEntity>) line: 99
 SemanticAnalyzer.analyzeInternal(ASTNode) line: 10037 
 SemanticAnalyzer(BaseSemanticAnalyzer).analyze(ASTNode, Context) line: 221
 ExplainSemanticAnalyzer.analyzeInternal(ASTNode) line: 74 
 ExplainSemanticAnalyzer(BaseSemanticAnalyzer).analyze(ASTNode, Context) line: 
 221 
 Driver.compile(String, boolean) line: 415 
 {code}
 Query
 {code}
 select
   ss_item_sk item_sk, d_date, sum(ss_sales_price),
   sum(sum(ss_sales_price))
   over (partition by ss_item_sk order by d_date rows between unbounded 
 preceding and current row) cume_sales
 from store_sales
 ,date_dim
 where ss_sold_date_sk=d_date_sk
   and d_month_seq between 1193 and 1193+11
   and ss_item_sk is not NULL
 group by ss_item_sk, d_date
 {code}
 Plan 
 Notice in the Map Join operator that the number of rows drops from 
 82,510,879,939 to 36,524 after the join.
 {code}
 OK
 STAGE DEPENDENCIES:
   Stage-1 is a root stage
   Stage-0 depends on stages: Stage-1
 STAGE PLANS:
   Stage: Stage-1
 Tez
   Edges:
 Map 1 - Map 4 (BROADCAST_EDGE)
 Reducer 2 - Map 1 (SIMPLE_EDGE)
 Reducer 3 - Reducer 2 (SIMPLE_EDGE)
   DagName: mmokhtar_20141019131818_086d663a-5621-456c-bf25-8ccb7112ee3b:6
   Vertices:
 Map 1
 Map Operator Tree:
 TableScan
   alias: store_sales
   filterExpr: ss_item_sk is not null (type: boolean)
   Statistics: Num rows: 82510879939 Data size: 6873789738208 
 Basic stats: COMPLETE Column stats: COMPLETE
   Filter Operator
 predicate: ss_item_sk is not null (type: boolean)
 Statistics: Num rows: 82510879939 Data size: 652315818272 
 Basic stats: COMPLETE Column stats: COMPLETE
 Map Join Operator
   condition map:
Inner Join 0 to 1
   condition expressions:
 0 {ss_item_sk} {ss_sales_price} {ss_sold_date_sk}
 1 {d_date_sk} {d_date} {d_month_seq}
   keys:
 0 ss_sold_date_sk (type: int)
 1 d_date_sk (type: int)
   outputColumnNames: _col1, _col12, _col22, _col26, 
 _col28, _col29
   input vertices:
 1 Map 4
   Statistics: Num rows: 36524 Data size: 4163736 Basic 
 stats: COMPLETE Column stats: COMPLETE
   Filter Operator
 predicate: (((_col22 = _col26) and _col29 BETWEEN 
 1193 AND 1204) and _col1 is not null) (type: boolean)
 Statistics: Num rows: 9131 Data size: 1040934 Basic 
 

[jira] [Updated] (HIVE-8517) When joining on partition column NDV gets overridden by StatsUtils.getColStatisticsFromExpression

2014-10-19 Thread Mostafa Mokhtar (JIRA)

 [ 
https://issues.apache.org/jira/browse/HIVE-8517?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Mostafa Mokhtar updated HIVE-8517:
--
Status: Open  (was: Patch Available)

 When joining on partition column NDV gets overridden by 
 StatsUtils.getColStatisticsFromExpression
 -

 Key: HIVE-8517
 URL: https://issues.apache.org/jira/browse/HIVE-8517
 Project: Hive
  Issue Type: Bug
  Components: Physical Optimizer
Affects Versions: 0.14.0
Reporter: Mostafa Mokhtar
Assignee: Mostafa Mokhtar
 Fix For: 0.14.0

 Attachments: HIVE-8517.1.patch


 When joining on a partition column, the number of partitions is used as the 
 NDV, but StatsUtils.getColStatisticsFromExpression overrides it and replaces 
 it with the number of rows, which results in the same behavior as explained 
 in https://issues.apache.org/jira/browse/HIVE-8196. Joining on partition 
 columns with fetch column stats enabled therefore results in a very small 
 cardinality estimate (CE), which negatively affects query performance.
 This is the call stack.
 {code}
 StatsUtils.getColStatisticsFromExpression(HiveConf, Statistics, ExprNodeDesc) 
 line: 1001  
 StatsRulesProcFactory$ReduceSinkStatsRule.process(Node, Stack<Node>, 
 NodeProcessorCtx, Object...) line: 1479
 DefaultRuleDispatcher.dispatch(Node, Stack<Node>, Object...) line: 90
 PreOrderWalker(DefaultGraphWalker).dispatchAndReturn(Node, Stack<Node>) line: 
 94
 PreOrderWalker(DefaultGraphWalker).dispatch(Node, Stack<Node>) line: 78
 PreOrderWalker.walk(Node) line: 54
 PreOrderWalker.walk(Node) line: 59
 PreOrderWalker.walk(Node) line: 59
 PreOrderWalker(DefaultGraphWalker).startWalking(Collection<Node>, 
 HashMap<Node,Object>) line: 109
 AnnotateWithStatistics.transform(ParseContext) line: 78
 TezCompiler.runStatsAnnotation(OptimizeTezProcContext) line: 248
 TezCompiler.optimizeOperatorPlan(ParseContext, Set<ReadEntity>, 
 Set<WriteEntity>) line: 120
 TezCompiler(TaskCompiler).compile(ParseContext, List<Task<Serializable>>, 
 HashSet<ReadEntity>, HashSet<WriteEntity>) line: 99
 SemanticAnalyzer.analyzeInternal(ASTNode) line: 10037 
 SemanticAnalyzer(BaseSemanticAnalyzer).analyze(ASTNode, Context) line: 221
 ExplainSemanticAnalyzer.analyzeInternal(ASTNode) line: 74 
 ExplainSemanticAnalyzer(BaseSemanticAnalyzer).analyze(ASTNode, Context) line: 
 221 
 Driver.compile(String, boolean) line: 415 
 {code}
 Query
 {code}
 select
   ss_item_sk item_sk, d_date, sum(ss_sales_price),
   sum(sum(ss_sales_price))
   over (partition by ss_item_sk order by d_date rows between unbounded 
 preceding and current row) cume_sales
 from store_sales
 ,date_dim
 where ss_sold_date_sk=d_date_sk
   and d_month_seq between 1193 and 1193+11
   and ss_item_sk is not NULL
 group by ss_item_sk, d_date
 {code}
 Plan 
 Notice in the Map Join operator that the number of rows drops from 
 82,510,879,939 to 36,524 after the join.
 {code}
 OK
 STAGE DEPENDENCIES:
   Stage-1 is a root stage
   Stage-0 depends on stages: Stage-1
 STAGE PLANS:
   Stage: Stage-1
 Tez
   Edges:
 Map 1 - Map 4 (BROADCAST_EDGE)
 Reducer 2 - Map 1 (SIMPLE_EDGE)
 Reducer 3 - Reducer 2 (SIMPLE_EDGE)
   DagName: mmokhtar_20141019131818_086d663a-5621-456c-bf25-8ccb7112ee3b:6
   Vertices:
 Map 1
 Map Operator Tree:
 TableScan
   alias: store_sales
   filterExpr: ss_item_sk is not null (type: boolean)
   Statistics: Num rows: 82510879939 Data size: 6873789738208 
 Basic stats: COMPLETE Column stats: COMPLETE
   Filter Operator
 predicate: ss_item_sk is not null (type: boolean)
 Statistics: Num rows: 82510879939 Data size: 652315818272 
 Basic stats: COMPLETE Column stats: COMPLETE
 Map Join Operator
   condition map:
Inner Join 0 to 1
   condition expressions:
 0 {ss_item_sk} {ss_sales_price} {ss_sold_date_sk}
 1 {d_date_sk} {d_date} {d_month_seq}
   keys:
 0 ss_sold_date_sk (type: int)
 1 d_date_sk (type: int)
   outputColumnNames: _col1, _col12, _col22, _col26, 
 _col28, _col29
   input vertices:
 1 Map 4
   Statistics: Num rows: 36524 Data size: 4163736 Basic 
 stats: COMPLETE Column stats: COMPLETE
   Filter Operator
 predicate: (((_col22 = _col26) and _col29 BETWEEN 
 1193 AND 1204) and _col1 is not null) (type: boolean)
 
