Rajesh Balamohan created HIVE-26975:
---------------------------------------

             Summary: MERGE: Wrong reducer estimate causing smaller files to be 
created
                 Key: HIVE-26975
                 URL: https://issues.apache.org/jira/browse/HIVE-26975
             Project: Hive
          Issue Type: Improvement
          Components: Iceberg integration
            Reporter: Rajesh Balamohan


* "Merge into" estimates the wrong number of reducers, causing a large number of 
small files to be created, e.g. 400+ files of 3+ MB each.
 * This can be reproduced by writing data into the "store_sales" table in Iceberg 
format from another source table (using MERGE INTO).
 ** e.g. running the statement below a few times produces the wrong number of 
reduce tasks, causing a lot of small files to be created in the Iceberg table.

{noformat}
-- Reproduction statement: merges rows from ssv (alias s) into store_sales_t (alias t).
-- A target row and a source row count as "matched" only when every predicate
-- in the ON clause holds for the pair.
MERGE INTO store_sales_t t
USING ssv s
ON (
    t.ss_item_sk = s.ss_item_sk
    AND t.ss_customer_sk = s.ss_customer_sk
    AND t.ss_sold_date_sk = "2451181"
    -- s.ss_item_sk rounded down to a multiple of 1000 must lie in [1000, 2000]
    AND (FLOOR(s.ss_item_sk / 1000) * 1000 BETWEEN 1000 AND 2000)
    AND s.ss_ext_discount_amt < 0.0
)
-- Matched pairs whose target discount amount is NULL get it reset to 0.0.
WHEN MATCHED AND t.ss_ext_discount_amt IS NULL THEN
    UPDATE SET ss_ext_discount_amt = 0.0
-- Source rows without a match are inserted with ss_sold_date_sk fixed to "2451181".
WHEN NOT MATCHED THEN
    INSERT (
        ss_sold_time_sk,
        ss_item_sk,
        ss_customer_sk,
        ss_cdemo_sk,
        ss_hdemo_sk,
        ss_addr_sk,
        ss_store_sk,
        ss_promo_sk,
        ss_ticket_number,
        ss_quantity,
        ss_wholesale_cost,
        ss_list_price,
        ss_sales_price,
        ss_ext_discount_amt,
        ss_ext_sales_price,
        ss_ext_wholesale_cost,
        ss_ext_list_price,
        ss_ext_tax,
        ss_coupon_amt,
        ss_net_paid,
        ss_net_paid_inc_tax,
        ss_net_profit,
        ss_sold_date_sk
    )
    VALUES (
        s.ss_sold_time_sk,
        s.ss_item_sk,
        s.ss_customer_sk,
        s.ss_cdemo_sk,
        s.ss_hdemo_sk,
        s.ss_addr_sk,
        s.ss_store_sk,
        s.ss_promo_sk,
        s.ss_ticket_number,
        s.ss_quantity,
        s.ss_wholesale_cost,
        s.ss_list_price,
        s.ss_sales_price,
        s.ss_ext_discount_amt,
        s.ss_ext_sales_price,
        s.ss_ext_wholesale_cost,
        s.ss_ext_list_price,
        s.ss_ext_tax,
        s.ss_coupon_amt,
        s.ss_net_paid,
        s.ss_net_paid_inc_tax,
        s.ss_net_profit,
        "2451181"
    )

{noformat}



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

Reply via email to