[ 
https://issues.apache.org/jira/browse/HIVE-12827?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15102353#comment-15102353
 ] 

Gopal V commented on HIVE-12827:
--------------------------------

The fill after every operation is unnecessary; the theory behind the other patch 
was that some UDF in that query wasn't handling the hasNoNulls flag.

That is not true; the original issue was that a scratch column feeding the 
COALESCE() is reused for the Join output columns, and setting a column value via 
FloatReader does not set isNull[batchIndex] = false (and then the Filter on 
cr_return_amount removes those rows).

CBO rewrites the left outer join into an inner join, pushing the filter below 
the join, so there's no more TS-FIL-MJ-FIL as the FIL migrates to the broadcast 
side.

Here's my simplified example, which I used to narrow down the issue to 
FloatTreeReader.

{code}
-- Reproduction script for HIVE-12827 (simplified example from the accompanying comment).
-- CBO is turned off. Per the accompanying comment, CBO rewrites the left outer join
-- into an inner join and pushes the filter below the join, which removes the
-- TS-FIL-MJ-FIL plan shape this repro relies on.
set hive.cbo.enable=false;
-- Keep the native vectorized map join enabled and switch off the new vectorized
-- reduce sink plus reduce-side and reduce group-by vectorization.
-- NOTE(review): presumably this isolates the map-join path - confirm.
set hive.vectorized.execution.reducesink.new.enabled=false;
set hive.vectorized.execution.mapjoin.native.enabled=true;
set hive.vectorized.execution.reduce.enabled=false;
set hive.vectorized.execution.reduce.groupby.enabled=false;

-- All tables below are created in (and read from) the testing database.
use testing;

-- Build cs as an ORC table from catalog_sales.
-- cs_item_sk is kept only when it is one of the listed item keys, otherwise it is
-- replaced by 0. Rows are limited to sold date key 2452245, net profit above 1,
-- positive net paid and quantity, and item keys between 1365 and 16547.
-- Because of if not exists, an already existing cs table is left untouched.
create table if not exists cs stored as orc as select IF (cs_item_sk
 IN (
1365 ,
2243 ,
2445 ,
3259 ,
3267 ,
4027 ,
5263 ,
6003 ,
8371 ,
9593 ,
10383,
10763,
11351,
12359,
12887,
13449,
16501,
16547
), cs_item_sk, 0) as cs_item_sk
, cs_order_number, cs_net_paid, cs_quantity from 
tpcds_bin_partitioned_orc_200.catalog_sales
where
 true
 and cs_sold_date_sk = 2452245
 and cs_net_profit > 1
 and cs_net_paid > 0
 and cs_quantity > 0
 and cs_item_sk between 1365 and 16547
;

-- Build cr from catalog_returns, limited to returned date keys 2452351 through
-- 2452400 and the same item key list as above, ordered by cr_item_sk.
-- No stored as clause is given, so the file format is whatever the session
-- default is (unlike cs, which is explicitly ORC).
-- Because of if not exists, an already existing cr table is left untouched.
create table if not exists cr as select cr_return_amount, cr_item_sk, 
cr_order_number from tpcds_bin_partitioned_orc_200.catalog_returns where 
cr_returned_date_sk between 2452351 and 2452400
and cr_item_sk
 IN (
1365 ,
2243 ,
2445 ,
3259 ,
3267 ,
4027 ,
5263 ,
6003 ,
8371 ,
9593 ,
10383,
10763,
11351,
12359,
12887,
13449,
16501,
16547
)
order by cr_item_sk
;

-- Query under test: left outer join of cs to cr on order number and item key,
-- projecting the item key plus COALESCE-defaulted return amount and net paid.
-- The where clause filters on cr.cr_return_amount. Per the accompanying comment,
-- that filter is where rows are wrongly dropped when isNull[batchIndex] is not
-- reset to false on assignment.
-- The commented-out lines are the aggregate (group by) form of the same query.
select * from
(select cs.cs_item_sk as item,
  coalesce(cr.cr_return_amount,0) as return_amount
 ,coalesce(cs.cs_net_paid,0) as net_paid
-- (cast(sum(coalesce(cr.cr_return_amount,0)) as double)/
--  cast(sum(coalesce(cs.cs_net_paid,0)) as double)) as currency_ratio
 from cs -- catalog_sales cs
 left outer join cr -- catalog_returns cr
 on cs.cs_order_number = cr.cr_order_number
 and cs.cs_item_sk = cr.cr_item_sk
 where cr.cr_return_amount > 10000
 and cs.cs_quantity > 0
-- group by cs.cs_item_sk 
) x;
{code}

> Vectorization: VectorCopyRow/VectorAssignRow/VectorDeserializeRow assign 
> needs explicit isNull[offset] modification
> -------------------------------------------------------------------------------------------------------------------
>
>                 Key: HIVE-12827
>                 URL: https://issues.apache.org/jira/browse/HIVE-12827
>             Project: Hive
>          Issue Type: Bug
>            Reporter: Gopal V
>            Assignee: Gopal V
>         Attachments: HIVE-12827.2.patch
>
>
> Some scenarios do set Double.NaN instead of isNull=true, but not all types are 
> consistent.
> Examples of un-set isNull for the valid values are 
> {code}
>   private class FloatReader extends AbstractDoubleReader {
>     FloatReader(int columnIndex) {
>       super(columnIndex);
>     }
>     @Override
>     void apply(VectorizedRowBatch batch, int batchIndex) throws IOException {
>       DoubleColumnVector colVector = (DoubleColumnVector) 
> batch.cols[columnIndex];
>       if (deserializeRead.readCheckNull()) {
>         VectorizedBatchUtil.setNullColIsNullValue(colVector, batchIndex);
>       } else {
>         float value = deserializeRead.readFloat();
>         colVector.vector[batchIndex] = (double) value;
>       }
>     }
>   }
> {code}
> {code}
>   private class DoubleCopyRow extends CopyRow {
>     DoubleCopyRow(int inColumnIndex, int outColumnIndex) {
>       super(inColumnIndex, outColumnIndex);
>     }
>     @Override
>     void copy(VectorizedRowBatch inBatch, int inBatchIndex, 
> VectorizedRowBatch outBatch, int outBatchIndex) {
>       DoubleColumnVector inColVector = (DoubleColumnVector) 
> inBatch.cols[inColumnIndex];
>       DoubleColumnVector outColVector = (DoubleColumnVector) 
> outBatch.cols[outColumnIndex];
>       if (inColVector.isRepeating) {
>         if (inColVector.noNulls || !inColVector.isNull[0]) {
>           outColVector.vector[outBatchIndex] = inColVector.vector[0];
>         } else {
>           VectorizedBatchUtil.setNullColIsNullValue(outColVector, 
> outBatchIndex);
>         }
>       } else {
>         if (inColVector.noNulls || !inColVector.isNull[inBatchIndex]) {
>           outColVector.vector[outBatchIndex] = 
> inColVector.vector[inBatchIndex];
>         } else {
>           VectorizedBatchUtil.setNullColIsNullValue(outColVector, 
> outBatchIndex);
>         }
>       }
>     }
>   }
> {code}
> {code}
>  private static abstract class VectorDoubleColumnAssign
>     extends VectorColumnAssignVectorBase<DoubleColumnVector> {
>     protected void assignDouble(double value, int destIndex) {
>       outCol.vector[destIndex] = value;
>     }
>   }
> {code}
> The pattern to imitate would be the earlier code from VectorizedBatchUtil:
> {code}
>     case DOUBLE: {
>       DoubleColumnVector dcv = (DoubleColumnVector) batch.cols[offset + 
> colIndex];
>       if (writableCol != null) {
>         dcv.vector[rowIndex] = ((DoubleWritable) writableCol).get();
>         dcv.isNull[rowIndex] = false;
>       } else {
>         dcv.vector[rowIndex] = Double.NaN;
>         setNullColIsNullValue(dcv, rowIndex);
>       }
>     }
>       break;
> {code}



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

Reply via email to