(hive) branch master updated: HIVE-28880: Wrong result when output column in a vectorized expression is used as a scratch column by a child (#6000)

krisztiankasa Thu, 31 Jul 2025 01:50:36 -0700

This is an automated email from the ASF dual-hosted git repository.

krisztiankasa pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git



The following commit(s) were added to refs/heads/master by this push:
     new d82d73c6713 HIVE-28880: Wrong result when output column in a vectorized expression is used as a scratch column by a child (#6000)
d82d73c6713 is described below

commit d82d73c671307f55feb845a2c5d9b87b463057a9
Author: Soumyakanti Das <[email protected]>
AuthorDate: Thu Jul 31 01:50:25 2025 -0700

    HIVE-28880: Wrong result when output column in a vectorized expression is used as a scratch column by a child (#6000)
---
 .../vector/expressions/IfExprColumnCondExpr.java   |   7 +-
 .../clientpositive/scratch_col_reused_by_child.q   |  54 +++++
 .../llap/scratch_col_reused_by_child.q.out         | 252 +++++++++++++++++++++
 3 files changed, 311 insertions(+), 2 deletions(-)

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/IfExprColumnCondExpr.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/IfExprColumnCondExpr.java
index 85522e52c36..575bf0eefbf 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/IfExprColumnCondExpr.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/IfExprColumnCondExpr.java
@@ -95,13 +95,16 @@ public void evaluate(VectorizedRowBatch batch) throws HiveException {
     // The THEN expression is either IdentityExpression (a column) or a ConstantVectorExpression
     // (a scalar) and trivial to evaluate.
     childExpressions[1].evaluate(batch);
+
+    // Evaluate the third child (ELSE) expression conditionally.
+    conditionalEvaluate(batch, childExpressions[2], elseSelected, elseCount);
+    
     for (int i = 0; i < thenCount; i++) {
       final int batchIndex = thenSelected[i];
       outputIsNull[batchIndex] = false;
       outputColVector.setElement(batchIndex, batchIndex, thenColVector);
     }
-
-    conditionalEvaluate(batch, childExpressions[2], elseSelected, elseCount);
+    
     for (int i = 0; i < elseCount; i++) {
       final int batchIndex = elseSelected[i];
       outputIsNull[batchIndex] = false;
diff --git a/ql/src/test/queries/clientpositive/scratch_col_reused_by_child.q b/ql/src/test/queries/clientpositive/scratch_col_reused_by_child.q
new file mode 100644
index 00000000000..325fc6056cb
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/scratch_col_reused_by_child.q
@@ -0,0 +1,54 @@
+set hive.vectorized.execution.enabled=true;
+set hive.cbo.enable=true;
+set hive.auto.convert.join=true;
+
+CREATE EXTERNAL TABLE main_tbl(col1 string, col2 string) stored as orc;
+CREATE EXTERNAL TABLE sub_tbl(pdate date) stored as orc;
+insert into main_tbl values('20250331','BBB'),('20250331','AAAAAA');
+insert into sub_tbl values('2025-03-31');
+
+--selectExpressions: 
+--  IfExprColumnCondExpr(col 12:boolean, col 4:string, col 5:string)(
+--    children: 
+--      StringGroupColEqualStringScalar(col 5: string, val AAAAAA)(
+--        children: 
+--          StringUpper(col 4:string)(
+--            children: 
+--              StringTrimCol(col 1:string) -> 4:string
+--          ) -> 5:string
+--      ) -> 12:boolean, 
+--      
+--      ConstantVectorExpression(val AAAA_BBBB_CCCC_DDDD) -> 4:string, 
+--      
+--      IfExprStringScalarStringScalar(col 13:boolean, val WWWW_XXXX_YYYY_ZZZZ, val N/A)(
+--        children: 
+--          StringGroupColEqualStringScalar(col 6:string, val BBB)(
+--            children: 
+--              StringUpper(col 5:string)(
+--                children: 
+--                  StringTrimCol(col 1:string) -> 5:string
+--              ) -> 6:string
+--          ) -> 13:boolean
+--      ) -> 5:string
+--  ) -> 6:string
+
+-- In the above expression, we can see that col 6, which is the final output column, is also used by a child 
+-- expression to store the result of StringUpper. This may cause issues if we do not evaluate the child expression first.
+
+
+explain vectorization detail
+select case
+    when upper(trim(col2)) = 'AAAAAA' then 'AAAA_BBBB_CCCC_DDDD'
+    when upper(trim(col2)) = 'BBB' then 'WWWW_XXXX_YYYY_ZZZZ'
+    else 'N/A'
+end as result
+from main_tbl
+where cast(concat(substr(trim(col1),1,4),'-',substr(trim(col1),5,2),'-',substr(trim(col1),7,2)) as date) in (select pdate from sub_tbl);
+
+select case
+    when upper(trim(col2)) = 'AAAAAA' then 'AAAA_BBBB_CCCC_DDDD'
+    when upper(trim(col2)) = 'BBB' then 'WWWW_XXXX_YYYY_ZZZZ'
+    else 'N/A'
+end as result
+from main_tbl
+where cast(concat(substr(trim(col1),1,4),'-',substr(trim(col1),5,2),'-',substr(trim(col1),7,2)) as date) in (select pdate from sub_tbl);
diff --git a/ql/src/test/results/clientpositive/llap/scratch_col_reused_by_child.q.out b/ql/src/test/results/clientpositive/llap/scratch_col_reused_by_child.q.out
new file mode 100644
index 00000000000..a769d77d2c0
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/scratch_col_reused_by_child.q.out
@@ -0,0 +1,252 @@
+PREHOOK: query: CREATE EXTERNAL TABLE main_tbl(col1 string, col2 string) 
stored as orc
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@main_tbl
+POSTHOOK: query: CREATE EXTERNAL TABLE main_tbl(col1 string, col2 string) 
stored as orc
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@main_tbl
+PREHOOK: query: CREATE EXTERNAL TABLE sub_tbl(pdate date) stored as orc
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@sub_tbl
+POSTHOOK: query: CREATE EXTERNAL TABLE sub_tbl(pdate date) stored as orc
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@sub_tbl
+PREHOOK: query: insert into main_tbl 
values('20250331','BBB'),('20250331','AAAAAA')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@main_tbl
+POSTHOOK: query: insert into main_tbl 
values('20250331','BBB'),('20250331','AAAAAA')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@main_tbl
+POSTHOOK: Lineage: main_tbl.col1 SCRIPT []
+POSTHOOK: Lineage: main_tbl.col2 SCRIPT []
+PREHOOK: query: insert into sub_tbl values('2025-03-31')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@sub_tbl
+POSTHOOK: query: insert into sub_tbl values('2025-03-31')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@sub_tbl
+POSTHOOK: Lineage: sub_tbl.pdate SCRIPT []
+PREHOOK: query: explain vectorization detail
+select case
+    when upper(trim(col2)) = 'AAAAAA' then 'AAAA_BBBB_CCCC_DDDD'
+    when upper(trim(col2)) = 'BBB' then 'WWWW_XXXX_YYYY_ZZZZ'
+    else 'N/A'
+end as result
+from main_tbl
+where 
cast(concat(substr(trim(col1),1,4),'-',substr(trim(col1),5,2),'-',substr(trim(col1),7,2))
 as date) in (select pdate from sub_tbl)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@main_tbl
+PREHOOK: Input: default@sub_tbl
+#### A masked pattern was here ####
+POSTHOOK: query: explain vectorization detail
+select case
+    when upper(trim(col2)) = 'AAAAAA' then 'AAAA_BBBB_CCCC_DDDD'
+    when upper(trim(col2)) = 'BBB' then 'WWWW_XXXX_YYYY_ZZZZ'
+    else 'N/A'
+end as result
+from main_tbl
+where 
cast(concat(substr(trim(col1),1,4),'-',substr(trim(col1),5,2),'-',substr(trim(col1),7,2))
 as date) in (select pdate from sub_tbl)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@main_tbl
+POSTHOOK: Input: default@sub_tbl
+#### A masked pattern was here ####
+PLAN VECTORIZATION:
+  enabled: true
+  enabledConditionsMet: [hive.vectorized.execution.enabled IS true]
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Map 1 <- Map 2 (BROADCAST_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: main_tbl
+                  filterExpr: CAST( concat(substr(trim(col1), 1, 4), '-', 
substr(trim(col1), 5, 2), '-', substr(trim(col1), 7, 2)) AS DATE) is not null 
(type: boolean)
+                  Statistics: Num rows: 2 Data size: 362 Basic stats: COMPLETE 
Column stats: COMPLETE
+                  TableScan Vectorization:
+                      native: true
+                      vectorizationSchemaColumns: [0:col1:string, 
1:col2:string, 2:ROW__ID:struct<writeid:bigint,bucketid:int,rowid:bigint>, 
3:ROW__IS__DELETED:boolean]
+                  Filter Operator
+                    Filter Vectorization:
+                        className: VectorFilterOperator
+                        native: true
+                        predicateExpression: SelectColumnIsNotNull(col 
11:date)(children: CastStringToDate(col 10:string)(children: 
VectorUDFAdaptor(concat(substr(trim(col1), 1, 4), '-', substr(trim(col1), 5, 
2), '-', substr(trim(col1), 7, 2)))(children: StringSubstrColStartLen(col 
4:string, start 0, length 4)(children: StringTrimCol(col 0:string) -> 4:string) 
-> 5:string, StringSubstrColStartLen(col 6:string, start 4, length 2)(children: 
StringTrimCol(col 0:string) -> 6:string) -> 7:str [...]
+                    predicate: CAST( concat(substr(trim(col1), 1, 4), '-', 
substr(trim(col1), 5, 2), '-', substr(trim(col1), 7, 2)) AS DATE) is not null 
(type: boolean)
+                    Statistics: Num rows: 2 Data size: 362 Basic stats: 
COMPLETE Column stats: COMPLETE
+                    Select Operator
+                      expressions: col1 (type: string), col2 (type: string)
+                      outputColumnNames: _col0, _col1
+                      Select Vectorization:
+                          className: VectorSelectOperator
+                          native: true
+                          projectedOutputColumnNums: [0, 1]
+                      Statistics: Num rows: 2 Data size: 362 Basic stats: 
COMPLETE Column stats: COMPLETE
+                      Map Join Operator
+                        condition map:
+                             Left Semi Join 0 to 1
+                        keys:
+                          0 CAST( concat(substr(trim(_col0), 1, 4), '-', 
substr(trim(_col0), 5, 2), '-', substr(trim(_col0), 7, 2)) AS DATE) (type: date)
+                          1 _col0 (type: date)
+                        Map Join Vectorization:
+                            bigTableKeyColumns: 11:date
+                            bigTableKeyExpressions: CastStringToDate(col 
4:string)(children: VectorUDFAdaptor(concat(substr(trim(_col0), 1, 4), '-', 
substr(trim(_col0), 5, 2), '-', substr(trim(_col0), 7, 2)))(children: 
StringSubstrColStartLen(col 4:string, start 0, length 4)(children: 
StringTrimCol(col 0:string) -> 4:string) -> 5:string, 
StringSubstrColStartLen(col 4:string, start 4, length 2)(children: 
StringTrimCol(col 0:string) -> 4:string) -> 6:string, 
StringSubstrColStartLen(col 4:s [...]
+                            bigTableRetainColumnNums: [1]
+                            bigTableValueColumns: 1:string
+                            className: VectorMapJoinLeftSemiLongOperator
+                            native: true
+                            nativeConditionsMet: 
hive.mapjoin.optimized.hashtable IS true, 
hive.vectorized.execution.mapjoin.native.enabled IS true, hive.execution.engine 
tez IN [tez] IS true, One MapJoin Condition IS true, No nullsafe IS true, Small 
table vectorizes IS true, Optimized Table and Supports Key Types IS true
+                            nonOuterSmallTableKeyMapping: []
+                            projectedOutput: 1:string
+                            hashTableImplementationType: OPTIMIZED
+                        outputColumnNames: _col1
+                        input vertices:
+                          1 Map 2
+                        Statistics: Num rows: 2 Data size: 178 Basic stats: 
COMPLETE Column stats: COMPLETE
+                        Select Operator
+                          expressions: CASE WHEN ((upper(trim(_col1)) = 
'AAAAAA')) THEN ('AAAA_BBBB_CCCC_DDDD') WHEN ((upper(trim(_col1)) = 'BBB')) 
THEN ('WWWW_XXXX_YYYY_ZZZZ') ELSE ('N/A') END (type: string)
+                          outputColumnNames: _col0
+                          Select Vectorization:
+                              className: VectorSelectOperator
+                              native: true
+                              projectedOutputColumnNums: [6]
+                              selectExpressions: IfExprColumnCondExpr(col 
12:boolean, col 4:stringcol 5:string)(children: 
StringGroupColEqualStringScalar(col 5:string, val AAAAAA)(children: 
StringUpper(col 4:string)(children: StringTrimCol(col 1:string) -> 4:string) -> 
5:string) -> 12:boolean, ConstantVectorExpression(val AAAA_BBBB_CCCC_DDDD) -> 
4:string, IfExprStringScalarStringScalar(col 13:boolean, val 
WWWW_XXXX_YYYY_ZZZZ, val N/A)(children: StringGroupColEqualStringScalar(col 
6:strin [...]
+                          Statistics: Num rows: 2 Data size: 206 Basic stats: 
COMPLETE Column stats: COMPLETE
+                          File Output Operator
+                            compressed: false
+                            File Sink Vectorization:
+                                className: VectorFileSinkOperator
+                                native: false
+                            Statistics: Num rows: 2 Data size: 206 Basic 
stats: COMPLETE Column stats: COMPLETE
+                            table:
+                                input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat
+                                output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                                serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+            Execution mode: vectorized, llap
+            LLAP IO: all inputs
+            Map Vectorization:
+                enabled: true
+                enabledConditionsMet: 
hive.vectorized.use.vectorized.input.format IS true
+                inputFormatFeatureSupport: [DECIMAL_64]
+                featureSupportInUse: [DECIMAL_64]
+                inputFileFormats: 
org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+                allNative: false
+                usesVectorUDFAdaptor: true
+                vectorized: true
+                rowBatchContext:
+                    dataColumnCount: 2
+                    includeColumns: [0, 1]
+                    dataColumns: col1:string, col2:string
+                    partitionColumnCount: 0
+                    scratchColumnTypeNames: [string, string, string, string, 
string, string, string, bigint, bigint, bigint]
+        Map 2 
+            Map Operator Tree:
+                TableScan
+                  alias: sub_tbl
+                  filterExpr: pdate is not null (type: boolean)
+                  Statistics: Num rows: 1 Data size: 56 Basic stats: COMPLETE 
Column stats: COMPLETE
+                  TableScan Vectorization:
+                      native: true
+                      vectorizationSchemaColumns: [0:pdate:date, 
1:ROW__ID:struct<writeid:bigint,bucketid:int,rowid:bigint>, 
2:ROW__IS__DELETED:boolean]
+                  Filter Operator
+                    Filter Vectorization:
+                        className: VectorFilterOperator
+                        native: true
+                        predicateExpression: SelectColumnIsNotNull(col 0:date)
+                    predicate: pdate is not null (type: boolean)
+                    Statistics: Num rows: 1 Data size: 56 Basic stats: 
COMPLETE Column stats: COMPLETE
+                    Select Operator
+                      expressions: pdate (type: date)
+                      outputColumnNames: _col0
+                      Select Vectorization:
+                          className: VectorSelectOperator
+                          native: true
+                          projectedOutputColumnNums: [0]
+                      Statistics: Num rows: 1 Data size: 56 Basic stats: 
COMPLETE Column stats: COMPLETE
+                      Group By Operator
+                        Group By Vectorization:
+                            className: VectorGroupByOperator
+                            groupByMode: HASH
+                            keyExpressions: col 0:date
+                            native: false
+                            vectorProcessingMode: HASH
+                            projectedOutputColumnNums: []
+                        keys: _col0 (type: date)
+                        minReductionHashAggr: 0.99
+                        mode: hash
+                        outputColumnNames: _col0
+                        Statistics: Num rows: 1 Data size: 56 Basic stats: 
COMPLETE Column stats: COMPLETE
+                        Reduce Output Operator
+                          key expressions: _col0 (type: date)
+                          null sort order: z
+                          sort order: +
+                          Map-reduce partition columns: _col0 (type: date)
+                          Reduce Sink Vectorization:
+                              className: VectorReduceSinkLongOperator
+                              keyColumns: 0:date
+                              native: true
+                              nativeConditionsMet: 
hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine 
tez IN [tez] IS true, No PTF TopN IS true, No DISTINCT columns IS true, 
BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true
+                          Statistics: Num rows: 1 Data size: 56 Basic stats: 
COMPLETE Column stats: COMPLETE
+            Execution mode: vectorized, llap
+            LLAP IO: all inputs
+            Map Vectorization:
+                enabled: true
+                enabledConditionsMet: 
hive.vectorized.use.vectorized.input.format IS true
+                inputFormatFeatureSupport: [DECIMAL_64]
+                featureSupportInUse: [DECIMAL_64]
+                inputFileFormats: 
org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+                allNative: false
+                usesVectorUDFAdaptor: false
+                vectorized: true
+                rowBatchContext:
+                    dataColumnCount: 1
+                    includeColumns: [0]
+                    dataColumns: pdate:date
+                    partitionColumnCount: 0
+                    scratchColumnTypeNames: []
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select case
+    when upper(trim(col2)) = 'AAAAAA' then 'AAAA_BBBB_CCCC_DDDD'
+    when upper(trim(col2)) = 'BBB' then 'WWWW_XXXX_YYYY_ZZZZ'
+    else 'N/A'
+end as result
+from main_tbl
+where 
cast(concat(substr(trim(col1),1,4),'-',substr(trim(col1),5,2),'-',substr(trim(col1),7,2))
 as date) in (select pdate from sub_tbl)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@main_tbl
+PREHOOK: Input: default@sub_tbl
+#### A masked pattern was here ####
+POSTHOOK: query: select case
+    when upper(trim(col2)) = 'AAAAAA' then 'AAAA_BBBB_CCCC_DDDD'
+    when upper(trim(col2)) = 'BBB' then 'WWWW_XXXX_YYYY_ZZZZ'
+    else 'N/A'
+end as result
+from main_tbl
+where 
cast(concat(substr(trim(col1),1,4),'-',substr(trim(col1),5,2),'-',substr(trim(col1),7,2))
 as date) in (select pdate from sub_tbl)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@main_tbl
+POSTHOOK: Input: default@sub_tbl
+#### A masked pattern was here ####
+WWWW_XXXX_YYYY_ZZZZ
+AAAA_BBBB_CCCC_DDDD

(hive) branch master updated: HIVE-28880: Wrong result when output column in a vectorized expression is used as a scratch column by a child (#6000)

Reply via email to