This is an automated email from the ASF dual-hosted git repository.
krisztiankasa pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new d82d73c6713 HIVE-28880: Wrong result when output column in a
vectorized expression is used as a scratch column by a child (#6000)
d82d73c6713 is described below
commit d82d73c671307f55feb845a2c5d9b87b463057a9
Author: Soumyakanti Das <[email protected]>
AuthorDate: Thu Jul 31 01:50:25 2025 -0700
HIVE-28880: Wrong result when output column in a vectorized expression is
used as a scratch column by a child (#6000)
---
.../vector/expressions/IfExprColumnCondExpr.java | 7 +-
.../clientpositive/scratch_col_reused_by_child.q | 54 +++++
.../llap/scratch_col_reused_by_child.q.out | 252 +++++++++++++++++++++
3 files changed, 311 insertions(+), 2 deletions(-)
diff --git
a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/IfExprColumnCondExpr.java
b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/IfExprColumnCondExpr.java
index 85522e52c36..575bf0eefbf 100644
---
a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/IfExprColumnCondExpr.java
+++
b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/IfExprColumnCondExpr.java
@@ -95,13 +95,16 @@ public void evaluate(VectorizedRowBatch batch) throws
HiveException {
// The THEN expression is either IdentityExpression (a column) or a ConstantVectorExpression
// (a scalar) and trivial to evaluate.
childExpressions[1].evaluate(batch);
+
+ // Evaluate the third child (ELSE) expression conditionally.
+ conditionalEvaluate(batch, childExpressions[2], elseSelected, elseCount);
+
for (int i = 0; i < thenCount; i++) {
final int batchIndex = thenSelected[i];
outputIsNull[batchIndex] = false;
outputColVector.setElement(batchIndex, batchIndex, thenColVector);
}
-
- conditionalEvaluate(batch, childExpressions[2], elseSelected, elseCount);
+
for (int i = 0; i < elseCount; i++) {
final int batchIndex = elseSelected[i];
outputIsNull[batchIndex] = false;
diff --git a/ql/src/test/queries/clientpositive/scratch_col_reused_by_child.q
b/ql/src/test/queries/clientpositive/scratch_col_reused_by_child.q
new file mode 100644
index 00000000000..325fc6056cb
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/scratch_col_reused_by_child.q
@@ -0,0 +1,54 @@
+set hive.vectorized.execution.enabled=true;
+set hive.cbo.enable=true;
+set hive.auto.convert.join=true;
+
+CREATE EXTERNAL TABLE main_tbl(col1 string, col2 string) stored as orc;
+CREATE EXTERNAL TABLE sub_tbl(pdate date) stored as orc;
+insert into main_tbl values('20250331','BBB'),('20250331','AAAAAA');
+insert into sub_tbl values('2025-03-31');
+
+--selectExpressions:
+-- IfExprColumnCondExpr(col 12:boolean, col 4:string, col 5:string)(
+-- children:
+-- StringGroupColEqualStringScalar(col 5:string, val AAAAAA)(
+-- children:
+-- StringUpper(col 4:string)(
+-- children:
+-- StringTrimCol(col 1:string) -> 4:string
+-- ) -> 5:string
+-- ) -> 12:boolean,
+--
+-- ConstantVectorExpression(val AAAA_BBBB_CCCC_DDDD) -> 4:string,
+--
+-- IfExprStringScalarStringScalar(col 13:boolean, val WWWW_XXXX_YYYY_ZZZZ, val N/A)(
+-- children:
+-- StringGroupColEqualStringScalar(col 6:string, val BBB)(
+-- children:
+-- StringUpper(col 5:string)(
+-- children:
+-- StringTrimCol(col 1:string) -> 5:string
+-- ) -> 6:string
+-- ) -> 13:boolean
+-- ) -> 5:string
+-- ) -> 6:string
+
+-- In the above expression, we can see that col 6, which is the final output column, is also used by a child
+-- expression to store the result of StringUpper. This may cause issues if we do not evaluate the child expression first.
+
+
+explain vectorization detail
+select case
+ when upper(trim(col2)) = 'AAAAAA' then 'AAAA_BBBB_CCCC_DDDD'
+ when upper(trim(col2)) = 'BBB' then 'WWWW_XXXX_YYYY_ZZZZ'
+ else 'N/A'
+end as result
+from main_tbl
+where
cast(concat(substr(trim(col1),1,4),'-',substr(trim(col1),5,2),'-',substr(trim(col1),7,2))
as date) in (select pdate from sub_tbl);
+
+select case
+ when upper(trim(col2)) = 'AAAAAA' then 'AAAA_BBBB_CCCC_DDDD'
+ when upper(trim(col2)) = 'BBB' then 'WWWW_XXXX_YYYY_ZZZZ'
+ else 'N/A'
+end as result
+from main_tbl
+where
cast(concat(substr(trim(col1),1,4),'-',substr(trim(col1),5,2),'-',substr(trim(col1),7,2))
as date) in (select pdate from sub_tbl);
diff --git
a/ql/src/test/results/clientpositive/llap/scratch_col_reused_by_child.q.out
b/ql/src/test/results/clientpositive/llap/scratch_col_reused_by_child.q.out
new file mode 100644
index 00000000000..a769d77d2c0
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/scratch_col_reused_by_child.q.out
@@ -0,0 +1,252 @@
+PREHOOK: query: CREATE EXTERNAL TABLE main_tbl(col1 string, col2 string)
stored as orc
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@main_tbl
+POSTHOOK: query: CREATE EXTERNAL TABLE main_tbl(col1 string, col2 string)
stored as orc
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@main_tbl
+PREHOOK: query: CREATE EXTERNAL TABLE sub_tbl(pdate date) stored as orc
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@sub_tbl
+POSTHOOK: query: CREATE EXTERNAL TABLE sub_tbl(pdate date) stored as orc
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@sub_tbl
+PREHOOK: query: insert into main_tbl
values('20250331','BBB'),('20250331','AAAAAA')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@main_tbl
+POSTHOOK: query: insert into main_tbl
values('20250331','BBB'),('20250331','AAAAAA')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@main_tbl
+POSTHOOK: Lineage: main_tbl.col1 SCRIPT []
+POSTHOOK: Lineage: main_tbl.col2 SCRIPT []
+PREHOOK: query: insert into sub_tbl values('2025-03-31')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@sub_tbl
+POSTHOOK: query: insert into sub_tbl values('2025-03-31')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@sub_tbl
+POSTHOOK: Lineage: sub_tbl.pdate SCRIPT []
+PREHOOK: query: explain vectorization detail
+select case
+ when upper(trim(col2)) = 'AAAAAA' then 'AAAA_BBBB_CCCC_DDDD'
+ when upper(trim(col2)) = 'BBB' then 'WWWW_XXXX_YYYY_ZZZZ'
+ else 'N/A'
+end as result
+from main_tbl
+where
cast(concat(substr(trim(col1),1,4),'-',substr(trim(col1),5,2),'-',substr(trim(col1),7,2))
as date) in (select pdate from sub_tbl)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@main_tbl
+PREHOOK: Input: default@sub_tbl
+#### A masked pattern was here ####
+POSTHOOK: query: explain vectorization detail
+select case
+ when upper(trim(col2)) = 'AAAAAA' then 'AAAA_BBBB_CCCC_DDDD'
+ when upper(trim(col2)) = 'BBB' then 'WWWW_XXXX_YYYY_ZZZZ'
+ else 'N/A'
+end as result
+from main_tbl
+where
cast(concat(substr(trim(col1),1,4),'-',substr(trim(col1),5,2),'-',substr(trim(col1),7,2))
as date) in (select pdate from sub_tbl)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@main_tbl
+POSTHOOK: Input: default@sub_tbl
+#### A masked pattern was here ####
+PLAN VECTORIZATION:
+ enabled: true
+ enabledConditionsMet: [hive.vectorized.execution.enabled IS true]
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Map 1 <- Map 2 (BROADCAST_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: main_tbl
+ filterExpr: CAST( concat(substr(trim(col1), 1, 4), '-',
substr(trim(col1), 5, 2), '-', substr(trim(col1), 7, 2)) AS DATE) is not null
(type: boolean)
+ Statistics: Num rows: 2 Data size: 362 Basic stats: COMPLETE
Column stats: COMPLETE
+ TableScan Vectorization:
+ native: true
+ vectorizationSchemaColumns: [0:col1:string,
1:col2:string, 2:ROW__ID:struct<writeid:bigint,bucketid:int,rowid:bigint>,
3:ROW__IS__DELETED:boolean]
+ Filter Operator
+ Filter Vectorization:
+ className: VectorFilterOperator
+ native: true
+ predicateExpression: SelectColumnIsNotNull(col
11:date)(children: CastStringToDate(col 10:string)(children:
VectorUDFAdaptor(concat(substr(trim(col1), 1, 4), '-', substr(trim(col1), 5,
2), '-', substr(trim(col1), 7, 2)))(children: StringSubstrColStartLen(col
4:string, start 0, length 4)(children: StringTrimCol(col 0:string) -> 4:string)
-> 5:string, StringSubstrColStartLen(col 6:string, start 4, length 2)(children:
StringTrimCol(col 0:string) -> 6:string) -> 7:str [...]
+ predicate: CAST( concat(substr(trim(col1), 1, 4), '-',
substr(trim(col1), 5, 2), '-', substr(trim(col1), 7, 2)) AS DATE) is not null
(type: boolean)
+ Statistics: Num rows: 2 Data size: 362 Basic stats:
COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: col1 (type: string), col2 (type: string)
+ outputColumnNames: _col0, _col1
+ Select Vectorization:
+ className: VectorSelectOperator
+ native: true
+ projectedOutputColumnNums: [0, 1]
+ Statistics: Num rows: 2 Data size: 362 Basic stats:
COMPLETE Column stats: COMPLETE
+ Map Join Operator
+ condition map:
+ Left Semi Join 0 to 1
+ keys:
+ 0 CAST( concat(substr(trim(_col0), 1, 4), '-',
substr(trim(_col0), 5, 2), '-', substr(trim(_col0), 7, 2)) AS DATE) (type: date)
+ 1 _col0 (type: date)
+ Map Join Vectorization:
+ bigTableKeyColumns: 11:date
+ bigTableKeyExpressions: CastStringToDate(col
4:string)(children: VectorUDFAdaptor(concat(substr(trim(_col0), 1, 4), '-',
substr(trim(_col0), 5, 2), '-', substr(trim(_col0), 7, 2)))(children:
StringSubstrColStartLen(col 4:string, start 0, length 4)(children:
StringTrimCol(col 0:string) -> 4:string) -> 5:string,
StringSubstrColStartLen(col 4:string, start 4, length 2)(children:
StringTrimCol(col 0:string) -> 4:string) -> 6:string,
StringSubstrColStartLen(col 4:s [...]
+ bigTableRetainColumnNums: [1]
+ bigTableValueColumns: 1:string
+ className: VectorMapJoinLeftSemiLongOperator
+ native: true
+ nativeConditionsMet:
hive.mapjoin.optimized.hashtable IS true,
hive.vectorized.execution.mapjoin.native.enabled IS true, hive.execution.engine
tez IN [tez] IS true, One MapJoin Condition IS true, No nullsafe IS true, Small
table vectorizes IS true, Optimized Table and Supports Key Types IS true
+ nonOuterSmallTableKeyMapping: []
+ projectedOutput: 1:string
+ hashTableImplementationType: OPTIMIZED
+ outputColumnNames: _col1
+ input vertices:
+ 1 Map 2
+ Statistics: Num rows: 2 Data size: 178 Basic stats:
COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: CASE WHEN ((upper(trim(_col1)) =
'AAAAAA')) THEN ('AAAA_BBBB_CCCC_DDDD') WHEN ((upper(trim(_col1)) = 'BBB'))
THEN ('WWWW_XXXX_YYYY_ZZZZ') ELSE ('N/A') END (type: string)
+ outputColumnNames: _col0
+ Select Vectorization:
+ className: VectorSelectOperator
+ native: true
+ projectedOutputColumnNums: [6]
+ selectExpressions: IfExprColumnCondExpr(col
12:boolean, col 4:stringcol 5:string)(children:
StringGroupColEqualStringScalar(col 5:string, val AAAAAA)(children:
StringUpper(col 4:string)(children: StringTrimCol(col 1:string) -> 4:string) ->
5:string) -> 12:boolean, ConstantVectorExpression(val AAAA_BBBB_CCCC_DDDD) ->
4:string, IfExprStringScalarStringScalar(col 13:boolean, val
WWWW_XXXX_YYYY_ZZZZ, val N/A)(children: StringGroupColEqualStringScalar(col
6:strin [...]
+ Statistics: Num rows: 2 Data size: 206 Basic stats:
COMPLETE Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ File Sink Vectorization:
+ className: VectorFileSinkOperator
+ native: false
+ Statistics: Num rows: 2 Data size: 206 Basic
stats: COMPLETE Column stats: COMPLETE
+ table:
+ input format:
org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format:
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde:
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ Execution mode: vectorized, llap
+ LLAP IO: all inputs
+ Map Vectorization:
+ enabled: true
+ enabledConditionsMet:
hive.vectorized.use.vectorized.input.format IS true
+ inputFormatFeatureSupport: [DECIMAL_64]
+ featureSupportInUse: [DECIMAL_64]
+ inputFileFormats:
org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+ allNative: false
+ usesVectorUDFAdaptor: true
+ vectorized: true
+ rowBatchContext:
+ dataColumnCount: 2
+ includeColumns: [0, 1]
+ dataColumns: col1:string, col2:string
+ partitionColumnCount: 0
+ scratchColumnTypeNames: [string, string, string, string,
string, string, string, bigint, bigint, bigint]
+ Map 2
+ Map Operator Tree:
+ TableScan
+ alias: sub_tbl
+ filterExpr: pdate is not null (type: boolean)
+ Statistics: Num rows: 1 Data size: 56 Basic stats: COMPLETE
Column stats: COMPLETE
+ TableScan Vectorization:
+ native: true
+ vectorizationSchemaColumns: [0:pdate:date,
1:ROW__ID:struct<writeid:bigint,bucketid:int,rowid:bigint>,
2:ROW__IS__DELETED:boolean]
+ Filter Operator
+ Filter Vectorization:
+ className: VectorFilterOperator
+ native: true
+ predicateExpression: SelectColumnIsNotNull(col 0:date)
+ predicate: pdate is not null (type: boolean)
+ Statistics: Num rows: 1 Data size: 56 Basic stats:
COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: pdate (type: date)
+ outputColumnNames: _col0
+ Select Vectorization:
+ className: VectorSelectOperator
+ native: true
+ projectedOutputColumnNums: [0]
+ Statistics: Num rows: 1 Data size: 56 Basic stats:
COMPLETE Column stats: COMPLETE
+ Group By Operator
+ Group By Vectorization:
+ className: VectorGroupByOperator
+ groupByMode: HASH
+ keyExpressions: col 0:date
+ native: false
+ vectorProcessingMode: HASH
+ projectedOutputColumnNums: []
+ keys: _col0 (type: date)
+ minReductionHashAggr: 0.99
+ mode: hash
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 56 Basic stats:
COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: date)
+ null sort order: z
+ sort order: +
+ Map-reduce partition columns: _col0 (type: date)
+ Reduce Sink Vectorization:
+ className: VectorReduceSinkLongOperator
+ keyColumns: 0:date
+ native: true
+ nativeConditionsMet:
hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine
tez IN [tez] IS true, No PTF TopN IS true, No DISTINCT columns IS true,
BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true
+ Statistics: Num rows: 1 Data size: 56 Basic stats:
COMPLETE Column stats: COMPLETE
+ Execution mode: vectorized, llap
+ LLAP IO: all inputs
+ Map Vectorization:
+ enabled: true
+ enabledConditionsMet:
hive.vectorized.use.vectorized.input.format IS true
+ inputFormatFeatureSupport: [DECIMAL_64]
+ featureSupportInUse: [DECIMAL_64]
+ inputFileFormats:
org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+ allNative: false
+ usesVectorUDFAdaptor: false
+ vectorized: true
+ rowBatchContext:
+ dataColumnCount: 1
+ includeColumns: [0]
+ dataColumns: pdate:date
+ partitionColumnCount: 0
+ scratchColumnTypeNames: []
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: select case
+ when upper(trim(col2)) = 'AAAAAA' then 'AAAA_BBBB_CCCC_DDDD'
+ when upper(trim(col2)) = 'BBB' then 'WWWW_XXXX_YYYY_ZZZZ'
+ else 'N/A'
+end as result
+from main_tbl
+where
cast(concat(substr(trim(col1),1,4),'-',substr(trim(col1),5,2),'-',substr(trim(col1),7,2))
as date) in (select pdate from sub_tbl)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@main_tbl
+PREHOOK: Input: default@sub_tbl
+#### A masked pattern was here ####
+POSTHOOK: query: select case
+ when upper(trim(col2)) = 'AAAAAA' then 'AAAA_BBBB_CCCC_DDDD'
+ when upper(trim(col2)) = 'BBB' then 'WWWW_XXXX_YYYY_ZZZZ'
+ else 'N/A'
+end as result
+from main_tbl
+where
cast(concat(substr(trim(col1),1,4),'-',substr(trim(col1),5,2),'-',substr(trim(col1),7,2))
as date) in (select pdate from sub_tbl)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@main_tbl
+POSTHOOK: Input: default@sub_tbl
+#### A masked pattern was here ####
+WWWW_XXXX_YYYY_ZZZZ
+AAAA_BBBB_CCCC_DDDD