This is an automated email from the ASF dual-hosted git repository. zhangbutao pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push: new 36d32ec7fd6 HIVE-26713: StringExpr ArrayIndexOutOfBoundsException with LIKE '%xxx%' (#4999)(Ryu Kobayashi, reviewed by Attila Turoczy, Butao Zhang) 36d32ec7fd6 is described below commit 36d32ec7fd6ac9053e6a9d28f01dd431149a5ac4 Author: Ryu Kobayashi <beter....@gmail.com> AuthorDate: Tue Jan 23 13:05:48 2024 +0900 HIVE-26713: StringExpr ArrayIndexOutOfBoundsException with LIKE '%xxx%' (#4999)(Ryu Kobayashi, reviewed by Attila Turoczy, Butao Zhang) --- data/files/control_characters.txt | 1 + .../clientpositive/like_control_characters.q | 13 +++ .../llap/like_control_characters.q.out | 93 ++++++++++++++++++++++ .../ql/exec/vector/expressions/StringExpr.java | 10 ++- .../ql/exec/vector/expressions/TestStringExpr.java | 24 +++++- 5 files changed, 138 insertions(+), 3 deletions(-) diff --git a/data/files/control_characters.txt b/data/files/control_characters.txt new file mode 100644 index 00000000000..4e3fc6c4535 --- /dev/null +++ b/data/files/control_characters.txt @@ -0,0 +1 @@ +abcde�fghi \ No newline at end of file diff --git a/ql/src/test/queries/clientpositive/like_control_characters.q b/ql/src/test/queries/clientpositive/like_control_characters.q new file mode 100644 index 00000000000..5f9772ed2ef --- /dev/null +++ b/ql/src/test/queries/clientpositive/like_control_characters.q @@ -0,0 +1,13 @@ +set hive.mapred.mode=nonstrict; +set hive.explain.user=false; +set hive.vectorized.execution.enabled=true; + +create temporary table foo (col string); + +-- SORT_QUERY_RESULTS + +LOAD DATA LOCAL INPATH '../../data/files/control_characters.txt' INTO TABLE foo; + +explain select col, count(*) from foo where col like '%fg%' group by col; +select col, count(*) from foo where col like '%fg%' group by col; + diff --git a/ql/src/test/results/clientpositive/llap/like_control_characters.q.out b/ql/src/test/results/clientpositive/llap/like_control_characters.q.out new file mode 100644 index 00000000000..14aa86328db --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/like_control_characters.q.out @@ -0,0 +1,93 @@ +PREHOOK: query: create temporary table foo (col string) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@foo +POSTHOOK: query: create temporary table foo (col string) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@foo +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/control_characters.txt' INTO TABLE foo +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@foo +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/control_characters.txt' INTO TABLE foo +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@foo +PREHOOK: query: explain select col, count(*) from foo where col like '%fg%' group by col +PREHOOK: type: QUERY +PREHOOK: Input: default@foo +#### A masked pattern was here #### +POSTHOOK: query: explain select col, count(*) from foo where col like '%fg%' group by col +POSTHOOK: type: QUERY +POSTHOOK: Input: default@foo +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: foo + filterExpr: (col like '%fg%') (type: boolean) + Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (col like '%fg%') (type: boolean) + Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count() + keys: col (type: string) + minReductionHashAggr: 0.99 + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: bigint) + Execution mode: vectorized, llap + LLAP IO: all inputs + Reducer 2 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select col, count(*) from foo where col like '%fg%' group by col +PREHOOK: type: QUERY +PREHOOK: Input: default@foo +#### A masked pattern was here #### +POSTHOOK: query: select col, count(*) from foo where col like '%fg%' group by col +POSTHOOK: type: QUERY +POSTHOOK: Input: default@foo +#### A masked pattern was here #### +abcde�fghi 1 diff --git a/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java b/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java index b6d3184ffed..34097167ac1 100644 --- a/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java +++ b/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java @@ -342,7 +342,15 @@ public class StringExpr { } s_tmp--; } - next += shift[input[next] & MAX_BYTE]; + + // if the character string contains control characters, + // overflow occurs. + int shiftIndex = input[next] & MAX_BYTE; + if (shiftIndex >= MAX_BYTE) { + next++; + } else { + next += shift[shiftIndex]; + } } return -1; } diff --git a/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestStringExpr.java b/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestStringExpr.java index 6fb66115277..483eb68b979 100644 --- a/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestStringExpr.java +++ b/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestStringExpr.java @@ -20,9 +20,11 @@ package org.apache.hadoop.hive.ql.exec.vector.expressions; import org.junit.Test; -import java.nio.charset.StandardCharsets; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.*; +import java.io.ByteArrayOutputStream; +import java.nio.charset.StandardCharsets; public class TestStringExpr { @Test @@ -49,6 +51,24 @@ public class TestStringExpr { assertEquals("Testing match at end of string", 24, find(pattern, input4)); } + @Test + public void testControlCharacters() throws Exception { + StringExpr.Finder pattern = compile("pattern"); + assertNotNull(pattern); + + byte b = -1; + byte[] controlBytes1 = "abcedf".getBytes(StandardCharsets.UTF_8); + byte[] controlBytes2 = "pattern".getBytes(StandardCharsets.UTF_8); + ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); + outputStream.write(controlBytes1); + outputStream.write(b); + outputStream.write(controlBytes2); + byte[] controlChar = outputStream.toByteArray(); + outputStream.close(); + + assertEquals("Testing valid match", 7, pattern.find(controlChar, 0, controlChar.length)); + } + private StringExpr.Finder compile(String pattern) { return StringExpr.compile(pattern.getBytes(StandardCharsets.UTF_8)); }