This is an automated email from the ASF dual-hosted git repository.

zhangbutao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git


The following commit(s) were added to refs/heads/master by this push:
     new 36d32ec7fd6 HIVE-26713: StringExpr ArrayIndexOutOfBoundsException with LIKE '%xxx%' (#4999)(Ryu Kobayashi, reviewed by Attila Turoczy, Butao Zhang)
36d32ec7fd6 is described below

commit 36d32ec7fd6ac9053e6a9d28f01dd431149a5ac4
Author: Ryu Kobayashi <beter....@gmail.com>
AuthorDate: Tue Jan 23 13:05:48 2024 +0900

    HIVE-26713: StringExpr ArrayIndexOutOfBoundsException with LIKE '%xxx%' (#4999)(Ryu Kobayashi, reviewed by Attila Turoczy, Butao Zhang)
---
 data/files/control_characters.txt                  |  1 +
 .../clientpositive/like_control_characters.q       | 13 +++
 .../llap/like_control_characters.q.out             | 93 ++++++++++++++++++++++
 .../ql/exec/vector/expressions/StringExpr.java     | 10 ++-
 .../ql/exec/vector/expressions/TestStringExpr.java | 24 +++++-
 5 files changed, 138 insertions(+), 3 deletions(-)

diff --git a/data/files/control_characters.txt b/data/files/control_characters.txt
new file mode 100644
index 00000000000..4e3fc6c4535
--- /dev/null
+++ b/data/files/control_characters.txt
@@ -0,0 +1 @@
+abcde�fghi
\ No newline at end of file
diff --git a/ql/src/test/queries/clientpositive/like_control_characters.q b/ql/src/test/queries/clientpositive/like_control_characters.q
new file mode 100644
index 00000000000..5f9772ed2ef
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/like_control_characters.q
@@ -0,0 +1,13 @@
+set hive.mapred.mode=nonstrict;
+set hive.explain.user=false;
+set hive.vectorized.execution.enabled=true;
+
+create temporary table foo (col string);
+
+-- SORT_QUERY_RESULTS
+
+LOAD DATA LOCAL INPATH '../../data/files/control_characters.txt' INTO TABLE foo;
+
+explain select col, count(*) from foo where col like '%fg%' group by col;
+select col, count(*) from foo where col like '%fg%' group by col;
+
diff --git a/ql/src/test/results/clientpositive/llap/like_control_characters.q.out b/ql/src/test/results/clientpositive/llap/like_control_characters.q.out
new file mode 100644
index 00000000000..14aa86328db
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/like_control_characters.q.out
@@ -0,0 +1,93 @@
+PREHOOK: query: create temporary table foo (col string)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@foo
+POSTHOOK: query: create temporary table foo (col string)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@foo
+PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/control_characters.txt' INTO TABLE foo
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@foo
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/control_characters.txt' INTO TABLE foo
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@foo
+PREHOOK: query: explain select col, count(*) from foo where col like '%fg%' group by col
+PREHOOK: type: QUERY
+PREHOOK: Input: default@foo
+#### A masked pattern was here ####
+POSTHOOK: query: explain select col, count(*) from foo where col like '%fg%' group by col
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@foo
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Reducer 2 <- Map 1 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: foo
+                  filterExpr: (col like '%fg%') (type: boolean)
+                  Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: NONE
+                  Filter Operator
+                    predicate: (col like '%fg%') (type: boolean)
+                    Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: NONE
+                    Group By Operator
+                      aggregations: count()
+                      keys: col (type: string)
+                      minReductionHashAggr: 0.99
+                      mode: hash
+                      outputColumnNames: _col0, _col1
+                      Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: NONE
+                      Reduce Output Operator
+                        key expressions: _col0 (type: string)
+                        null sort order: z
+                        sort order: +
+                        Map-reduce partition columns: _col0 (type: string)
+                        Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: NONE
+                        value expressions: _col1 (type: bigint)
+            Execution mode: vectorized, llap
+            LLAP IO: all inputs
+        Reducer 2 
+            Execution mode: vectorized, llap
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: count(VALUE._col0)
+                keys: KEY._col0 (type: string)
+                mode: mergepartial
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: NONE
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: NONE
+                  table:
+                      input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select col, count(*) from foo where col like '%fg%' group by col
+PREHOOK: type: QUERY
+PREHOOK: Input: default@foo
+#### A masked pattern was here ####
+POSTHOOK: query: select col, count(*) from foo where col like '%fg%' group by col
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@foo
+#### A masked pattern was here ####
+abcde�fghi     1
diff --git a/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java b/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java
index b6d3184ffed..34097167ac1 100644
--- a/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java
+++ b/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java
@@ -342,7 +342,15 @@ public class StringExpr {
           }
           s_tmp--;
         }
-        next += shift[input[next] & MAX_BYTE];
+
+        // if the character string contains control characters,
+        // overflow occurs.
+        int shiftIndex = input[next] & MAX_BYTE;
+        if (shiftIndex >= MAX_BYTE) {
+          next++;
+        } else {
+          next += shift[shiftIndex];
+        }
       }
       return -1;
     }
diff --git a/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestStringExpr.java b/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestStringExpr.java
index 6fb66115277..483eb68b979 100644
--- a/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestStringExpr.java
+++ b/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestStringExpr.java
@@ -20,9 +20,11 @@ package org.apache.hadoop.hive.ql.exec.vector.expressions;
 
 import org.junit.Test;
 
-import java.nio.charset.StandardCharsets;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
 
-import static org.junit.Assert.*;
+import java.io.ByteArrayOutputStream;
+import java.nio.charset.StandardCharsets;
 
 public class TestStringExpr {
   @Test
@@ -49,6 +51,24 @@ public class TestStringExpr {
     assertEquals("Testing match at end of string", 24, find(pattern, input4));
   }
 
+  @Test
+  public void testControlCharacters() throws Exception {
+    StringExpr.Finder pattern = compile("pattern");
+    assertNotNull(pattern);
+
+    byte b = -1;
+    byte[] controlBytes1 = "abcedf".getBytes(StandardCharsets.UTF_8);
+    byte[] controlBytes2 = "pattern".getBytes(StandardCharsets.UTF_8);
+    ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
+    outputStream.write(controlBytes1);
+    outputStream.write(b);
+    outputStream.write(controlBytes2);
+    byte[] controlChar = outputStream.toByteArray();
+    outputStream.close();
+
+    assertEquals("Testing valid match", 7, pattern.find(controlChar, 0, controlChar.length));
+  }
+
   private StringExpr.Finder compile(String pattern) {
     return StringExpr.compile(pattern.getBytes(StandardCharsets.UTF_8));
   }

Reply via email to