This is an automated email from the ASF dual-hosted git repository.

sankarh pushed a commit to branch branch-3
in repository https://gitbox.apache.org/repos/asf/hive.git


The following commit(s) were added to refs/heads/branch-3 by this push:
     new a3070e0dbfe HIVE-27552: Backport of HIVE-22360, HIVE-20619 to branch-3 (#4535)
a3070e0dbfe is described below

commit a3070e0dbfeb5de3620b5c953461f25cce6038fe
Author: Aman Raj <104416558+amanraj2...@users.noreply.github.com>
AuthorDate: Tue Aug 22 13:00:13 2023 +0530

    HIVE-27552: Backport of HIVE-22360, HIVE-20619 to branch-3 (#4535)
    
    * HIVE-22360: MultiDelimitSerDe returns wrong results in last column when the loaded file has more columns than those in table schema (Shubham Chaurasia, reviewed by Sankar Hariappan)
    * HIVE-20619: Include MultiDelimitSerDe in HiveServer2 By Default (Alice Fan, reviewed by Naveen Gangam)
    
    Signed-off-by: Sankar Hariappan <sank...@apache.org>
    Closes (#4535)
---
 data/files/t11_csv_serde.csv                       |  10 +
 data/files/t1_multi_delimit.csv                    |  10 +
 data/files/t2_multi_delimit.csv                    |   4 +
 data/files/t3_multi_delimit.csv                    |  10 +
 .../queries/clientpositive/serde_multi_delimit.q   |  65 ++++++
 .../clientpositive/serde_multi_delimit.q.out       | 232 +++++++++++++++++++++
 .../hadoop/hive}/serde2/MultiDelimitSerDe.java     |  13 +-
 .../apache/hadoop/hive/serde2/lazy/LazyStruct.java |  56 ++---
 8 files changed, 362 insertions(+), 38 deletions(-)
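
For context, the HIVE-22360 fix below hinges on a length mismatch: MultiDelimitSerDe rewrites every multi-char field delimiter to the single byte "\1" before lazy parsing, so each delimiter occurrence shrinks the row. A minimal sketch of that effect, assuming the "^," delimiter used by the new tests (the class name and row values are illustrative, not Hive code):

    import java.util.regex.Pattern;

    public class ShrinkSketch {
      public static void main(String[] args) {
        String fieldDelim = "^,";         // from the "field.delim" serde property
        String raw = "8^,8^,^,8^,8^,8";   // 6 fields, but the test table has 5 columns
        String replaced = raw.replaceAll(Pattern.quote(fieldDelim), "\1");
        // Each of the 5 delimiters shrinks the row by fieldDelim.length() - 1 byte(s),
        // so offsets computed on the raw row must be adjusted downward.
        System.out.println(raw.length() - replaced.length()); // prints 5
      }
    }

The LazyStruct change at the end of this patch performs exactly that downward adjustment.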

diff --git a/data/files/t11_csv_serde.csv b/data/files/t11_csv_serde.csv
new file mode 100644
index 00000000000..6e7060919ee
--- /dev/null
+++ b/data/files/t11_csv_serde.csv
@@ -0,0 +1,10 @@
+1,1,,0,0
+2,1,,0,1
+3,1,,0,0
+4,1,,0,1
+5,5
+
+7777
+8,8,,8,8,8
+9,9,,9,9,9,9,,9,9,9
+10101010
\ No newline at end of file
diff --git a/data/files/t1_multi_delimit.csv b/data/files/t1_multi_delimit.csv
new file mode 100644
index 00000000000..6c4e729f428
--- /dev/null
+++ b/data/files/t1_multi_delimit.csv
@@ -0,0 +1,10 @@
+1^,1^,^,0^,0
+2^,1^,^,0^,1
+3^,1^,^,0^,0
+4^,1^,^,0^,1
+5^,5
+
+7777
+8^,8^,^,8^,8^,8
+9^,9^,^,9^,9^,9^,9^,^,9^,9^,9
+10101010
\ No newline at end of file
diff --git a/data/files/t2_multi_delimit.csv b/data/files/t2_multi_delimit.csv
new file mode 100644
index 00000000000..0dd42e1dfb6
--- /dev/null
+++ b/data/files/t2_multi_delimit.csv
@@ -0,0 +1,4 @@
+1^,1^,^,0^,0^,0
+2^,1^,^,0^,1^,0
+3^,1^,^,0^,0^,0
+4^,1^,^,0^,1^,0
diff --git a/data/files/t3_multi_delimit.csv b/data/files/t3_multi_delimit.csv
new file mode 100644
index 00000000000..8c49f6f3837
--- /dev/null
+++ b/data/files/t3_multi_delimit.csv
@@ -0,0 +1,10 @@
+1^^^^^1^^^^^^^^^^0^^^^^0
+2^^^^^1^^^^^^^^^^0^^^^^1
+3^^^^^1^^^^^^^^^^0^^^^^0
+4^^^^^1^^^^^^^^^^0^^^^^1
+5^^^^^5
+
+7777
+8^^^^^8^^^^^^^^^^8^^^^^8^^^^^8
+9^^^^^9^^^^^^^^^^9^^^^^9^^^^^9
+10101010
\ No newline at end of file
diff --git a/ql/src/test/queries/clientpositive/serde_multi_delimit.q b/ql/src/test/queries/clientpositive/serde_multi_delimit.q
new file mode 100644
index 00000000000..0d851752867
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/serde_multi_delimit.q
@@ -0,0 +1,65 @@
+-- in this table, rows of different lengths (different number of columns) are loaded
+CREATE TABLE t1_multi_delimit(colA int,
+  colB tinyint,
+  colC timestamp,
+  colD smallint,
+  colE smallint)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe'
+WITH SERDEPROPERTIES ("field.delim"="^,")STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH "../../data/files/t1_multi_delimit.csv" INTO TABLE t1_multi_delimit;
+
+SELECT * FROM t1_multi_delimit;
+
+-- in this table, rows of different lengths (different number of columns) are loaded, and it uses the CSV serde
+CREATE TABLE t11_csv_serde(colA int,
+  colB tinyint,
+  colC timestamp,
+  colD smallint,
+  colE smallint)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
+WITH SERDEPROPERTIES ("separatorChar" = ",")STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH "../../data/files/t11_csv_serde.csv" INTO TABLE t11_csv_serde;
+
+SELECT * FROM t11_csv_serde;
+
+-- there should not be any difference between MultiDelimitSerDe table and OpenCSVSerde table results
+
+SELECT EXISTS (
+SELECT colA, colB, colC, colD, colE FROM t1_multi_delimit
+MINUS
+SELECT cast(colA as int), cast(colB as tinyint), cast(colC as timestamp), cast(colD as smallint), cast(colE as smallint) FROM t11_csv_serde
+);
+
+-- in this table, a file having an extra column is loaded
+CREATE TABLE t2_multi_delimit(colA int,
+  colB tinyint,
+  colC timestamp,
+  colD smallint,
+  colE smallint)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe'
+WITH SERDEPROPERTIES ("field.delim"="^,")STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH "../../data/files/t2_multi_delimit.csv" INTO TABLE t2_multi_delimit;
+
+SELECT * FROM t2_multi_delimit;
+
+-- in this table, a delimiter of 5 characters is used
+CREATE TABLE t3_multi_delimit(colA int,
+  colB tinyint,
+  colC timestamp,
+  colD smallint,
+  colE smallint)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe'
+WITH SERDEPROPERTIES ("field.delim"="^^^^^")STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH "../../data/files/t3_multi_delimit.csv" INTO TABLE t3_multi_delimit;
+
+SELECT * FROM t3_multi_delimit;
+
+
+DROP TABLE t1_multi_delimit;
+DROP TABLE t11_csv_serde;
+DROP TABLE t2_multi_delimit;
+DROP TABLE t3_multi_delimit;
\ No newline at end of file
diff --git a/ql/src/test/results/clientpositive/serde_multi_delimit.q.out b/ql/src/test/results/clientpositive/serde_multi_delimit.q.out
new file mode 100644
index 00000000000..f13aa59d5aa
--- /dev/null
+++ b/ql/src/test/results/clientpositive/serde_multi_delimit.q.out
@@ -0,0 +1,232 @@
+PREHOOK: query: CREATE TABLE t1_multi_delimit(colA int,
+  colB tinyint,
+  colC timestamp,
+  colD smallint,
+  colE smallint)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe'
+WITH SERDEPROPERTIES ("field.delim"="^,")STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@t1_multi_delimit
+POSTHOOK: query: CREATE TABLE t1_multi_delimit(colA int,
+  colB tinyint,
+  colC timestamp,
+  colD smallint,
+  colE smallint)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe'
+WITH SERDEPROPERTIES ("field.delim"="^,")STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@t1_multi_delimit
+PREHOOK: query: LOAD DATA LOCAL INPATH "../../data/files/t1_multi_delimit.csv" INTO TABLE t1_multi_delimit
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@t1_multi_delimit
+POSTHOOK: query: LOAD DATA LOCAL INPATH "../../data/files/t1_multi_delimit.csv" INTO TABLE t1_multi_delimit
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@t1_multi_delimit
+PREHOOK: query: SELECT * FROM t1_multi_delimit
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1_multi_delimit
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM t1_multi_delimit
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1_multi_delimit
+#### A masked pattern was here ####
+1      1       NULL    0       0
+2      1       NULL    0       1
+3      1       NULL    0       0
+4      1       NULL    0       1
+5      5       NULL    NULL    NULL
+NULL   NULL    NULL    NULL    NULL
+7777   NULL    NULL    NULL    NULL
+8      8       NULL    8       8
+9      9       NULL    9       9
+10101010       NULL    NULL    NULL    NULL
+PREHOOK: query: CREATE TABLE t11_csv_serde(colA int,
+  colB tinyint,
+  colC timestamp,
+  colD smallint,
+  colE smallint)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
+WITH SERDEPROPERTIES ("separatorChar" = ",")STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@t11_csv_serde
+POSTHOOK: query: CREATE TABLE t11_csv_serde(colA int,
+  colB tinyint,
+  colC timestamp,
+  colD smallint,
+  colE smallint)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
+WITH SERDEPROPERTIES ("separatorChar" = ",")STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@t11_csv_serde
+PREHOOK: query: LOAD DATA LOCAL INPATH "../../data/files/t11_csv_serde.csv" INTO TABLE t11_csv_serde
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@t11_csv_serde
+POSTHOOK: query: LOAD DATA LOCAL INPATH "../../data/files/t11_csv_serde.csv" INTO TABLE t11_csv_serde
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@t11_csv_serde
+PREHOOK: query: SELECT * FROM t11_csv_serde
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t11_csv_serde
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM t11_csv_serde
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t11_csv_serde
+#### A masked pattern was here ####
+1      1               0       0
+2      1               0       1
+3      1               0       0
+4      1               0       1
+5      5       NULL    NULL    NULL
+NULL   NULL    NULL    NULL    NULL
+7777   NULL    NULL    NULL    NULL
+8      8               8       8
+9      9               9       9
+10101010       NULL    NULL    NULL    NULL
+Warning: Shuffle Join JOIN[30][tables = [$hdt$_0, $hdt$_1]] in Stage 'Stage-1:MAPRED' is a cross product
+PREHOOK: query: SELECT EXISTS (
+SELECT colA, colB, colC, colD, colE FROM t1_multi_delimit
+MINUS
+SELECT cast(colA as int), cast(colB as tinyint), cast(colC as timestamp), cast(colD as smallint), cast(colE as smallint) FROM t11_csv_serde
+)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Input: default@t11_csv_serde
+PREHOOK: Input: default@t1_multi_delimit
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT EXISTS (
+SELECT colA, colB, colC, colD, colE FROM t1_multi_delimit
+MINUS
+SELECT cast(colA as int), cast(colB as tinyint), cast(colC as timestamp), cast(colD as smallint), cast(colE as smallint) FROM t11_csv_serde
+)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Input: default@t11_csv_serde
+POSTHOOK: Input: default@t1_multi_delimit
+#### A masked pattern was here ####
+false
+PREHOOK: query: CREATE TABLE t2_multi_delimit(colA int,
+  colB tinyint,
+  colC timestamp,
+  colD smallint,
+  colE smallint)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe'
+WITH SERDEPROPERTIES ("field.delim"="^,")STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@t2_multi_delimit
+POSTHOOK: query: CREATE TABLE t2_multi_delimit(colA int,
+  colB tinyint,
+  colC timestamp,
+  colD smallint,
+  colE smallint)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe'
+WITH SERDEPROPERTIES ("field.delim"="^,")STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@t2_multi_delimit
+PREHOOK: query: LOAD DATA LOCAL INPATH "../../data/files/t2_multi_delimit.csv" INTO TABLE t2_multi_delimit
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@t2_multi_delimit
+POSTHOOK: query: LOAD DATA LOCAL INPATH "../../data/files/t2_multi_delimit.csv" INTO TABLE t2_multi_delimit
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@t2_multi_delimit
+PREHOOK: query: SELECT * FROM t2_multi_delimit
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t2_multi_delimit
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM t2_multi_delimit
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t2_multi_delimit
+#### A masked pattern was here ####
+1      1       NULL    0       0
+2      1       NULL    0       1
+3      1       NULL    0       0
+4      1       NULL    0       1
+PREHOOK: query: CREATE TABLE t3_multi_delimit(colA int,
+  colB tinyint,
+  colC timestamp,
+  colD smallint,
+  colE smallint)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe'
+WITH SERDEPROPERTIES ("field.delim"="^^^^^")STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@t3_multi_delimit
+POSTHOOK: query: CREATE TABLE t3_multi_delimit(colA int,
+  colB tinyint,
+  colC timestamp,
+  colD smallint,
+  colE smallint)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe'
+WITH SERDEPROPERTIES ("field.delim"="^^^^^")STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@t3_multi_delimit
+PREHOOK: query: LOAD DATA LOCAL INPATH "../../data/files/t3_multi_delimit.csv" INTO TABLE t3_multi_delimit
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@t3_multi_delimit
+POSTHOOK: query: LOAD DATA LOCAL INPATH "../../data/files/t3_multi_delimit.csv" INTO TABLE t3_multi_delimit
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@t3_multi_delimit
+PREHOOK: query: SELECT * FROM t3_multi_delimit
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t3_multi_delimit
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM t3_multi_delimit
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t3_multi_delimit
+#### A masked pattern was here ####
+1      1       NULL    0       0
+2      1       NULL    0       1
+3      1       NULL    0       0
+4      1       NULL    0       1
+5      5       NULL    NULL    NULL
+NULL   NULL    NULL    NULL    NULL
+7777   NULL    NULL    NULL    NULL
+8      8       NULL    8       8
+9      9       NULL    9       9
+10101010       NULL    NULL    NULL    NULL
+PREHOOK: query: DROP TABLE t1_multi_delimit
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@t1_multi_delimit
+PREHOOK: Output: default@t1_multi_delimit
+POSTHOOK: query: DROP TABLE t1_multi_delimit
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@t1_multi_delimit
+POSTHOOK: Output: default@t1_multi_delimit
+PREHOOK: query: DROP TABLE t11_csv_serde
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@t11_csv_serde
+PREHOOK: Output: default@t11_csv_serde
+POSTHOOK: query: DROP TABLE t11_csv_serde
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@t11_csv_serde
+POSTHOOK: Output: default@t11_csv_serde
+PREHOOK: query: DROP TABLE t2_multi_delimit
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@t2_multi_delimit
+PREHOOK: Output: default@t2_multi_delimit
+POSTHOOK: query: DROP TABLE t2_multi_delimit
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@t2_multi_delimit
+POSTHOOK: Output: default@t2_multi_delimit
+PREHOOK: query: DROP TABLE t3_multi_delimit
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@t3_multi_delimit
+PREHOOK: Output: default@t3_multi_delimit
+POSTHOOK: query: DROP TABLE t3_multi_delimit
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@t3_multi_delimit
+POSTHOOK: Output: default@t3_multi_delimit
diff --git a/contrib/src/java/org/apache/hadoop/hive/contrib/serde2/MultiDelimitSerDe.java b/serde/src/java/org/apache/hadoop/hive/serde2/MultiDelimitSerDe.java
similarity index 96%
rename from contrib/src/java/org/apache/hadoop/hive/contrib/serde2/MultiDelimitSerDe.java
rename to serde/src/java/org/apache/hadoop/hive/serde2/MultiDelimitSerDe.java
index 6d51bb00ceb..efe6597ffb9 100644
--- a/contrib/src/java/org/apache/hadoop/hive/contrib/serde2/MultiDelimitSerDe.java
+++ b/serde/src/java/org/apache/hadoop/hive/serde2/MultiDelimitSerDe.java
@@ -17,7 +17,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.hadoop.hive.contrib.serde2;
+package org.apache.hadoop.hive.serde2;
 
 import java.io.IOException;
 import java.util.List;
@@ -69,6 +69,9 @@ public class MultiDelimitSerDe extends AbstractEncodingAwareSerDe {
   // Due to HIVE-6404, define our own constant
   private static final String COLLECTION_DELIM = "collection.delim";
 
+  // actual delimiter (fieldDelimited) is replaced by REPLACEMENT_DELIM in the row.
+  private static final String REPLACEMENT_DELIM = "\1";
+
   private int numColumns;
   private String fieldDelimited;
   // we don't support using multiple chars as delimiters within complex types
@@ -90,6 +93,8 @@ public class MultiDelimitSerDe extends AbstractEncodingAwareSerDe {
   private final ByteStream.Output serializeStream = new ByteStream.Output();
   // The Writable to return in serialize
   private final Text serializeCache = new Text();
+  // pattern for delimiter
+  private Pattern delimiterPattern;
 
   @Override
  public void initialize(Configuration conf, Properties tbl) throws SerDeException {
@@ -101,7 +106,7 @@ public class MultiDelimitSerDe extends AbstractEncodingAwareSerDe {
     if (fieldDelimited == null || fieldDelimited.isEmpty()) {
      throw new SerDeException("This table does not have serde property \"field.delim\"!");
     }
-
+    delimiterPattern = Pattern.compile(fieldDelimited, Pattern.LITERAL);
     // get the collection separator and map key separator
     // TODO: use serdeConstants.COLLECTION_DELIM when the typo is fixed
     collSep = LazyUtils.getByte(tbl.getProperty(COLLECTION_DELIM),
@@ -154,10 +159,10 @@ public class MultiDelimitSerDe extends AbstractEncodingAwareSerDe {
     } else {
      throw new SerDeException(getClass() + ": expects either BytesWritable or Text object!");
     }
-    byteArrayRef.setData(rowStr.replaceAll(Pattern.quote(fieldDelimited), "\1").getBytes());
+    byteArrayRef.setData(rowStr.replaceAll(Pattern.quote(fieldDelimited), REPLACEMENT_DELIM).getBytes());
     cachedLazyStruct.init(byteArrayRef, 0, byteArrayRef.getData().length);
     // use the multi-char delimiter to parse the lazy struct
-    cachedLazyStruct.parseMultiDelimit(rowStr.getBytes(), fieldDelimited.getBytes());
+    cachedLazyStruct.parseMultiDelimit(rowStr, delimiterPattern, REPLACEMENT_DELIM);
     return cachedLazyStruct;
   }
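
Taken together, the MultiDelimitSerDe changes above compile the delimiter once with Pattern.LITERAL in initialize() and pass the precompiled pattern, plus the row as a String, down to LazyStruct. A hypothetical standalone rendition of that flow (class name, row, and delimiter are illustrative):

    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    public class DelimiterFlowSketch {
      public static void main(String[] args) {
        // Compiled once, as the patched initialize() does; LITERAL makes "^,"
        // match verbatim instead of '^' anchoring a regex.
        Pattern delimiterPattern = Pattern.compile("^,", Pattern.LITERAL);
        String rowStr = "9^,9^,^,9^,9^,9";
        // The lazy byte parser still reads the shrunken row ("^," -> "\1")...
        byte[] data = rowStr.replaceAll(Pattern.quote("^,"), "\1").getBytes();
        // ...while field boundaries are now located on the original row:
        Matcher m = delimiterPattern.matcher(rowStr);
        while (m.find()) {
          System.out.println(m.start()); // raw-row offset of each delimiter
        }
        System.out.println(data.length); // shrunken length that LazyStruct indexes into
      }
    }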
 
diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyStruct.java b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyStruct.java
index f066aaa3bf5..916382402ba 100644
--- a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyStruct.java
+++ b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyStruct.java
@@ -20,8 +20,9 @@ package org.apache.hadoop.hive.serde2.lazy;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
-import com.google.common.primitives.Bytes;
 import org.apache.hadoop.hive.serde2.SerDeException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -278,8 +279,14 @@ public class LazyStruct extends LazyNonPrimitive<LazySimpleStructObjectInspector
     return serializedSize;
   }
 
-  // parse the struct using multi-char delimiter
-  public void parseMultiDelimit(byte[] rawRow, byte[] fieldDelimit) {
+  /**
+   *  Parses rawRow using multi-char delimiter.
+   *
+   * @param rawRow row to be parsed, delimited by fieldDelimit
+   * @param fieldDelimit pattern of multi-char delimiter
+   * @param replacementDelim delimiter with which fieldDelimit has been replaced in rawRow
+   */
+  public void parseMultiDelimit(final String rawRow, final Pattern fieldDelimit, final String replacementDelim) {
     if (rawRow == null || fieldDelimit == null) {
       return;
     }
@@ -292,47 +299,28 @@ public class LazyStruct extends LazyNonPrimitive<LazySimpleStructObjectInspector
       fieldInited = new boolean[fields.length];
       startPosition = new int[fields.length + 1];
     }
-    // the indexes of the delimiters
-    int[] delimitIndexes = findIndexes(rawRow, fieldDelimit);
-    int diff = fieldDelimit.length - 1;
+    final int delimiterLength = fieldDelimit.toString().length();
+    final int extraBytesInDelim = delimiterLength - replacementDelim.length();
+
     // first field always starts from 0, even when missing
     startPosition[0] = 0;
-    for (int i = 1; i < fields.length; i++) {
-      if (delimitIndexes[i - 1] != -1) {
-        int start = delimitIndexes[i - 1] + fieldDelimit.length;
-        startPosition[i] = start - i * diff;
+    Matcher delimiterMatcher = fieldDelimit.matcher(rawRow);
+    for (int i = 1; i <= fields.length; i++) {
+      if (delimiterMatcher.find()) {
+        // MultiDelimitSerDe replaces the actual multi-char delimiter with replacementDelim ("\1"), which shrinks the row;
+        // however, rawRow here still contains the original multi-char delimiter, so we subtract the extra chars
+        // to match the length of LazyNonPrimitive#bytes used while reading data; see uncheckedGetField().
+        startPosition[i] = delimiterMatcher.start() + delimiterLength - i * extraBytesInDelim;
       } else {
         startPosition[i] = length + 1;
       }
     }
-    startPosition[fields.length] = length + 1;
+
     Arrays.fill(fieldInited, false);
     parsed = true;
   }
 
-  // find all the indexes of the sub byte[]
-  private int[] findIndexes(byte[] array, byte[] target) {
-    if (fields.length <= 1) {
-      return new int[0];
-    }
-    int[] indexes = new int[fields.length - 1];
-    Arrays.fill(indexes, -1);
-    indexes[0] = Bytes.indexOf(array, target);
-    if (indexes[0] == -1) {
-      return indexes;
-    }
-    int indexInNewArray = indexes[0];
-    for (int i = 1; i < indexes.length; i++) {
-      array = Arrays.copyOfRange(array, indexInNewArray + target.length, array.length);
-      indexInNewArray = Bytes.indexOf(array, target);
-      if (indexInNewArray == -1) {
-        break;
-      }
-      indexes[i] = indexInNewArray + indexes[i - 1] + target.length;
-    }
-    return indexes;
-  }
-
   /**
    * Return the data in bytes corresponding to this given struct. This is useful specifically in
    * cases where the data is stored in serialized formats like protobufs or thrift and would need
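
To see the new index arithmetic end to end, here is a hypothetical standalone rendition of the startPosition computation from parseMultiDelimit(), assuming a 5-column table and the "^," delimiter; the variable names mirror the patch, but the program is an illustration, not the Hive class:

    import java.util.Arrays;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    public class MultiDelimitOffsetsSketch {
      public static void main(String[] args) {
        String rawRow = "1^,1^,^,0^,0";  // first row of t1_multi_delimit.csv
        Pattern fieldDelimit = Pattern.compile("^,", Pattern.LITERAL);
        String replacementDelim = "\1";
        int numFields = 5;

        // Length of the row after the serde has replaced "^," with "\1".
        int length = rawRow.replaceAll(Pattern.quote("^,"), replacementDelim).length();
        int delimiterLength = fieldDelimit.toString().length();              // 2
        int extraBytesInDelim = delimiterLength - replacementDelim.length(); // 1

        int[] startPosition = new int[numFields + 1];
        startPosition[0] = 0; // first field always starts at 0, even when missing
        Matcher delimiterMatcher = fieldDelimit.matcher(rawRow);
        for (int i = 1; i <= numFields; i++) {
          if (delimiterMatcher.find()) {
            // Matches are found on the raw row, but offsets must index into the
            // shrunken row, hence the subtraction of i * extraBytesInDelim.
            startPosition[i] = delimiterMatcher.start() + delimiterLength - i * extraBytesInDelim;
          } else {
            startPosition[i] = length + 1; // missing trailing fields
          }
        }
        System.out.println(Arrays.toString(startPosition)); // [0, 2, 4, 5, 7, 9]
      }
    }

When a row carries more fields than the schema, the last column now ends at the next delimiter found by the matcher instead of running to the end of the row, which is what fixes the wrong last-column results described in HIVE-22360.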
