This is an automated email from the ASF dual-hosted git repository. sankarh pushed a commit to branch branch-3 in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/branch-3 by this push: new a3070e0dbfe HIVE-27552: Backport of HIVE-22360, HIVE-20619 to branch-3 (#4535) a3070e0dbfe is described below commit a3070e0dbfeb5de3620b5c953461f25cce6038fe Author: Aman Raj <104416558+amanraj2...@users.noreply.github.com> AuthorDate: Tue Aug 22 13:00:13 2023 +0530 HIVE-27552: Backport of HIVE-22360, HIVE-20619 to branch-3 (#4535) * HIVE-22360: MultiDelimitSerDe returns wrong results in last column when the loaded file has more columns than those in table schema (Shubham Chaurasia, reviewed by Sankar Hariappan) * HIVE-20619: Include MultiDelimitSerDe in HiveServer2 By Default (Alice Fan, reviewed by Naveen Gangam) Signed-off-by: Sankar Hariappan <sank...@apache.org> Closes (#4535) --- data/files/t11_csv_serde.csv | 10 + data/files/t1_multi_delimit.csv | 10 + data/files/t2_multi_delimit.csv | 4 + data/files/t3_multi_delimit.csv | 10 + .../queries/clientpositive/serde_multi_delimit.q | 65 ++++++ .../clientpositive/serde_multi_delimit.q.out | 232 +++++++++++++++++++++ .../hadoop/hive}/serde2/MultiDelimitSerDe.java | 13 +- .../apache/hadoop/hive/serde2/lazy/LazyStruct.java | 56 ++--- 8 files changed, 362 insertions(+), 38 deletions(-) diff --git a/data/files/t11_csv_serde.csv b/data/files/t11_csv_serde.csv new file mode 100644 index 00000000000..6e7060919ee --- /dev/null +++ b/data/files/t11_csv_serde.csv @@ -0,0 +1,10 @@ +1,1,,0,0 +2,1,,0,1 +3,1,,0,0 +4,1,,0,1 +5,5 + +7777 +8,8,,8,8,8 +9,9,,9,9,9,9,,9,9,9 +10101010 \ No newline at end of file diff --git a/data/files/t1_multi_delimit.csv b/data/files/t1_multi_delimit.csv new file mode 100644 index 00000000000..6c4e729f428 --- /dev/null +++ b/data/files/t1_multi_delimit.csv @@ -0,0 +1,10 @@ +1^,1^,^,0^,0 +2^,1^,^,0^,1 +3^,1^,^,0^,0 +4^,1^,^,0^,1 +5^,5 + +7777 +8^,8^,^,8^,8^,8 +9^,9^,^,9^,9^,9^,9^,^,9^,9^,9 +10101010 \ No newline at end of file diff --git a/data/files/t2_multi_delimit.csv b/data/files/t2_multi_delimit.csv new file mode 100644 index 00000000000..0dd42e1dfb6 --- /dev/null +++ b/data/files/t2_multi_delimit.csv @@ -0,0 +1,4 @@ +1^,1^,^,0^,0^,0 +2^,1^,^,0^,1^,0 +3^,1^,^,0^,0^,0 +4^,1^,^,0^,1^,0 diff --git a/data/files/t3_multi_delimit.csv b/data/files/t3_multi_delimit.csv new file mode 100644 index 00000000000..8c49f6f3837 --- /dev/null +++ b/data/files/t3_multi_delimit.csv @@ -0,0 +1,10 @@ +1^^^^^1^^^^^^^^^^0^^^^^0 +2^^^^^1^^^^^^^^^^0^^^^^1 +3^^^^^1^^^^^^^^^^0^^^^^0 +4^^^^^1^^^^^^^^^^0^^^^^1 +5^^^^^5 + +7777 +8^^^^^8^^^^^^^^^^8^^^^^8^^^^^8 +9^^^^^9^^^^^^^^^^9^^^^^9^^^^^9 +10101010 \ No newline at end of file diff --git a/ql/src/test/queries/clientpositive/serde_multi_delimit.q b/ql/src/test/queries/clientpositive/serde_multi_delimit.q new file mode 100644 index 00000000000..0d851752867 --- /dev/null +++ b/ql/src/test/queries/clientpositive/serde_multi_delimit.q @@ -0,0 +1,65 @@ +-- in this table, rows of different lengths(different number of columns) are loaded +CREATE TABLE t1_multi_delimit(colA int, + colB tinyint, + colC timestamp, + colD smallint, + colE smallint) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe' +WITH SERDEPROPERTIES ("field.delim"="^,")STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH "../../data/files/t1_multi_delimit.csv" INTO TABLE t1_multi_delimit; + +SELECT * FROM t1_multi_delimit; + +-- in this table, rows of different lengths(different number of columns) and it uses csv serde +CREATE TABLE t11_csv_serde(colA int, + colB tinyint, + colC timestamp, + colD smallint, + colE smallint) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' +WITH SERDEPROPERTIES ("separatorChar" = ",")STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH "../../data/files/t11_csv_serde.csv" INTO TABLE t11_csv_serde; + +SELECT * FROM t11_csv_serde; + +-- there should not be any difference between MultiDelimitSerDe table and OpenCSVSerde table results + +SELECT EXISTS ( +SELECT colA, colB, colC, colD, colE FROM t1_multi_delimit +MINUS +SELECT cast(colA as int), cast(colB as tinyint), cast(colC as timestamp), cast(colD as smallint), cast(colE as smallint) FROM t11_csv_serde +); + +-- in this table, file having extra column is loaded +CREATE TABLE t2_multi_delimit(colA int, + colB tinyint, + colC timestamp, + colD smallint, + colE smallint) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe' +WITH SERDEPROPERTIES ("field.delim"="^,")STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH "../../data/files/t2_multi_delimit.csv" INTO TABLE t2_multi_delimit; + +SELECT * FROM t2_multi_delimit; + +-- in this table, delimiter of 5 characters is used +CREATE TABLE t3_multi_delimit(colA int, + colB tinyint, + colC timestamp, + colD smallint, + colE smallint) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe' +WITH SERDEPROPERTIES ("field.delim"="^^^^^")STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH "../../data/files/t3_multi_delimit.csv" INTO TABLE t3_multi_delimit; + +SELECT * FROM t3_multi_delimit; + + +DROP TABLE t1_multi_delimit; +DROP TABLE t11_csv_serde; +DROP TABLE t2_multi_delimit; +DROP TABLE t3_multi_delimit; \ No newline at end of file diff --git a/ql/src/test/results/clientpositive/serde_multi_delimit.q.out b/ql/src/test/results/clientpositive/serde_multi_delimit.q.out new file mode 100644 index 00000000000..f13aa59d5aa --- /dev/null +++ b/ql/src/test/results/clientpositive/serde_multi_delimit.q.out @@ -0,0 +1,232 @@ +PREHOOK: query: CREATE TABLE t1_multi_delimit(colA int, + colB tinyint, + colC timestamp, + colD smallint, + colE smallint) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe' +WITH SERDEPROPERTIES ("field.delim"="^,")STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@t1_multi_delimit +POSTHOOK: query: CREATE TABLE t1_multi_delimit(colA int, + colB tinyint, + colC timestamp, + colD smallint, + colE smallint) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe' +WITH SERDEPROPERTIES ("field.delim"="^,")STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@t1_multi_delimit +PREHOOK: query: LOAD DATA LOCAL INPATH "../../data/files/t1_multi_delimit.csv" INTO TABLE t1_multi_delimit +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@t1_multi_delimit +POSTHOOK: query: LOAD DATA LOCAL INPATH "../../data/files/t1_multi_delimit.csv" INTO TABLE t1_multi_delimit +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@t1_multi_delimit +PREHOOK: query: SELECT * FROM t1_multi_delimit +PREHOOK: type: QUERY +PREHOOK: Input: default@t1_multi_delimit +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM t1_multi_delimit +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1_multi_delimit +#### A masked pattern was here #### +1 1 NULL 0 0 +2 1 NULL 0 1 +3 1 NULL 0 0 +4 1 NULL 0 1 +5 5 NULL NULL NULL +NULL NULL NULL NULL NULL +7777 NULL NULL NULL NULL +8 8 NULL 8 8 +9 9 NULL 9 9 +10101010 NULL NULL NULL NULL +PREHOOK: query: CREATE TABLE t11_csv_serde(colA int, + colB tinyint, + colC timestamp, + colD smallint, + colE smallint) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' +WITH SERDEPROPERTIES ("separatorChar" = ",")STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@t11_csv_serde +POSTHOOK: query: CREATE TABLE t11_csv_serde(colA int, + colB tinyint, + colC timestamp, + colD smallint, + colE smallint) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' +WITH SERDEPROPERTIES ("separatorChar" = ",")STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@t11_csv_serde +PREHOOK: query: LOAD DATA LOCAL INPATH "../../data/files/t11_csv_serde.csv" INTO TABLE t11_csv_serde +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@t11_csv_serde +POSTHOOK: query: LOAD DATA LOCAL INPATH "../../data/files/t11_csv_serde.csv" INTO TABLE t11_csv_serde +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@t11_csv_serde +PREHOOK: query: SELECT * FROM t11_csv_serde +PREHOOK: type: QUERY +PREHOOK: Input: default@t11_csv_serde +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM t11_csv_serde +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t11_csv_serde +#### A masked pattern was here #### +1 1 0 0 +2 1 0 1 +3 1 0 0 +4 1 0 1 +5 5 NULL NULL NULL +NULL NULL NULL NULL NULL +7777 NULL NULL NULL NULL +8 8 8 8 +9 9 9 9 +10101010 NULL NULL NULL NULL +Warning: Shuffle Join JOIN[30][tables = [$hdt$_0, $hdt$_1]] in Stage 'Stage-1:MAPRED' is a cross product +PREHOOK: query: SELECT EXISTS ( +SELECT colA, colB, colC, colD, colE FROM t1_multi_delimit +MINUS +SELECT cast(colA as int), cast(colB as tinyint), cast(colC as timestamp), cast(colD as smallint), cast(colE as smallint) FROM t11_csv_serde +) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Input: default@t11_csv_serde +PREHOOK: Input: default@t1_multi_delimit +#### A masked pattern was here #### +POSTHOOK: query: SELECT EXISTS ( +SELECT colA, colB, colC, colD, colE FROM t1_multi_delimit +MINUS +SELECT cast(colA as int), cast(colB as tinyint), cast(colC as timestamp), cast(colD as smallint), cast(colE as smallint) FROM t11_csv_serde +) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Input: default@t11_csv_serde +POSTHOOK: Input: default@t1_multi_delimit +#### A masked pattern was here #### +false +PREHOOK: query: CREATE TABLE t2_multi_delimit(colA int, + colB tinyint, + colC timestamp, + colD smallint, + colE smallint) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe' +WITH SERDEPROPERTIES ("field.delim"="^,")STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@t2_multi_delimit +POSTHOOK: query: CREATE TABLE t2_multi_delimit(colA int, + colB tinyint, + colC timestamp, + colD smallint, + colE smallint) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe' +WITH SERDEPROPERTIES ("field.delim"="^,")STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@t2_multi_delimit +PREHOOK: query: LOAD DATA LOCAL INPATH "../../data/files/t2_multi_delimit.csv" INTO TABLE t2_multi_delimit +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@t2_multi_delimit +POSTHOOK: query: LOAD DATA LOCAL INPATH "../../data/files/t2_multi_delimit.csv" INTO TABLE t2_multi_delimit +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@t2_multi_delimit +PREHOOK: query: SELECT * FROM t2_multi_delimit +PREHOOK: type: QUERY +PREHOOK: Input: default@t2_multi_delimit +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM t2_multi_delimit +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t2_multi_delimit +#### A masked pattern was here #### +1 1 NULL 0 0 +2 1 NULL 0 1 +3 1 NULL 0 0 +4 1 NULL 0 1 +PREHOOK: query: CREATE TABLE t3_multi_delimit(colA int, + colB tinyint, + colC timestamp, + colD smallint, + colE smallint) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe' +WITH SERDEPROPERTIES ("field.delim"="^^^^^")STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@t3_multi_delimit +POSTHOOK: query: CREATE TABLE t3_multi_delimit(colA int, + colB tinyint, + colC timestamp, + colD smallint, + colE smallint) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe' +WITH SERDEPROPERTIES ("field.delim"="^^^^^")STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@t3_multi_delimit +PREHOOK: query: LOAD DATA LOCAL INPATH "../../data/files/t3_multi_delimit.csv" INTO TABLE t3_multi_delimit +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@t3_multi_delimit +POSTHOOK: query: LOAD DATA LOCAL INPATH "../../data/files/t3_multi_delimit.csv" INTO TABLE t3_multi_delimit +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@t3_multi_delimit +PREHOOK: query: SELECT * FROM t3_multi_delimit +PREHOOK: type: QUERY +PREHOOK: Input: default@t3_multi_delimit +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM t3_multi_delimit +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t3_multi_delimit +#### A masked pattern was here #### +1 1 NULL 0 0 +2 1 NULL 0 1 +3 1 NULL 0 0 +4 1 NULL 0 1 +5 5 NULL NULL NULL +NULL NULL NULL NULL NULL +7777 NULL NULL NULL NULL +8 8 NULL 8 8 +9 9 NULL 9 9 +10101010 NULL NULL NULL NULL +PREHOOK: query: DROP TABLE t1_multi_delimit +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@t1_multi_delimit +PREHOOK: Output: default@t1_multi_delimit +POSTHOOK: query: DROP TABLE t1_multi_delimit +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@t1_multi_delimit +POSTHOOK: Output: default@t1_multi_delimit +PREHOOK: query: DROP TABLE t11_csv_serde +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@t11_csv_serde +PREHOOK: Output: default@t11_csv_serde +POSTHOOK: query: DROP TABLE t11_csv_serde +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@t11_csv_serde +POSTHOOK: Output: default@t11_csv_serde +PREHOOK: query: DROP TABLE t2_multi_delimit +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@t2_multi_delimit +PREHOOK: Output: default@t2_multi_delimit +POSTHOOK: query: DROP TABLE t2_multi_delimit +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@t2_multi_delimit +POSTHOOK: Output: default@t2_multi_delimit +PREHOOK: query: DROP TABLE t3_multi_delimit +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@t3_multi_delimit +PREHOOK: Output: default@t3_multi_delimit +POSTHOOK: query: DROP TABLE t3_multi_delimit +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@t3_multi_delimit +POSTHOOK: Output: default@t3_multi_delimit diff --git a/contrib/src/java/org/apache/hadoop/hive/contrib/serde2/MultiDelimitSerDe.java b/serde/src/java/org/apache/hadoop/hive/serde2/MultiDelimitSerDe.java similarity index 96% rename from contrib/src/java/org/apache/hadoop/hive/contrib/serde2/MultiDelimitSerDe.java rename to serde/src/java/org/apache/hadoop/hive/serde2/MultiDelimitSerDe.java index 6d51bb00ceb..efe6597ffb9 100644 --- a/contrib/src/java/org/apache/hadoop/hive/contrib/serde2/MultiDelimitSerDe.java +++ b/serde/src/java/org/apache/hadoop/hive/serde2/MultiDelimitSerDe.java @@ -17,7 +17,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.hadoop.hive.contrib.serde2; +package org.apache.hadoop.hive.serde2; import java.io.IOException; import java.util.List; @@ -69,6 +69,9 @@ public class MultiDelimitSerDe extends AbstractEncodingAwareSerDe { // Due to HIVE-6404, define our own constant private static final String COLLECTION_DELIM = "collection.delim"; + // actual delimiter(fieldDelimited) is replaced by REPLACEMENT_DELIM in row. + private static final String REPLACEMENT_DELIM = "\1"; + private int numColumns; private String fieldDelimited; // we don't support using multiple chars as delimiters within complex types @@ -90,6 +93,8 @@ public class MultiDelimitSerDe extends AbstractEncodingAwareSerDe { private final ByteStream.Output serializeStream = new ByteStream.Output(); // The Writable to return in serialize private final Text serializeCache = new Text(); + // pattern for delimiter + private Pattern delimiterPattern; @Override public void initialize(Configuration conf, Properties tbl) throws SerDeException { @@ -101,7 +106,7 @@ public class MultiDelimitSerDe extends AbstractEncodingAwareSerDe { if (fieldDelimited == null || fieldDelimited.isEmpty()) { throw new SerDeException("This table does not have serde property \"field.delim\"!"); } - + delimiterPattern = Pattern.compile(fieldDelimited, Pattern.LITERAL); // get the collection separator and map key separator // TODO: use serdeConstants.COLLECTION_DELIM when the typo is fixed collSep = LazyUtils.getByte(tbl.getProperty(COLLECTION_DELIM), @@ -154,10 +159,10 @@ public class MultiDelimitSerDe extends AbstractEncodingAwareSerDe { } else { throw new SerDeException(getClass() + ": expects either BytesWritable or Text object!"); } - byteArrayRef.setData(rowStr.replaceAll(Pattern.quote(fieldDelimited), "\1").getBytes()); + byteArrayRef.setData(rowStr.replaceAll(Pattern.quote(fieldDelimited), REPLACEMENT_DELIM).getBytes()); cachedLazyStruct.init(byteArrayRef, 0, byteArrayRef.getData().length); // use the multi-char delimiter to parse the lazy struct - cachedLazyStruct.parseMultiDelimit(rowStr.getBytes(), fieldDelimited.getBytes()); + cachedLazyStruct.parseMultiDelimit(rowStr, delimiterPattern, REPLACEMENT_DELIM); return cachedLazyStruct; } diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyStruct.java b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyStruct.java index f066aaa3bf5..916382402ba 100644 --- a/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyStruct.java +++ b/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazyStruct.java @@ -20,8 +20,9 @@ package org.apache.hadoop.hive.serde2.lazy; import java.util.ArrayList; import java.util.Arrays; import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; -import com.google.common.primitives.Bytes; import org.apache.hadoop.hive.serde2.SerDeException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -278,8 +279,14 @@ public class LazyStruct extends LazyNonPrimitive<LazySimpleStructObjectInspector return serializedSize; } - // parse the struct using multi-char delimiter - public void parseMultiDelimit(byte[] rawRow, byte[] fieldDelimit) { + /** + * Parses rawRow using multi-char delimiter. + * + * @param rawRow row to be parsed, delimited by fieldDelimit + * @param fieldDelimit pattern of multi-char delimiter + * @param replacementDelim delimiter with which fieldDelimit has been replaced in rawRow + */ + public void parseMultiDelimit(final String rawRow, final Pattern fieldDelimit, final String replacementDelim) { if (rawRow == null || fieldDelimit == null) { return; } @@ -292,47 +299,28 @@ public class LazyStruct extends LazyNonPrimitive<LazySimpleStructObjectInspector fieldInited = new boolean[fields.length]; startPosition = new int[fields.length + 1]; } - // the indexes of the delimiters - int[] delimitIndexes = findIndexes(rawRow, fieldDelimit); - int diff = fieldDelimit.length - 1; + final int delimiterLength = fieldDelimit.toString().length(); + final int extraBytesInDelim = delimiterLength - replacementDelim.length(); + // first field always starts from 0, even when missing startPosition[0] = 0; - for (int i = 1; i < fields.length; i++) { - if (delimitIndexes[i - 1] != -1) { - int start = delimitIndexes[i - 1] + fieldDelimit.length; - startPosition[i] = start - i * diff; + Matcher delimiterMatcher = fieldDelimit.matcher(rawRow); + for (int i = 1; i <= fields.length; i++) { + if (delimiterMatcher.find()) { + // MultiDelimitSerDe replaces actual multi-char delimiter by replacementDelim("\1") which reduces the length + // however here we are getting rawRow with original multi-char delimiter + // due to this we have to subtract those extra chars to match length of LazyNonPrimitive#bytes which are used + // while reading data, see uncheckedGetField() + startPosition[i] = delimiterMatcher.start() + delimiterLength - i * extraBytesInDelim; } else { startPosition[i] = length + 1; } } - startPosition[fields.length] = length + 1; + Arrays.fill(fieldInited, false); parsed = true; } - // find all the indexes of the sub byte[] - private int[] findIndexes(byte[] array, byte[] target) { - if (fields.length <= 1) { - return new int[0]; - } - int[] indexes = new int[fields.length - 1]; - Arrays.fill(indexes, -1); - indexes[0] = Bytes.indexOf(array, target); - if (indexes[0] == -1) { - return indexes; - } - int indexInNewArray = indexes[0]; - for (int i = 1; i < indexes.length; i++) { - array = Arrays.copyOfRange(array, indexInNewArray + target.length, array.length); - indexInNewArray = Bytes.indexOf(array, target); - if (indexInNewArray == -1) { - break; - } - indexes[i] = indexInNewArray + indexes[i - 1] + target.length; - } - return indexes; - } - /** * Return the data in bytes corresponding to this given struct. This is useful specifically in * cases where the data is stored in serialized formats like protobufs or thrift and would need