This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch branch-1.7
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/branch-1.7 by this push:
new 118d110 ORC-1034: Fix the indexOf algorithm in `FileDump` (#943)
118d110 is described below
commit 118d110aefbb13f76cb8adeb8b44c9a9f871eced
Author: Guiyanakaung <[email protected]>
AuthorDate: Thu Oct 21 04:21:20 2021 +0800
ORC-1034: Fix the indexOf algorithm in `FileDump` (#943)
The previous matching algorithm is wrong because `i` does not backtrack after a
partial match fails midway. For example, with data = OOORC, pattern = ORC, and
index = 1, the algorithm returns -1 instead of the correct index 2.
This PR fixes the `indexOf` algorithm.
`indexOf` is used to find the ORC file ending identifier in order to recover the file,
so it is important to ensure that the method is correct.
Add a unit test (UT).
(cherry picked from commit 65ce439fc7cce84f12657e73659165d20a306bd9)
Signed-off-by: Dongjoon Hyun <[email protected]>
---
.../src/java/org/apache/orc/tools/FileDump.java | 21 +++++++++------------
.../src/test/org/apache/orc/tools/TestFileDump.java | 8 ++++++++
2 files changed, 17 insertions(+), 12 deletions(-)
diff --git a/java/tools/src/java/org/apache/orc/tools/FileDump.java
b/java/tools/src/java/org/apache/orc/tools/FileDump.java
index 174f04a..bfc6e79 100644
--- a/java/tools/src/java/org/apache/orc/tools/FileDump.java
+++ b/java/tools/src/java/org/apache/orc/tools/FileDump.java
@@ -632,25 +632,22 @@ public final class FileDump {
}
// search for byte pattern in another byte array
- private static int indexOf(final byte[] data, final byte[] pattern, final
int index) {
+ public static int indexOf(final byte[] data, final byte[] pattern, final int
index) {
if (data == null || data.length == 0 || pattern == null || pattern.length
== 0 ||
index > data.length || index < 0) {
return -1;
}
- int j = 0;
- for (int i = index; i < data.length; i++) {
- if (pattern[j] == data[i]) {
- j++;
- } else {
- j = 0;
- }
-
- if (j == pattern.length) {
- return i - pattern.length + 1;
+ for (int i = index; i < data.length - pattern.length + 1; i++) {
+ boolean found = true;
+ for (int j = 0; j < pattern.length; j++) {
+ if (data[i + j] != pattern[j]) {
+ found = false;
+ break;
+ }
}
+ if (found) return i;
}
-
return -1;
}
diff --git a/java/tools/src/test/org/apache/orc/tools/TestFileDump.java
b/java/tools/src/test/org/apache/orc/tools/TestFileDump.java
index 5b20148..8f494f5 100644
--- a/java/tools/src/test/org/apache/orc/tools/TestFileDump.java
+++ b/java/tools/src/test/org/apache/orc/tools/TestFileDump.java
@@ -703,4 +703,12 @@ public class TestFileDump {
assumeTrue(!System.getProperty("os.name").startsWith("Windows"));
TestFileDump.checkOutput(outputFilename, workDir + File.separator +
outputFilename);
}
+
+ @Test
+ public void testIndexOf() {
+ byte[] bytes = ("OO" + OrcFile.MAGIC).getBytes(StandardCharsets.UTF_8);
+ byte[] pattern = OrcFile.MAGIC.getBytes(StandardCharsets.UTF_8);
+
+ assertEquals(2, FileDump.indexOf(bytes, pattern, 1));
+ }
}