From Ritik Raj <[email protected]>: Ritik Raj has submitted this change. ( https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/21023?usp=email )
Change subject: [ASTERIXDB-3715][FUNC] Fix double-offset bug in KMP string search ...................................................................... [ASTERIXDB-3715][FUNC] Fix double-offset bug in KMP string search - user model changes: no - storage format changes: no - interface changes: no In `UTF8StringPointable.kmpMatch()`, the algorithm was incorrectly iterating from the beginning of the string to `startMatchPos` while incrementing `codePointCount`. This caused `findInCodePoint` to inadvertently return an absolute index instead of a relative distance, which broke callers that add the start offset back to the result (causing the double-offset bug). Ext-ref: MB-68178 Change-Id: If63a452d804d90757aa52fd2e65d689a0fe6e833 Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/21023 Integration-Tests: Jenkins <[email protected]> Tested-by: Jenkins <[email protected]> Reviewed-by: Ritik Raj <[email protected]> Reviewed-by: Michael Blow <[email protected]> Reviewed-by: Ian Maxon <[email protected]> --- M hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java M hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableTest.java 2 files changed, 12 insertions(+), 22 deletions(-) Approvals: Jenkins: Verified; Verified Ian Maxon: Looks good to me, approved Michael Blow: Looks good to me, approved Ritik Raj: Looks good to me, but someone else must approve diff --git a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java index 4932e2fe..6100a49 100644 --- a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java +++ 
b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java @@ -333,7 +333,7 @@ final int srcUtfLen = src.getUTF8Length(); final int srcStart = src.getMetaDataLength(); int codePointCount = 0; - int c1 = 0; // index in bytes for src + int c1 = startMatchPos; // index in bytes for src int j = 0; // index for patternChars boolean prevHigh = false; @@ -343,27 +343,6 @@ int ringIdx = 0; while (c1 < srcUtfLen) { - if (c1 < startMatchPos) { - char ch = src.charAt(srcStart + c1); - c1 += src.charSize(srcStart + c1); - if (!resultInByte) { - if (Character.isHighSurrogate(ch)) { - prevHigh = true; - } else if (Character.isLowSurrogate(ch)) { - if (prevHigh) { - codePointCount++; - prevHigh = false; - } else { - throw HyracksDataException.create(INVALID_STRING_UNICODE, - LOW_SURROGATE_WITHOUT_HIGH_SURROGATE); - } - } else { - codePointCount++; - } - } - continue; - } - char ch1 = src.charAt(srcStart + c1); char matchCh1 = (ignoreCase && !Character.isHighSurrogate(ch1) && !Character.isLowSurrogate(ch1)) ? 
Character.toLowerCase(ch1) : ch1; diff --git a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableTest.java b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableTest.java index fbba46f..95f7492 100644 --- a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableTest.java +++ b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableTest.java @@ -144,6 +144,17 @@ } @Test + public void testFindWithOffset() throws HyracksDataException { + UTF8StringPointable src = generateUTF8Pointable("this is the king's palace"); + UTF8StringPointable pattern = generateUTF8Pointable("'s"); + + // startMatchPos points to 'i' at byte offset 2 (index 2) + assertEquals(16, UTF8StringPointable.find(src, pattern, false, 2)); + // code point difference: from index 2 to index 16 -> 14 + assertEquals(14, UTF8StringPointable.findInCodePoint(src, pattern, false, 2)); + } + + @Test public void testContains() throws Exception { assertTrue(STRING_UTF8_MIX.contains(STRING_UTF8_MIX, false)); assertTrue(STRING_UTF8_MIX.contains(STRING_UTF8_MIX, true)); -- To view, visit https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/21023?usp=email To unsubscribe, or for help writing mail filters, visit https://asterix-gerrit.ics.uci.edu/settings?usp=email Gerrit-MessageType: merged Gerrit-Project: asterixdb Gerrit-Branch: lumina Gerrit-Change-Id: If63a452d804d90757aa52fd2e65d689a0fe6e833 Gerrit-Change-Number: 21023 Gerrit-PatchSet: 2 Gerrit-Owner: Ritik Raj <[email protected]> Gerrit-Reviewer: Ian Maxon <[email protected]> Gerrit-Reviewer: Jenkins <[email protected]> Gerrit-Reviewer: Michael Blow <[email protected]> Gerrit-Reviewer: Ritik Raj <[email protected]> Gerrit-CC: Anon. E. Moose #1000171
