>From Janhavi Tripurwar <[email protected]>:
Attention is currently required from: Ian Maxon, Michael Blow, Ritik Raj.
Hello Ian Maxon, Jenkins, Michael Blow, Ritik Raj,
I'd like you to do a code review.
Please visit
https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/21032?usp=email
to review the following change.
Change subject: [ASTERIXDB-3715][RT] Upgrade UTF8StringPointable string search
to KMP
......................................................................
[ASTERIXDB-3715][RT] Upgrade UTF8StringPointable string search to KMP
- user model changes: no
- storage format changes: no
- interface changes: no
Details:
Replace the naive O(N*M) string matching algorithm in
UTF8StringPointable.findInByteOrCodePoint with the
Knuth-Morris-Pratt (KMP) algorithm to achieve O(N+M) complexity.
Co-authored-by: Gemini 3.1 Pro <[email protected]>
Ext-ref: MB-68178
Change-Id: Ia00fbce6499a5258127c91d3ce62270722b89112
Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/20984
Tested-by: Jenkins <[email protected]>
Reviewed-by: Ritik Raj <[email protected]>
Reviewed-by: Ian Maxon <[email protected]>
Reviewed-by: Michael Blow <[email protected]>
---
M
hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
M
hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableTest.java
2 files changed, 197 insertions(+), 45 deletions(-)
git pull ssh://asterix-gerrit.ics.uci.edu:29418/asterixdb
refs/changes/32/21032/1
diff --git
a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
index 3744350..4932e2fe 100644
---
a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
+++
b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
@@ -278,65 +278,152 @@
int startMatchPos = startMatch;
final int srcUtfLen = src.getUTF8Length();
final int pttnUtfLen = pattern.getUTF8Length();
+ if (pttnUtfLen == 0) {
+ return resultInByte ? startMatchPos : 0;
+ }
+
+ char[] patternChars = getPatternChars(pattern, ignoreCase);
+ int[] lps = computeLPS(patternChars);
+ return kmpMatch(src, patternChars, lps, ignoreCase, startMatchPos,
resultInByte);
+ }
+
+ private static char[] getPatternChars(UTF8StringPointable pattern, boolean
ignoreCase) {
+ int pttnUtfLen = pattern.getUTF8Length();
+ int pttnStart = pattern.getMetaDataLength();
+ char[] chars = new char[pttnUtfLen];
+ int byteIdx = 0;
+ int charIdx = 0;
+ while (byteIdx < pttnUtfLen) {
+ char ch = pattern.charAt(pttnStart + byteIdx);
+ if (ignoreCase && !Character.isHighSurrogate(ch) &&
!Character.isLowSurrogate(ch)) {
+ ch = Character.toLowerCase(ch);
+ }
+ chars[charIdx++] = ch;
+ byteIdx += pattern.charSize(pttnStart + byteIdx);
+ }
+ char[] exactChars = new char[charIdx];
+ System.arraycopy(chars, 0, exactChars, 0, charIdx);
+ return exactChars;
+ }
+
+ private static int[] computeLPS(char[] patternChars) {
+ int[] lps = new int[patternChars.length];
+ int len = 0;
+ int i = 1;
+ lps[0] = 0;
+ while (i < patternChars.length) {
+ if (patternChars[i] == patternChars[len]) {
+ len++;
+ lps[i] = len;
+ i++;
+ } else {
+ if (len != 0) {
+ len = lps[len - 1];
+ } else {
+ lps[i] = 0;
+ i++;
+ }
+ }
+ }
+ return lps;
+ }
+
+ private static int kmpMatch(UTF8StringPointable src, char[] patternChars,
int[] lps, boolean ignoreCase,
+ int startMatchPos, boolean resultInByte) throws
HyracksDataException {
+ final int srcUtfLen = src.getUTF8Length();
final int srcStart = src.getMetaDataLength();
- final int pttnStart = pattern.getMetaDataLength();
int codePointCount = 0;
+ int c1 = 0; // index in bytes for src
+ int j = 0; // index for patternChars
- int maxStart = srcUtfLen - pttnUtfLen;
- boolean prevHighSurrogate = false;
- while (startMatchPos <= maxStart) {
- int c1 = startMatchPos;
- int c2 = 0;
- while (c1 < srcUtfLen && c2 < pttnUtfLen) {
- char ch1 = src.charAt(srcStart + c1);
- char ch2 = pattern.charAt(pttnStart + c2);
+ boolean prevHigh = false;
- if (ch1 != ch2) {
- // Currently, the ignoreCase is only valid for
one-surrogate characters
- // (e.g. characters whose UTF-16 encoding is 2-byte (1
Java char) instead of 4-byte (2 Java chars).
- // We may need to support the two-surrogate characters in
the future
- //
- // Another edge case is that one letter may have different
forms of lower cases in different languages
- // For example, the letter I may have "i" as the lower
case in English but "ı" in Turkish.
- // We may need to use methods such as
String.toLowerCase(Locale locale) to support other languages in the future
- // Reference:
https://stackoverflow.com/questions/11063102/using-locales-with-javas-tolowercase-and-touppercase
- if (!ignoreCase || Character.toLowerCase(ch1) !=
Character.toLowerCase(ch2)) {
- break;
+ int[] ringBytes = new int[patternChars.length + 1];
+ int[] ringCPs = new int[patternChars.length + 1];
+ int ringIdx = 0;
+
+ while (c1 < srcUtfLen) {
+ if (c1 < startMatchPos) {
+ char ch = src.charAt(srcStart + c1);
+ c1 += src.charSize(srcStart + c1);
+ if (!resultInByte) {
+ if (Character.isHighSurrogate(ch)) {
+ prevHigh = true;
+ } else if (Character.isLowSurrogate(ch)) {
+ if (prevHigh) {
+ codePointCount++;
+ prevHigh = false;
+ } else {
+ throw
HyracksDataException.create(INVALID_STRING_UNICODE,
+ LOW_SURROGATE_WITHOUT_HIGH_SURROGATE);
+ }
+ } else {
+ codePointCount++;
}
}
- c1 += src.charSize(srcStart + c1);
- c2 += pattern.charSize(pttnStart + c2);
+ continue;
}
- if (c2 == pttnUtfLen) {
- if (resultInByte) {
- return startMatchPos;
- } else {
- if (prevHighSurrogate) {
+ char ch1 = src.charAt(srcStart + c1);
+ char matchCh1 = (ignoreCase && !Character.isHighSurrogate(ch1) &&
!Character.isLowSurrogate(ch1))
+ ? Character.toLowerCase(ch1) : ch1;
+
+ if (j == 0) {
+ ringBytes[ringIdx % ringBytes.length] = c1;
+ ringCPs[ringIdx % ringCPs.length] = codePointCount;
+ }
+
+ if (matchCh1 == patternChars[j]) {
+ j++;
+ int size = src.charSize(srcStart + c1);
+ c1 += size;
+
+ if (!resultInByte) {
+ if (Character.isHighSurrogate(ch1)) {
+ prevHigh = true;
+ } else if (Character.isLowSurrogate(ch1)) {
+ if (prevHigh) {
+ codePointCount++;
+ prevHigh = false;
+ } else {
+ throw
HyracksDataException.create(INVALID_STRING_UNICODE,
+ LOW_SURROGATE_WITHOUT_HIGH_SURROGATE);
+ }
+ } else {
+ codePointCount++;
+ }
+ }
+
+ if (j == patternChars.length) {
+ if (!resultInByte && prevHigh) {
throw
HyracksDataException.create(INVALID_STRING_UNICODE,
HIGH_SURROGATE_WITHOUT_LOW_SURROGATE);
}
- return codePointCount;
+ return resultInByte ? ringBytes[ringIdx %
ringBytes.length] : ringCPs[ringIdx % ringCPs.length];
}
- }
-
- // The result is counted in code point instead of bytes
- if (!resultInByte) {
- char ch = src.charAt(srcStart + startMatchPos);
- if (Character.isHighSurrogate(ch)) {
- prevHighSurrogate = true;
- } else if (Character.isLowSurrogate(ch)) {
- if (prevHighSurrogate) {
- codePointCount++;
- prevHighSurrogate = false;
- } else {
- throw
HyracksDataException.create(INVALID_STRING_UNICODE,
LOW_SURROGATE_WITHOUT_HIGH_SURROGATE);
- }
+ } else {
+ if (j != 0) {
+ j = lps[j - 1];
+ ringIdx++;
} else {
- codePointCount++;
+ int size = src.charSize(srcStart + c1);
+ c1 += size;
+ if (!resultInByte) {
+ if (Character.isHighSurrogate(ch1)) {
+ prevHigh = true;
+ } else if (Character.isLowSurrogate(ch1)) {
+ if (prevHigh) {
+ codePointCount++;
+ prevHigh = false;
+ } else {
+ throw
HyracksDataException.create(INVALID_STRING_UNICODE,
+ LOW_SURROGATE_WITHOUT_HIGH_SURROGATE);
+ }
+ } else {
+ codePointCount++;
+ }
+ }
}
}
-
- startMatchPos += src.charSize(srcStart + startMatchPos);
}
return -1;
}
diff --git
a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableTest.java
b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableTest.java
index 45a5ba3..fbba46f 100644
---
a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableTest.java
+++
b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableTest.java
@@ -41,6 +41,71 @@
import it.unimi.dsi.fastutil.ints.IntCollection;
public class UTF8StringPointableTest {
+
+ @Test
+ public void testFindWithKMP() throws Exception {
+ // Standard matching (returns byte pos)
+ assertTrue(findReturnPos("hello world", "world") >= 0);
+ assertEquals(0, findReturnPos("hello world", "hello"));
+ assertTrue(findReturnPos("hello world", "o w") >= 0);
+
+ // Exact same pattern boundary overlaps (KMP jump testing)
+ assertEquals(0, findReturnPos("abacababc", "abacab"));
+ assertTrue(findReturnPos("abacababc", "ababc") >= 0);
+
+ // Match not found
+ assertEquals(-1, findReturnPos("hello world", "xyz"));
+
+ // Multi-byte Unicode Characters (length checking bytes vs codepoints)
+ // 'é' (U+00E9) is 2 bytes in UTF-8, 1 code point
+ // '€' (U+20AC) is 3 bytes in UTF-8, 1 code point
+ // '😀' (U+1F600) is a surrogate pair in Java, encoded as 6 bytes in
Modified UTF-8!
+
+ // Testing 2-byte character: expected w byte pos = 5(hello) + 2(é) = 7
+ assertEquals(7, findReturnPos("helloéworld", "w"));
+ assertEquals(6, findReturnCodePointPos("helloéworld", "w")); //
5(hello) + 1(é) = 6
+
+ // Testing 3-byte character: expected w byte pos = 5(hello) + 3(€) = 8
+ assertEquals(8, findReturnPos("hello€world", "w"));
+ assertEquals(6, findReturnCodePointPos("hello€world", "w"));
+
+ // Testing Surrogate Pair (Modified UTF-8 encodes as 2x 3-byte
characters = 6 bytes)
+ assertEquals(11, findReturnPos("hello😀world", "w")); // 5 + 6 = 11
+ assertEquals(6, findReturnCodePointPos("hello😀world", "w")); // 5 + 1
= 6
+
+ // Find the emoji itself
+ assertTrue(findReturnPos("hello😀world", "😀") > 0);
+ assertEquals(0, findReturnCodePointPos("😀😀😀", "😀😀"));
+
+ // Case insensitivity
+ assertTrue(findReturnPosIgnoreCase("hello WORLD", "world") >= 0);
+ assertTrue(findReturnPosIgnoreCase("hello WORLD", "WOrLd") >= 0);
+ assertEquals(-1, findReturnPosIgnoreCase("hello WORLD", "XYZ"));
+ }
+
+ private int findReturnPos(String srcStr, String patternStr) throws
Exception {
+ return findReturnPosInternal(srcStr, patternStr, false, true);
+ }
+
+ private int findReturnCodePointPos(String srcStr, String patternStr)
throws Exception {
+ return findReturnPosInternal(srcStr, patternStr, false, false);
+ }
+
+ private int findReturnPosIgnoreCase(String srcStr, String patternStr)
throws Exception {
+ return findReturnPosInternal(srcStr, patternStr, true, true);
+ }
+
+ private int findReturnPosInternal(String srcStr, String patternStr,
boolean ignoreCase, boolean resultInByte)
+ throws Exception {
+ UTF8StringPointable src = generateUTF8Pointable(srcStr);
+ UTF8StringPointable pattern = generateUTF8Pointable(patternStr);
+ if (resultInByte) {
+ return UTF8StringPointable.find(src, pattern, ignoreCase, 0);
+ } else {
+ return UTF8StringPointable.findInCodePoint(src, pattern,
ignoreCase, 0);
+ }
+ }
+
public static UTF8StringPointable STRING_EMPTY =
generateUTF8Pointable(UTF8StringSample.EMPTY_STRING);
public static UTF8StringPointable STRING_UTF8_MIX =
generateUTF8Pointable(UTF8StringSample.STRING_UTF8_MIX);
public static UTF8StringPointable STRING_UTF8_MIX_LOWERCASE =
--
To view, visit https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/21032?usp=email
To unsubscribe, or for help writing mail filters, visit
https://asterix-gerrit.ics.uci.edu/settings?usp=email
Gerrit-MessageType: newchange
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Change-Id: Ia00fbce6499a5258127c91d3ce62270722b89112
Gerrit-Change-Number: 21032
Gerrit-PatchSet: 1
Gerrit-Owner: Janhavi Tripurwar <[email protected]>
Gerrit-Reviewer: Ian Maxon <[email protected]>
Gerrit-Reviewer: Jenkins <[email protected]>
Gerrit-Reviewer: Michael Blow <[email protected]>
Gerrit-Reviewer: Ritik Raj <[email protected]>
Gerrit-Attention: Ian Maxon <[email protected]>
Gerrit-Attention: Michael Blow <[email protected]>
Gerrit-Attention: Ritik Raj <[email protected]>