uros-db commented on code in PR #46511:
URL: https://github.com/apache/spark/pull/46511#discussion_r1596738413


##########
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java:
##########
@@ -34,6 +34,15 @@
  * Utility class for collation-aware UTF8String operations.
  */
 public class CollationAwareUTF8String {
+
+  public static boolean lowercaseMatchAt(final UTF8String l, final UTF8String 
r,
+      int pos, int len) {
+    if (len + pos > l.numChars() || pos < 0) {
+      return false;
+    }
+    return l.substring(pos, pos + len).toLowerCase().equals(r);

Review Comment:
   > Shouldn't this be?
   > 
   > return l.substring(pos, pos + len).toLowerCase().equals(r.toLowerCase());
   
   not here, because we know needle is not gonna change (we won't take any 
substrings of needle), then we can lowercase the needle only once (before the 
function call) in order to avoid multiple lowercasing (this is only important 
for `contains` which turns out to be quadratic - just like ICU StringSearch 
algorithm, while `startsWith` and `endsWith` are linear in nature because one 
bound is known in advance)



##########
common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java:
##########
@@ -102,20 +102,28 @@ public void testContains() throws SparkException {
     assertContains("äbćδe", "ÄbćδE", "UNICODE_CI", true);
     assertContains("äbćδe", "ÄBcΔÉ", "UNICODE_CI", false);
     // Case-variable character length
-    assertContains("abİo12", "i̇o", "UNICODE_CI", true);
-    assertContains("abi̇o12", "İo", "UNICODE_CI", true);
-    assertContains("the İodine", "the i̇odine", "UTF8_BINARY_LCASE", true);
-    assertContains("the i̇odine", "the İodine", "UTF8_BINARY_LCASE", true);
-    assertContains("The İodiNe", " i̇oDin", "UTF8_BINARY_LCASE", true);
-    assertContains("İodiNe", "i̇oDine", "UTF8_BINARY_LCASE", true);
-    assertContains("İodiNe", " i̇oDin", "UTF8_BINARY_LCASE", false);
-    // Characters with the same binary lowercase representation
-    assertContains("The Kelvin.", "Kelvin", "UTF8_BINARY_LCASE", true);
-    assertContains("The Kelvin.", "Kelvin", "UTF8_BINARY_LCASE", true);
-    assertContains("The KKelvin.", "KKelvin", "UTF8_BINARY_LCASE", true);
-    assertContains("2 Kelvin.", "2 Kelvin", "UTF8_BINARY_LCASE", true);
-    assertContains("2 Kelvin.", "2 Kelvin", "UTF8_BINARY_LCASE", true);
-    assertContains("The KKelvin.", "KKelvin,", "UTF8_BINARY_LCASE", false);
+    assertContains("i̇", "i", "UNICODE_CI", false);
+    assertContains("i̇", "İ", "UNICODE_CI", true);
+    assertContains("İ", "i", "UNICODE_CI", false);
+    assertContains("adi̇os", "io", "UNICODE_CI", false);
+    assertContains("adi̇os", "Io", "UNICODE_CI", false);
+    assertContains("adi̇os", "i̇o", "UNICODE_CI", true);
+    assertContains("adi̇os", "İo", "UNICODE_CI", true);
+    assertContains("adİos", "io", "UNICODE_CI", false);
+    assertContains("adİos", "Io", "UNICODE_CI", false);
+    assertContains("adİos", "i̇o", "UNICODE_CI", true);
+    assertContains("adİos", "İo", "UNICODE_CI", true);
+    assertContains("i̇", "i", "UTF8_BINARY_LCASE", true); // note: different 
from UNICODE_CI
+    assertContains("i̇", "İ", "UTF8_BINARY_LCASE", false); // note: different 
from UNICODE_CI
+    assertContains("İ", "i", "UTF8_BINARY_LCASE", false);
+    assertContains("adi̇os", "io", "UTF8_BINARY_LCASE", false);
+    assertContains("adi̇os", "Io", "UTF8_BINARY_LCASE", false);
+    assertContains("adi̇os", "i̇o", "UTF8_BINARY_LCASE", true);
+    assertContains("adi̇os", "İo", "UTF8_BINARY_LCASE", false); // note: 
different from UNICODE_CI
+    assertContains("adİos", "io", "UTF8_BINARY_LCASE", false);
+    assertContains("adİos", "Io", "UTF8_BINARY_LCASE", false);
+    assertContains("adİos", "i̇o", "UTF8_BINARY_LCASE", false); // note: 
different from UNICODE_CI

Review Comment:
   update: resolved in latest commit



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

Reply via email to