Re: [PR] [SPARK-48283][SQL] Modify string comparison for UTF8_BINARY_LCASE [spark]

via GitHub Fri, 24 May 2024 05:20:14 -0700


uros-db commented on code in PR #46700:
URL: https://github.com/apache/spark/pull/46700#discussion_r1613391451



##########
common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java:
##########
@@ -17,15 +17,136 @@
 package org.apache.spark.unsafe.types;
 
 import org.apache.spark.SparkException;
+import org.apache.spark.sql.catalyst.util.CollationAwareUTF8String;
 import org.apache.spark.sql.catalyst.util.CollationFactory;
 import org.apache.spark.sql.catalyst.util.CollationSupport;
 import org.junit.jupiter.api.Test;
 
 import static org.junit.jupiter.api.Assertions.*;
 
-
+// checkstyle.off: AvoidEscapedUnicodeCharacters
 public class CollationSupportSuite {
 
+  /**
+   * Collation-aware UTF8String comparison.
+   */
+
+  private void assertStringCompare(String s1, String s2, String collationName, 
int expected)
+          throws SparkException {
+    UTF8String l = UTF8String.fromString(s1);
+    UTF8String r = UTF8String.fromString(s2);
+    int compare = 
CollationFactory.fetchCollation(collationName).comparator.compare(l, r);
+    assertEquals(Integer.signum(expected), Integer.signum(compare));
+  }
+
+  @Test
+  public void testCompare() throws SparkException {
+    // Edge cases
+    assertStringCompare("", "", "UTF8_BINARY", 0);
+    assertStringCompare("a", "", "UTF8_BINARY", 1);
+    assertStringCompare("", "a", "UTF8_BINARY", -1);
+    assertStringCompare("", "", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("a", "", "UTF8_BINARY_LCASE", 1);
+    assertStringCompare("", "a", "UTF8_BINARY_LCASE", -1);
+    assertStringCompare("", "", "UNICODE", 0);
+    assertStringCompare("a", "", "UNICODE", 1);
+    assertStringCompare("", "a", "UNICODE", -1);
+    assertStringCompare("", "", "UNICODE_CI", 0);
+    assertStringCompare("a", "", "UNICODE_CI", 1);
+    assertStringCompare("", "a", "UNICODE_CI", -1);
+    // Basic tests
+    assertStringCompare("AbCd", "aBcD", "UTF8_BINARY", -1);
+    assertStringCompare("ABCD", "abcd", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("AbcD", "aBCd", "UNICODE", 1);
+    assertStringCompare("abcd", "ABCD", "UNICODE_CI", 0);
+    // Accent variation
+    assertStringCompare("aBćD", "ABĆD", "UTF8_BINARY", 1);
+    assertStringCompare("AbCδ", "ABCΔ", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("äBCd", "ÄBCD", "UNICODE", -1);
+    assertStringCompare("Ab́cD", "AB́CD", "UNICODE_CI", 0);
+    // Case-variable character length
+    assertStringCompare("i\u0307", "İ", "UTF8_BINARY", -1);
+    assertStringCompare("İ", "i\u0307", "UTF8_BINARY", 1);
+    assertStringCompare("i\u0307", "İ", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("İ", "i\u0307", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("i\u0307", "İ", "UNICODE", -1);
+    assertStringCompare("İ", "i\u0307", "UNICODE", 1);
+    assertStringCompare("i\u0307", "İ", "UNICODE_CI", 0);
+    assertStringCompare("İ", "i\u0307", "UNICODE_CI", 0);
+    assertStringCompare("i\u0307İ", "i\u0307İ", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("i\u0307İ", "İi\u0307", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("İi\u0307", "i\u0307İ", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("İi\u0307", "İi\u0307", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("i\u0307İ", "i\u0307İ", "UNICODE_CI", 0);
+    assertStringCompare("i\u0307İ", "İi\u0307", "UNICODE_CI", 0);
+    assertStringCompare("İi\u0307", "i\u0307İ", "UNICODE_CI", 0);
+    assertStringCompare("İi\u0307", "İi\u0307", "UNICODE_CI", 0);
+    // Conditional case mapping
+    assertStringCompare("ς", "σ", "UTF8_BINARY", -1);
+    assertStringCompare("ς", "Σ", "UTF8_BINARY", 1);
+    assertStringCompare("σ", "Σ", "UTF8_BINARY", 1);
+    assertStringCompare("ς", "σ", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("ς", "Σ", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("σ", "Σ", "UTF8_BINARY_LCASE", 0);
+    assertStringCompare("ς", "σ", "UNICODE", 1);
+    assertStringCompare("ς", "Σ", "UNICODE", 1);
+    assertStringCompare("σ", "Σ", "UNICODE", -1);
+    assertStringCompare("ς", "σ", "UNICODE_CI", 0);
+    assertStringCompare("ς", "Σ", "UNICODE_CI", 0);
+    assertStringCompare("σ", "Σ", "UNICODE_CI", 0);
+  }
+
+  private void assertLcaseCompare(String target, String expected, String 
collationName)
+      throws SparkException {
+    if (collationName.equals("UTF8_BINARY")) {
+      UTF8String targetUTF8 = UTF8String.fromString(target);
+      UTF8String expectedUTF8 = UTF8String.fromString(expected);
+      assertEquals(expectedUTF8, targetUTF8.toLowerCase());
+    } else if (collationName.equals("UTF8_BINARY_LCASE")) {
+      assertEquals(expected, 
CollationAwareUTF8String.lowerCaseCodePoints(target));
+    } else {
+      int collationId = CollationFactory.collationNameToId(collationName);
+      assertEquals(expected, CollationAwareUTF8String.toLowerCase(target, 
collationId));
+    }
+  }
+
+  @Test
+  public void testLcaseCompare() throws SparkException {
+    // Edge cases
+    assertLcaseCompare("", "", "UTF8_BINARY");
+    assertLcaseCompare("", "", "UTF8_BINARY_LCASE");
+    assertLcaseCompare("", "", "UNICODE");
+    assertLcaseCompare("", "", "UNICODE_CI");
+    // Basic tests
+    assertLcaseCompare("abcd", "abcd", "UTF8_BINARY");
+    assertLcaseCompare("AbCd", "abcd", "UTF8_BINARY");
+    assertLcaseCompare("abcd", "abcd", "UTF8_BINARY_LCASE");
+    assertLcaseCompare("aBcD", "abcd", "UTF8_BINARY_LCASE");
+    assertLcaseCompare("abcd", "abcd", "UNICODE");
+    assertLcaseCompare("aBCd", "abcd", "UNICODE");
+    assertLcaseCompare("abcd", "abcd", "UNICODE_CI");
+    assertLcaseCompare("AbcD", "abcd", "UNICODE_CI");
+    // Accent variation
+    assertLcaseCompare("AbĆd", "abćd", "UTF8_BINARY");
+    assertLcaseCompare("aBcΔ", "abcδ", "UTF8_BINARY_LCASE");
+    assertLcaseCompare("ÄbcD", "äbcd", "UNICODE");
+    assertLcaseCompare("aB́Cd", "ab́cd", "UNICODE_CI");
+    // Case-variable character length
+    assertLcaseCompare("İoDiNe", "i̇odine", "UTF8_BINARY");
+    assertLcaseCompare("Abi̇o12", "abi̇o12", "UTF8_BINARY");
+    assertLcaseCompare("İodInE", "i̇odine", "UTF8_BINARY_LCASE");
+    assertLcaseCompare("aBi̇o12", "abi̇o12", "UTF8_BINARY_LCASE");
+    assertLcaseCompare("İoDinE", "i̇odine", "UNICODE");
+    assertLcaseCompare("abi̇O12", "abi̇o12", "UNICODE");
+    assertLcaseCompare("İodINe", "i̇odine", "UNICODE_CI");
+    assertLcaseCompare("ABi̇o12", "abi̇o12", "UNICODE_CI");
+    // Conditional case mapping
+    assertLcaseCompare("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινος", "UTF8_BINARY");
+    assertLcaseCompare("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινοσ", "UTF8_BINARY_LCASE"); // != 
UNICODE_CI
+    assertLcaseCompare("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινος", "UNICODE");
+    assertLcaseCompare("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινος", "UNICODE_CI");

Review Comment:
   `testLower` tests the output of the appropriate `toLowerCase` method for 
each collation, that is not the intent here
   
   instead, I want `testLcaseCompare` to do the other thing you mentioned, 
which is: compare `CollationAwareUTF8String.lowerCaseCodePoints` against 
`CollationAwareUTF8String.toLowerCase` for the UTF8_BINARY_LCASE



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

Re: [PR] [SPARK-48283][SQL] Modify string comparison for UTF8_BINARY_LCASE [spark]

Reply via email to