Re: [PR] [SPARK-47131][SQL][COLLATION] String function support: contains, startswith, endswith [spark]

via GitHub Fri, 01 Mar 2024 06:24:56 -0800


uros-db commented on code in PR #45216:
URL: https://github.com/apache/spark/pull/45216#discussion_r1509076371



##########
sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala:
##########
@@ -183,6 +183,266 @@ class CollationSuite extends DatasourceV2SQLBase {
     }
   }
 
+  test("checkCollation throws exception for incompatible collationIds") {
+    val left: String = "abc" // collate with 'UNICODE_CI'
+    val leftCollationName: String = "UNICODE_CI";
+    var right: String = null // collate with 'UNICODE'
+    val rightCollationName: String = "UNICODE";
+    // contains
+    right = left.substring(1, 2);
+    checkError(
+      exception = intercept[SparkException] {
+        spark.sql(s"SELECT contains(collate('$left', '$leftCollationName')," +
+          s"collate('$right', '$rightCollationName'))").collect()
+      },
+      errorClass = "COLLATION_MISMATCH",
+      sqlState = "42K09",
+      parameters = Map(
+        "collationNameLeft" -> s"$leftCollationName",
+        "collationNameRight" -> s"$rightCollationName"
+      )
+    )
+    // startsWith
+    right = left.substring(0, 1);
+    checkError(
+      exception = intercept[SparkException] {
+        spark.sql(s"SELECT startsWith(collate('$left', '$leftCollationName')," 
+
+          s"collate('$right', '$rightCollationName'))").collect()
+      },
+      errorClass = "COLLATION_MISMATCH",
+      sqlState = "42K09",
+      parameters = Map(
+        "collationNameLeft" -> s"$leftCollationName",
+        "collationNameRight" -> s"$rightCollationName"
+      )
+    )
+    // endsWith
+    right = left.substring(2, 3);
+    checkError(
+      exception = intercept[SparkException] {
+        spark.sql(s"SELECT endsWith(collate('$left', '$leftCollationName')," +
+          s"collate('$right', '$rightCollationName'))").collect()
+      },
+      errorClass = "COLLATION_MISMATCH",
+      sqlState = "42K09",
+      parameters = Map(
+        "collationNameLeft" -> s"$leftCollationName",
+        "collationNameRight" -> s"$rightCollationName"
+      )
+    )
+  }
+
+  test("Support contains string expression with Collation") {
+    // Test 'contains' with different collations
+    var listLeft: List[String] = List()
+    var listRight: List[String] = List()
+    var listResult: List[Boolean] = List()
+
+    // UCS_BASIC (default) & UNICODE collation
+    listLeft = List("", "c", "abc", "cde", "abde", "abcde", "C", "ABC", "CDE", 
"ABDE", "ABCDE")
+    listRight = List("", "c", "abc", "cde", "abde", "abcde", "C", "ABC", 
"CDE", "ABDE", "ABCDE")
+    listResult = List(
+    //  ""     c     abc    cde   abde  abcde    C     ABC    CDE    ABDE  
ABCDE
+      true, false, false, false, false, false, false, false, false, false, 
false, //  ""
+      true, true, false, false, false, false, false, false, false, false, 
false,  //   c
+      true, true, true, false, false, false, false, false, false, false, 
false,   // abc
+      true, true, false, true, false, false, false, false, false, false, 
false,   //   cde
+      true, false, false, false, true, false, false, false, false, false, 
false,  // abde
+      true, true, true, true, false, true, false, false, false, false, false,  
   // abcde
+      true, false, false, false, false, false, true, false, false, false, 
false,  //   C
+      true, false, false, false, false, false, true, true, false, false, 
false,   // ABC
+      true, false, false, false, false, false, true, false, true, false, 
false,   //   CDE
+      true, false, false, false, false, false, false, false, false, true, 
false,  // ABDE
+      true, false, false, false, false, false, true, true, true, false, true)  
   // ABCDE

Review Comment:
   while it may seem a bit unusual at first, I think this matrix approach 
covers a broad spectrum of test cases and generally works really well for this 
set of functions - covering various edge-cases and different collation types 
(this was especially useful when debugging and experimenting with new 
collations) ex. imagine throwing Serbian (ć, Ć) or German collations (ä, Ä) 
into the mix with other possible `abc`s
   
   when I first wrote it as a standard linear set of tests, it was much harder 
to see how and why these functions behave the way they do with different 
collations, while just looking a bit more into this nicely aligned matrix gives 
a pretty clear overview all in one place (in addition, previously it was easy 
to miss something and not cover all cases, and also hard-coding the expected 
results was extra-tedious)



##########
sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala:
##########
@@ -183,6 +183,266 @@ class CollationSuite extends DatasourceV2SQLBase {
     }
   }
 
+  test("checkCollation throws exception for incompatible collationIds") {
+    val left: String = "abc" // collate with 'UNICODE_CI'
+    val leftCollationName: String = "UNICODE_CI";
+    var right: String = null // collate with 'UNICODE'
+    val rightCollationName: String = "UNICODE";
+    // contains
+    right = left.substring(1, 2);
+    checkError(
+      exception = intercept[SparkException] {
+        spark.sql(s"SELECT contains(collate('$left', '$leftCollationName')," +
+          s"collate('$right', '$rightCollationName'))").collect()
+      },
+      errorClass = "COLLATION_MISMATCH",
+      sqlState = "42K09",
+      parameters = Map(
+        "collationNameLeft" -> s"$leftCollationName",
+        "collationNameRight" -> s"$rightCollationName"
+      )
+    )
+    // startsWith
+    right = left.substring(0, 1);
+    checkError(
+      exception = intercept[SparkException] {
+        spark.sql(s"SELECT startsWith(collate('$left', '$leftCollationName')," 
+
+          s"collate('$right', '$rightCollationName'))").collect()
+      },
+      errorClass = "COLLATION_MISMATCH",
+      sqlState = "42K09",
+      parameters = Map(
+        "collationNameLeft" -> s"$leftCollationName",
+        "collationNameRight" -> s"$rightCollationName"
+      )
+    )
+    // endsWith
+    right = left.substring(2, 3);
+    checkError(
+      exception = intercept[SparkException] {
+        spark.sql(s"SELECT endsWith(collate('$left', '$leftCollationName')," +
+          s"collate('$right', '$rightCollationName'))").collect()
+      },
+      errorClass = "COLLATION_MISMATCH",
+      sqlState = "42K09",
+      parameters = Map(
+        "collationNameLeft" -> s"$leftCollationName",
+        "collationNameRight" -> s"$rightCollationName"
+      )
+    )
+  }
+
+  test("Support contains string expression with Collation") {
+    // Test 'contains' with different collations
+    var listLeft: List[String] = List()
+    var listRight: List[String] = List()
+    var listResult: List[Boolean] = List()
+
+    // UCS_BASIC (default) & UNICODE collation
+    listLeft = List("", "c", "abc", "cde", "abde", "abcde", "C", "ABC", "CDE", 
"ABDE", "ABCDE")
+    listRight = List("", "c", "abc", "cde", "abde", "abcde", "C", "ABC", 
"CDE", "ABDE", "ABCDE")
+    listResult = List(
+    //  ""     c     abc    cde   abde  abcde    C     ABC    CDE    ABDE  
ABCDE
+      true, false, false, false, false, false, false, false, false, false, 
false, //  ""
+      true, true, false, false, false, false, false, false, false, false, 
false,  //   c
+      true, true, true, false, false, false, false, false, false, false, 
false,   // abc
+      true, true, false, true, false, false, false, false, false, false, 
false,   //   cde
+      true, false, false, false, true, false, false, false, false, false, 
false,  // abde
+      true, true, true, true, false, true, false, false, false, false, false,  
   // abcde
+      true, false, false, false, false, false, true, false, false, false, 
false,  //   C
+      true, false, false, false, false, false, true, true, false, false, 
false,   // ABC
+      true, false, false, false, false, false, true, false, true, false, 
false,   //   CDE
+      true, false, false, false, false, false, false, false, false, true, 
false,  // ABDE
+      true, false, false, false, false, false, true, true, true, false, true)  
   // ABCDE

Review Comment:
   while it may seem a bit unusual at first, I think this matrix approach 
covers a broad spectrum of test cases and generally works really well for this 
set of functions - covering various edge-cases and different collation types 
(this was especially useful when debugging and experimenting with new 
collations) ex. imagine throwing Serbian (ć, Ć) or German collations (ä, Ä) 
into the mix with other possible `abc`s
   
   when I first wrote it as a standard linear set of tests, it was much harder 
to see how and why these functions behave the way they do with different 
collations, while just looking a bit more into this nicely aligned matrix gives 
a pretty clear overview all in one place (in addition, previously it was easy 
to miss something and not cover all cases, and also hard-coding the expected 
results was extra-tedious)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

Re: [PR] [SPARK-47131][SQL][COLLATION] String function support: contains, startswith, endswith [spark]

Reply via email to