uros-db commented on code in PR #46180:
URL: https://github.com/apache/spark/pull/46180#discussion_r1579147729


##########
common/unsafe/src/test/scala/org/apache/spark/unsafe/types/CollationFactorySuite.scala:
##########
@@ -152,4 +219,147 @@ class CollationFactorySuite extends AnyFunSuite with 
Matchers { // scalastyle:ig
       }
     })
   }
+
+  test("test collation caching") {
+    Seq(
+      "UTF8_BINARY",
+      "UTF8_BINARY_LCASE",
+      "UTF8_BINARY_UCASE",
+      "UNICODE",
+      "UNICODE_LCASE",
+      "UNICODE_UCASE",
+      "UNICODE_CI",
+      "UNICODE_AI_CI",
+      "UNICODE_AI_CI_LCASE",
+      "UNICODE_AI_CI_UCASE"
+    ).foreach(collationId => {
+      val col1 = fetchCollation(collationId)
+      val col2 = fetchCollation(collationId)
+      assert(col1 eq col2) // reference equality
+    })
+  }
+
+  test("collations with ICU non-root localization") {
+    Seq(
+      // language only
+      "en",
+      "en_CS",
+      "en_CI",
+      "en_AS",
+      "en_AI",
+      "en_LCASE",
+      "en_UCASE",
+      // language + 3-letter country code
+      "en_USA",
+      "en_USA_CS",
+      "en_USA_CI",
+      "en_USA_AS",
+      "en_USA_AI",
+      "en_USA_LCASE",
+      "en_USA_UCASE",
+      // language + script code
+      "sr_Cyrl",
+      "sr_Cyrl_CS",
+      "sr_Cyrl_CI",
+      "sr_Cyrl_AS",
+      "sr_Cyrl_AI",
+      "sr_Cyrl_LCASE",
+      "sr_Cyrl_UCASE",
+      // language + script code + 3-letter country code
+      "sr_Cyrl_SRB",
+      "sr_Cyrl_SRB_CS",
+      "sr_Cyrl_SRB_CI",
+      "sr_Cyrl_SRB_AS",
+      "sr_Cyrl_SRB_AI",
+      "sr_Cyrl_SRB_LCASE",
+      "sr_Cyrl_SRB_UCASE"
+    ).foreach(collationICU => {
+      val col = fetchCollation(collationICU)
+      assert(col.collator.getLocale(ULocale.VALID_LOCALE) != ULocale.ROOT)
+    })
+  }
+
+  test("invalid names of collations with ICU non-root localization") {
+    Seq(
+      "en_US", // must use 3-letter country code
+      "enn",
+      "en_AAA",
+      "en_Something",
+      "en_Something_USA",
+      "en_Latn_USA", // use en_USA instead
+      "en_Cyrl_USA",
+      "en_USA_AAA",
+      "sr_Cyrl_SRB_AAA"
+    ).foreach(collationName => {
+      val error = intercept[SparkException] {
+        fetchCollation(collationName)
+      }
+
+      assert(error.getErrorClass === "COLLATION_INVALID_NAME")
+      assert(error.getMessageParameters.asScala === Map("collationName" -> 
collationName))
+    })
+  }
+
+  test("collations name normalization for ICU non-root localization") {
+    Seq(
+      ("en_USA", "en_USA"),
+      ("en_CS", "en"),
+      ("en_AS", "en"),
+      ("en_CS_AS", "en"),
+      ("en_AI_CI", "en_CI_AI"),
+      ("en_USA_AI_CI", "en_USA_CI_AI"),
+      // randomized case
+      ("EN_USA", "en_USA"),
+      ("eN_usA_ci_uCASe_aI", "en_USA_CI_AI_UCASE"),
+      ("SR_CYRL", "sr_Cyrl"),
+      ("sr_cyrl_srb", "sr_Cyrl_SRB"),
+      ("sR_cYRl_sRb", "sr_Cyrl_SRB")
+    ).foreach {
+      case (name, normalized) =>
+        val col = fetchCollation(name)
+        assert(col.collationName == normalized)
+    }
+  }
+
+  test("invalid collationId") {
+    val badCollationIds = Seq(
+      -1, // user-defined collation range

Review Comment:
   If it does end up doing that, 64-bit shouldn't be a problem (even with a 
hypothetical indeterminate collation id = -1), but 16-bit would potentially be 
dangerous. Although I don't know of a JVM that would interpret this as 
anything other than 32-bit, someone should probably confirm this.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

Reply via email to