dbatomic commented on code in PR #46180:
URL: https://github.com/apache/spark/pull/46180#discussion_r1604916810


##########
common/unsafe/src/test/scala/org/apache/spark/unsafe/types/CollationFactorySuite.scala:
##########
@@ -152,4 +231,168 @@ class CollationFactorySuite extends AnyFunSuite with Matchers { // scalastyle:ig
       }
     })
   }
+
+  test("test collation caching") {
+    Seq(
+      "UTF8_BINARY",
+      "UTF8_BINARY_LCASE",
+      "UTF8_BINARY_UCASE",
+      "UNICODE",
+      "UNICODE_LCASE",
+      "UNICODE_UCASE",
+      "UNICODE_CI",
+      "UNICODE_AI_CI",
+      "UNICODE_AI_CI_LCASE",
+      "UNICODE_AI_CI_UCASE"
+    ).foreach(collationId => {
+      val col1 = fetchCollation(collationId)
+      val col2 = fetchCollation(collationId)
+      assert(col1 eq col2) // reference equality
+    })
+  }
+
+  test("collations with ICU non-root localization") {
+    Seq(
+      // language only
+      "en",
+      "en_CS",
+      "en_CI",
+      "en_AS",
+      "en_AI",
+      "en_LCASE",
+      "en_UCASE",
+      // language + 3-letter country code
+      "en_USA",
+      "en_USA_CS",
+      "en_USA_CI",
+      "en_USA_AS",
+      "en_USA_AI",
+      "en_USA_LCASE",
+      "en_USA_UCASE",
+      // language + script code
+      "sr_Cyrl",
+      "sr_Cyrl_CS",
+      "sr_Cyrl_CI",
+      "sr_Cyrl_AS",
+      "sr_Cyrl_AI",
+      "sr_Cyrl_LCASE",
+      "sr_Cyrl_UCASE",
+      // language + script code + 3-letter country code
+      "sr_Cyrl_SRB",
+      "sr_Cyrl_SRB_CS",
+      "sr_Cyrl_SRB_CI",
+      "sr_Cyrl_SRB_AS",
+      "sr_Cyrl_SRB_AI",
+      "sr_Cyrl_SRB_LCASE",
+      "sr_Cyrl_SRB_UCASE"
+    ).foreach(collationICU => {
+      val col = fetchCollation(collationICU)
+      assert(col.collator.getLocale(ULocale.VALID_LOCALE) != ULocale.ROOT)
+    })
+  }
+
+  test("invalid names of collations with ICU non-root localization") {
+    Seq(
+      "en_US", // must use 3-letter country code
+      "enn",
+      "en_AAA",
+      "en_Something",
+      "en_Something_USA",
+      "en_CI_UNSPECIFIED",
+      "en_USA_UNSPECIFIED",
+      "en_USA_UNSPECIFIED_CI",
+      "en_INDETERMINATE",
+      "en_USA_INDETERMINATE",
+      "en_Latn_USA", // use en_USA instead
+      "en_Cyrl_USA",
+      "en_USA_AAA",
+      "sr_Cyrl_SRB_AAA"
+    ).foreach(collationName => {
+      val error = intercept[SparkException] {
+        fetchCollation(collationName)
+      }
+
+      assert(error.getErrorClass === "COLLATION_INVALID_NAME")
+      assert(error.getMessageParameters.asScala === Map("collationName" -> collationName))
+    })
+  }
+
+  test("collations name normalization for ICU non-root localization") {
+    Seq(
+      ("en_USA", "en_USA"),
+      ("en_CS", "en"),
+      ("en_AS", "en"),
+      ("en_CS_AS", "en"),
+      ("en_AS_CS", "en"),
+      ("en_CI", "en_CI"),
+      ("en_AI", "en_AI"),
+      ("en_AI_CI", "en_CI_AI"),
+      ("en_USA_AI_CI", "en_USA_CI_AI"),
+      ("en_USA_LCASE_AI_CI", "en_USA_CI_AI_LCASE"),
+      ("en_USA_LCASE_CI_AI", "en_USA_CI_AI_LCASE"),
+      ("en_USA_AI_LCASE_CI", "en_USA_CI_AI_LCASE"),
+      ("en_USA_CI_LCASE_AI", "en_USA_CI_AI_LCASE"),
+      // randomized case
+      ("EN_USA", "en_USA"),
+      ("eN_usA_ci_uCASe_aI", "en_USA_CI_AI_UCASE"),
+      ("SR_CYRL", "sr_Cyrl"),
+      ("sr_cyrl_srb", "sr_Cyrl_SRB"),
+      ("sR_cYRl_sRb", "sr_Cyrl_SRB")
+    ).foreach {
+      case (name, normalized) =>
+        val col = fetchCollation(name)
+        assert(col.collationName == normalized)
+    }
+  }
+
+  test("invalid collationId") {
+    val badCollationIds = Seq(
+      -1, // user-defined collation range
+      1 << 31, // user-defined collation range
+      1 << 12, // utf8-binary mandatory zero bit 12 breach
+      1 << 13, // utf8-binary mandatory zero bit 13 breach
+      1 << 14, // utf8-binary mandatory zero bit 14 breach
+      1 << 15, // utf8-binary mandatory zero bit 15 breach
+      1 << 16, // utf8-binary mandatory zero bit 16 breach
+      1 << 17, // utf8-binary mandatory zero bit 17 breach
+      1 << 18, // utf8-binary mandatory zero bit 18 breach
+      1 << 19, // utf8-binary mandatory zero bit 19 breach
+      1 << 20, // utf8-binary mandatory zero bit 20 breach
+      1 << 23, // utf8-binary mandatory zero bit 23 breach
+      1 << 24, // utf8-binary mandatory zero bit 24 breach
+      1 << 25, // utf8-binary mandatory zero bit 25 breach
+      1 << 26, // utf8-binary mandatory zero bit 26 breach
+      (1 << 29) | (1 << 12), // ICU mandatory zero bit 12 breach
+      (1 << 29) | (1 << 13), // ICU mandatory zero bit 13 breach
+      (1 << 29) | (1 << 14), // ICU mandatory zero bit 14 breach
+      (1 << 29) | (1 << 15), // ICU mandatory zero bit 15 breach
+      (1 << 29) | (1 << 16), // ICU mandatory zero bit 16 breach
+      (1 << 29) | (1 << 17), // ICU mandatory zero bit 17 breach
+      (1 << 29) | (1 << 18), // ICU mandatory zero bit 18 breach
+      (1 << 29) | (1 << 19), // ICU mandatory zero bit 19 breach
+      (1 << 29) | (1 << 20), // ICU mandatory zero bit 20 breach
+      (1 << 29) | (1 << 23), // ICU mandatory zero bit 23 breach
+      (1 << 29) | (1 << 24), // ICU mandatory zero bit 24 breach
+      (1 << 29) | (1 << 25), // ICU mandatory zero bit 25 breach
+      (1 << 29) | (1 << 26), // ICU mandatory zero bit 26 breach
+      123, // utf8-binary with non-zero locale id
+      (1 << 29) | (1 << 12), // ICU with invalid locale id
+      (1 << 29) | 0xFFFF, // ICU with invalid locale id
+      (1 << 23) | (1 << 22), // utf8-binary with invalid case conversion
+      (1 << 29) | (1 << 23) | (1 << 22), // ICU with invalid case conversion
+      1 << 27, // utf8-binary accent-insensitive
+      1 << 28, // utf8-binary case-insensitive
+      INDETERMINATE_COLLATION_ID, // indeterminate collation
+      1 << 30, // indeterminate collation
+      (1 << 30) | (1 << 29) // invalid implementation provider
+    )
+    badCollationIds.foreach(collationId => {
+      val e = intercept[SparkException](fetchCollationUnsafe(collationId))
+      // user cannot specify collation ids directly so this is an internal error
+      assert(e.getErrorClass === "INTERNAL_ERROR")
+    })
+    badCollationIds.foreach(collationId => {
+      assert(fetchCollation(collationId).collationName == "UTF8_BINARY")
+    })

Review Comment:
   +1. Let's not do this.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

Reply via email to