uros-db commented on code in PR #46180: URL: https://github.com/apache/spark/pull/46180#discussion_r1579147729
########## common/unsafe/src/test/scala/org/apache/spark/unsafe/types/CollationFactorySuite.scala: ########## @@ -152,4 +219,147 @@ class CollationFactorySuite extends AnyFunSuite with Matchers { // scalastyle:ig } }) } + + test("test collation caching") { + Seq( + "UTF8_BINARY", + "UTF8_BINARY_LCASE", + "UTF8_BINARY_UCASE", + "UNICODE", + "UNICODE_LCASE", + "UNICODE_UCASE", + "UNICODE_CI", + "UNICODE_AI_CI", + "UNICODE_AI_CI_LCASE", + "UNICODE_AI_CI_UCASE" + ).foreach(collationId => { + val col1 = fetchCollation(collationId) + val col2 = fetchCollation(collationId) + assert(col1 eq col2) // reference equality + }) + } + + test("collations with ICU non-root localization") { + Seq( + // language only + "en", + "en_CS", + "en_CI", + "en_AS", + "en_AI", + "en_LCASE", + "en_UCASE", + // language + 3-letter country code + "en_USA", + "en_USA_CS", + "en_USA_CI", + "en_USA_AS", + "en_USA_AI", + "en_USA_LCASE", + "en_USA_UCASE", + // language + script code + "sr_Cyrl", + "sr_Cyrl_CS", + "sr_Cyrl_CI", + "sr_Cyrl_AS", + "sr_Cyrl_AI", + "sr_Cyrl_LCASE", + "sr_Cyrl_UCASE", + // language + script code + 3-letter country code + "sr_Cyrl_SRB", + "sr_Cyrl_SRB_CS", + "sr_Cyrl_SRB_CI", + "sr_Cyrl_SRB_AS", + "sr_Cyrl_SRB_AI", + "sr_Cyrl_SRB_LCASE", + "sr_Cyrl_SRB_UCASE" + ).foreach(collationICU => { + val col = fetchCollation(collationICU) + assert(col.collator.getLocale(ULocale.VALID_LOCALE) != ULocale.ROOT) + }) + } + + test("invalid names of collations with ICU non-root localization") { + Seq( + "en_US", // must use 3-letter country code + "enn", + "en_AAA", + "en_Something", + "en_Something_USA", + "en_Latn_USA", // use en_USA instead + "en_Cyrl_USA", + "en_USA_AAA", + "sr_Cyrl_SRB_AAA" + ).foreach(collationName => { + val error = intercept[SparkException] { + fetchCollation(collationName) + } + + assert(error.getErrorClass === "COLLATION_INVALID_NAME") + assert(error.getMessageParameters.asScala === Map("collationName" -> collationName)) + }) + } + + test("collations name 
normalization for ICU non-root localization") { + Seq( + ("en_USA", "en_USA"), + ("en_CS", "en"), + ("en_AS", "en"), + ("en_CS_AS", "en"), + ("en_AI_CI", "en_CI_AI"), + ("en_USA_AI_CI", "en_USA_CI_AI"), + // randomized case + ("EN_USA", "en_USA"), + ("eN_usA_ci_uCASe_aI", "en_USA_CI_AI_UCASE"), + ("SR_CYRL", "sr_Cyrl"), + ("sr_cyrl_srb", "sr_Cyrl_SRB"), + ("sR_cYRl_sRb", "sr_Cyrl_SRB") + ).foreach { + case (name, normalized) => + val col = fetchCollation(name) + assert(col.collationName == normalized) + } + } + + test("invalid collationId") { + val badCollationIds = Seq( + -1, // user-defined collation range Review Comment: If it does end up doing that, 64b shouldn't be a problem (even with a hypothetical indeterminate collation id = -1), but 16b would potentially be dangerous. I don't know of a JVM that would interpret this as anything other than 32b, but someone should probably confirm this. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org