nikolamand-db commented on code in PR #46180: URL: https://github.com/apache/spark/pull/46180#discussion_r1606706322
########## common/unsafe/src/test/scala/org/apache/spark/unsafe/types/CollationFactorySuite.scala: ########## @@ -152,4 +219,147 @@ class CollationFactorySuite extends AnyFunSuite with Matchers { // scalastyle:ig } }) } + + test("test collation caching") { + Seq( + "UTF8_BINARY", + "UTF8_BINARY_LCASE", + "UTF8_BINARY_UCASE", + "UNICODE", + "UNICODE_LCASE", + "UNICODE_UCASE", + "UNICODE_CI", + "UNICODE_AI_CI", + "UNICODE_AI_CI_LCASE", + "UNICODE_AI_CI_UCASE" + ).foreach(collationId => { + val col1 = fetchCollation(collationId) + val col2 = fetchCollation(collationId) + assert(col1 eq col2) // reference equality + }) + } + + test("collations with ICU non-root localization") { + Seq( + // language only + "en", + "en_CS", + "en_CI", + "en_AS", + "en_AI", + "en_LCASE", + "en_UCASE", + // language + 3-letter country code + "en_USA", + "en_USA_CS", + "en_USA_CI", + "en_USA_AS", + "en_USA_AI", + "en_USA_LCASE", + "en_USA_UCASE", + // language + script code + "sr_Cyrl", + "sr_Cyrl_CS", + "sr_Cyrl_CI", + "sr_Cyrl_AS", + "sr_Cyrl_AI", + "sr_Cyrl_LCASE", + "sr_Cyrl_UCASE", + // language + script code + 3-letter country code + "sr_Cyrl_SRB", + "sr_Cyrl_SRB_CS", + "sr_Cyrl_SRB_CI", + "sr_Cyrl_SRB_AS", + "sr_Cyrl_SRB_AI", + "sr_Cyrl_SRB_LCASE", + "sr_Cyrl_SRB_UCASE" + ).foreach(collationICU => { + val col = fetchCollation(collationICU) + assert(col.collator.getLocale(ULocale.VALID_LOCALE) != ULocale.ROOT) + }) + } + + test("invalid names of collations with ICU non-root localization") { + Seq( + "en_US", // must use 3-letter country code + "enn", + "en_AAA", + "en_Something", + "en_Something_USA", + "en_Latn_USA", // use en_USA instead + "en_Cyrl_USA", + "en_USA_AAA", + "sr_Cyrl_SRB_AAA" + ).foreach(collationName => { + val error = intercept[SparkException] { + fetchCollation(collationName) + } + + assert(error.getErrorClass === "COLLATION_INVALID_NAME") + assert(error.getMessageParameters.asScala === Map("collationName" -> collationName)) + }) + } + + test("collations name 
normalization for ICU non-root localization") { + Seq( + ("en_USA", "en_USA"), + ("en_CS", "en"), + ("en_AS", "en"), + ("en_CS_AS", "en"), + ("en_AI_CI", "en_CI_AI"), + ("en_USA_AI_CI", "en_USA_CI_AI"), + // randomized case + ("EN_USA", "en_USA"), + ("eN_usA_ci_uCASe_aI", "en_USA_CI_AI_UCASE"), + ("SR_CYRL", "sr_Cyrl"), + ("sr_cyrl_srb", "sr_Cyrl_SRB"), + ("sR_cYRl_sRb", "sr_Cyrl_SRB") + ).foreach { + case (name, normalized) => + val col = fetchCollation(name) + assert(col.collationName == normalized) + } + } + + test("invalid collationId") { + val badCollationIds = Seq( + -1, // user-defined collation range Review Comment: This is outdated; the new approach is at https://github.com/apache/spark/pull/46180/files#diff-9c12d32db9d55dd6ecb5b10f2fc57c7ba7de7275cab57bf157fa42cbc09f3876R331. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org