nikolamand-db commented on code in PR #46180:
URL: https://github.com/apache/spark/pull/46180#discussion_r1606762581


##########
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java:
##########
@@ -118,76 +119,433 @@ public Collation(
     }
 
     /**
-     * Constructor with comparators that are inherited from the given collator.
+     * Collation id is defined as 32-bit integer.
+     * We specify binary layouts for different classes of collations.
+     * Classes of collations are differentiated by most significant 3 bits 
(bit 31, 30 and 29),
+     * bit 31 being most significant and bit 0 being least significant.
+     * ---
+     * INDETERMINATE collation id binary layout:
+     * bit 31-0: 1
+     * INDETERMINATE collation id is equal to -1
+     * ---
+     * user-defined collation id binary layout:
+     * bit 31:   0
+     * bit 30:   1
+     * bit 29-0: undefined, reserved for future use
+     * ---
+     * UTF8_BINARY collation id binary layout:
+     * bit 31-22: zeroes
+     * bit 21-18: zeroes, reserved for space trimming
+     * bit 17-16: zeroes, reserved for version
+     * bit 15-3:  zeroes
+     * bit 2:     0, reserved for accent sensitivity
+     * bit 1:     0, reserved for uppercase and case-insensitive
+     * bit 0:     0 = case-sensitive, 1 = lowercase
+     * ---
+     * ICU collation id binary layout:
+     * bit 31-30: zeroes
+     * bit 29:    1
+     * bit 28-24: zeroes
+     * bit 23-22: zeroes, reserved for version
+     * bit 21-18: zeroes, reserved for space trimming
+     * bit 17:    0 = case-sensitive, 1 = case-insensitive
+     * bit 16:    0 = accent-sensitive, 1 = accent-insensitive
+     * bit 15-14: zeroes, reserved for punctuation sensitivity
+     * bit 13-12: zeroes, reserved for first letter preference
+     * bit 11-0:  locale id as specified in `ICULocaleToId` mapping
+     * ---
+     * Some illustrative examples of collation name to id mapping:
+     * - UTF8_BINARY       -> 0
+     * - UTF8_BINARY_LCASE -> 1
+     * - UNICODE           -> 0x20000000
+     * - UNICODE_AI        -> 0x20010000
+     * - UNICODE_CI        -> 0x20020000
+     * - UNICODE_CI_AI     -> 0x20030000
+     * - af                -> 0x20000001
+     * - af_CI_AI          -> 0x20030001
      */
-    public Collation(
-        String collationName,
-        Collator collator,
-        String version,
-        boolean supportsBinaryEquality,
-        boolean supportsBinaryOrdering,
-        boolean supportsLowercaseEquality) {
-      this(
-        collationName,
-        collator,
-        (s1, s2) -> collator.compare(s1.toString(), s2.toString()),
-        version,
-        s -> (long)collator.getCollationKey(s.toString()).hashCode(),
-        supportsBinaryEquality,
-        supportsBinaryOrdering,
-        supportsLowercaseEquality);
+    private abstract static class CollationSpec {
+
+      private enum DefinitionOrigin {
+        PREDEFINED, USER_DEFINED
+      }
+
+      protected enum ImplementationProvider {
+        UTF8_BINARY, ICU
+      }
+
+      private static final int DEFINITION_ORIGIN_OFFSET = 30;
+      private static final int DEFINITION_ORIGIN_MASK = 0b1;
+      protected static final int IMPLEMENTATION_PROVIDER_OFFSET = 29;
+      protected static final int IMPLEMENTATION_PROVIDER_MASK = 0b1;
+
+      private static final int INDETERMINATE_COLLATION_ID = -1;
+
+      private static final Map<Integer, Collation> collationMap = new 
ConcurrentHashMap<>();
+
+      private static ImplementationProvider getImplementationProvider(int 
collationId) {
+        return 
ImplementationProvider.values()[SpecifierUtils.getSpecValue(collationId,
+          IMPLEMENTATION_PROVIDER_OFFSET, IMPLEMENTATION_PROVIDER_MASK)];
+      }
+
+      private static DefinitionOrigin getDefinitionOrigin(int collationId) {
+        return 
DefinitionOrigin.values()[SpecifierUtils.getSpecValue(collationId,
+          DEFINITION_ORIGIN_OFFSET, DEFINITION_ORIGIN_MASK)];
+      }
+
+      private static Collation fetchCollation(int collationId) {
+        assert (collationId >= 0 && getDefinitionOrigin(collationId)
+          == DefinitionOrigin.PREDEFINED);
+        if (collationId == UTF8_BINARY_COLLATION_ID) {
+          return CollationSpecUTF8Binary.UTF8_BINARY_COLLATION;
+        } else if (collationMap.containsKey(collationId)) {
+          return collationMap.get(collationId);
+        } else {
+          CollationSpec spec;
+          ImplementationProvider implementationProvider = 
getImplementationProvider(collationId);
+          if (implementationProvider == ImplementationProvider.UTF8_BINARY) {
+            spec = CollationSpecUTF8Binary.fromCollationId(collationId);
+          } else {
+            spec = CollationSpecICU.fromCollationId(collationId);
+          }
+          Collation collation = spec.buildCollation();
+          collationMap.put(collationId, collation);
+          return collation;
+        }
+      }
+
+      protected static SparkException collationInvalidNameException(String 
collationName) {
+        return new SparkException("COLLATION_INVALID_NAME",
+          SparkException.constructMessageParams(Map.of("collationName", 
collationName)), null);
+      }
+
+      private static int collationNameToId(String collationName) throws 
SparkException {
+        String collationNameUpper = collationName.toUpperCase();
+        if (collationNameUpper.startsWith("UTF8_BINARY")) {
+          return CollationSpecUTF8Binary.collationNameToId(collationName, 
collationNameUpper);
+        } else {
+          return CollationSpecICU.collationNameToId(collationName, 
collationNameUpper);
+        }
+      }
+
+      protected abstract Collation buildCollation();
     }
-  }
 
-  private static final Collation[] collationTable = new Collation[4];
-  private static final HashMap<String, Integer> collationNameToIdMap = new 
HashMap<>();
-
-  public static final int UTF8_BINARY_COLLATION_ID = 0;
-  public static final int UTF8_BINARY_LCASE_COLLATION_ID = 1;
-
-  static {
-    // Binary comparison. This is the default collation.
-    // No custom comparators will be used for this collation.
-    // Instead, we rely on byte for byte comparison.
-    collationTable[0] = new Collation(
-      "UTF8_BINARY",
-      null,
-      UTF8String::binaryCompare,
-      "1.0",
-      s -> (long)s.hashCode(),
-      true,
-      true,
-      false);
-
-    // Case-insensitive UTF8 binary collation.
-    // TODO: Do in place comparisons instead of creating new strings.
-    collationTable[1] = new Collation(
-      "UTF8_BINARY_LCASE",
-      null,
-      UTF8String::compareLowerCase,
-      "1.0",
-      (s) -> (long)s.toLowerCase().hashCode(),
-      false,
-      false,
-      true);
-
-    // UNICODE case sensitive comparison (ROOT locale, in ICU).
-    collationTable[2] = new Collation(
-      "UNICODE", Collator.getInstance(ULocale.ROOT), "153.120.0.0", true, 
false, false);
-    collationTable[2].collator.setStrength(Collator.TERTIARY);
-    collationTable[2].collator.freeze();
-
-    // UNICODE case-insensitive comparison (ROOT locale, in ICU + Secondary 
strength).
-    collationTable[3] = new Collation(
-      "UNICODE_CI", Collator.getInstance(ULocale.ROOT), "153.120.0.0", false, 
false, false);
-    collationTable[3].collator.setStrength(Collator.SECONDARY);
-    collationTable[3].collator.freeze();
-
-    for (int i = 0; i < collationTable.length; i++) {
-      collationNameToIdMap.put(collationTable[i].collationName, i);
+    private static class CollationSpecUTF8Binary extends CollationSpec {
+
+      private static final int CASE_SENSITIVITY_OFFSET = 0;
+      private static final int CASE_SENSITIVITY_MASK = 0b1;
+
+      private enum CaseSensitivity {
+        UNSPECIFIED, LCASE
+      }
+
+      private static final int UTF8_BINARY_COLLATION_ID =
+        new CollationSpecUTF8Binary(CaseSensitivity.UNSPECIFIED).collationId;
+      private static final int UTF8_BINARY_LCASE_COLLATION_ID =
+        new CollationSpecUTF8Binary(CaseSensitivity.LCASE).collationId;
+      protected static Collation UTF8_BINARY_COLLATION =
+        new 
CollationSpecUTF8Binary(CaseSensitivity.UNSPECIFIED).buildCollation();
+      protected static Collation UTF8_BINARY_LCASE_COLLATION =
+        new CollationSpecUTF8Binary(CaseSensitivity.LCASE).buildCollation();
+
+      private final int collationId;
+
+      private CollationSpecUTF8Binary(CaseSensitivity caseSensitivity) {
+        this.collationId =
+          SpecifierUtils.setSpecValue(0, CASE_SENSITIVITY_OFFSET, 
caseSensitivity);
+      }
+
+      private static int collationNameToId(String originalName, String 
collationName)
+          throws SparkException {
+        if (UTF8_BINARY_COLLATION.collationName.equals(collationName)) {
+          return UTF8_BINARY_COLLATION_ID;
+        } else if 
(UTF8_BINARY_LCASE_COLLATION.collationName.equals(collationName)) {
+          return UTF8_BINARY_LCASE_COLLATION_ID;
+        } else {
+          throw collationInvalidNameException(originalName);
+        }
+      }
+
+      private static CollationSpecUTF8Binary fromCollationId(int collationId) {
+        int caseConversionOrdinal = SpecifierUtils.getSpecValue(collationId,
+          CASE_SENSITIVITY_OFFSET, CASE_SENSITIVITY_MASK);
+        assert (SpecifierUtils.removeSpec(collationId,
+          CASE_SENSITIVITY_OFFSET, CASE_SENSITIVITY_MASK) == 0);
+        return new 
CollationSpecUTF8Binary(CaseSensitivity.values()[caseConversionOrdinal]);
+      }
+
+      @Override
+      protected Collation buildCollation() {
+        if (collationId == UTF8_BINARY_COLLATION_ID) {
+          return new Collation("UTF8_BINARY", null, UTF8String::binaryCompare, 
"1.0",
+            s -> (long) s.hashCode(), true, true, false);
+        } else {
+          return new Collation("UTF8_BINARY_LCASE", null, 
UTF8String::compareLowerCase, "1.0",
+            s -> (long) s.toLowerCase().hashCode(), false, false, true);
+        }
+      }
+    }
+
+    private static class CollationSpecICU extends CollationSpec {
+
+      private enum CaseSensitivity {
+        CS, CI
+      }
+
+      private enum AccentSensitivity {
+        AS, AI
+      }
+
+      private static final int CASE_SENSITIVITY_OFFSET = 17;
+      private static final int CASE_SENSITIVITY_MASK = 0b1;
+      private static final int ACCENT_SENSITIVITY_OFFSET = 16;
+      private static final int ACCENT_SENSITIVITY_MASK = 0b1;
+
+      // Array of locale names, each locale id corresponds to the index in 
this array
+      private static final String[] ICULocaleNames;
+
+      // Mapping of locale names to corresponding `ULocale` instance
+      private static final Map<String, ULocale> ICULocaleMap = new HashMap<>();
+
+      // Used to parse user input collation names which are converted to 
uppercase
+      private static final Map<String, String> ICULocaleMapUppercase = new 
HashMap<>();
+
+      // Reverse mapping of `ICULocaleNames`
+      private static final Map<String, Integer> ICULocaleToId = new 
HashMap<>();
+
+      private static final String ICU_COLLATOR_VERSION = "153.120.0.0";
+
+      static {
+        ICULocaleMap.put("UNICODE", ULocale.ROOT);
+        ULocale[] locales = Collator.getAvailableULocales();
+        for (ULocale locale : locales) {
+          if (locale.getVariant().isEmpty()) {
+            String language = locale.getLanguage();
+            assert (!language.isEmpty());
+            StringBuilder builder = new StringBuilder(language);
+            String script = locale.getScript();
+            if (!script.isEmpty()) {
+              builder.append('_');
+              builder.append(script);
+            }
+            String country = locale.getISO3Country();
+            if (!country.isEmpty()) {
+              builder.append('_');
+              builder.append(country);
+            }
+            String localeName = builder.toString();
+            // locale names are unique
+            assert (!ICULocaleMap.containsKey(localeName));
+            ICULocaleMap.put(localeName, locale);
+          }
+        }
+        for (String localeName : ICULocaleMap.keySet()) {
+          String localeUppercase = localeName.toUpperCase();
+          // locale names are unique case-insensitively
+          assert (!ICULocaleMapUppercase.containsKey(localeUppercase));
+          ICULocaleMapUppercase.put(localeUppercase, localeName);
+        }
+        ICULocaleNames = ICULocaleMap.keySet().toArray(new String[0]);
+        Arrays.sort(ICULocaleNames);
+        // maximum number of locale ids as defined by binary layout
+        assert (ICULocaleNames.length <= (1 << 12));
+        for (int i = 0; i < ICULocaleNames.length; ++i) {
+          ICULocaleToId.put(ICULocaleNames[i], i);
+        }
+      }
+
+      private static final int UNICODE_COLLATION_ID =
+        new CollationSpecICU("UNICODE", CaseSensitivity.CS, 
AccentSensitivity.AS).collationId;
+      private static final int UNICODE_CI_COLLATION_ID =
+        new CollationSpecICU("UNICODE", CaseSensitivity.CI, 
AccentSensitivity.AS).collationId;
+
+      private final CaseSensitivity caseSensitivity;
+      private final AccentSensitivity accentSensitivity;
+      private final String locale;
+      private final int collationId;
+
+      private CollationSpecICU(String locale, CaseSensitivity caseSensitivity,
+          AccentSensitivity accentSensitivity) {
+        this.locale = locale;
+        this.caseSensitivity = caseSensitivity;
+        this.accentSensitivity = accentSensitivity;
+        int collationId = ICULocaleToId.get(locale);
+        collationId = SpecifierUtils.setSpecValue(collationId, 
IMPLEMENTATION_PROVIDER_OFFSET,
+          ImplementationProvider.ICU);
+        collationId = SpecifierUtils.setSpecValue(collationId, 
CASE_SENSITIVITY_OFFSET,
+          caseSensitivity);
+        collationId = SpecifierUtils.setSpecValue(collationId, 
ACCENT_SENSITIVITY_OFFSET,
+          accentSensitivity);
+        this.collationId = collationId;
+      }
+
+      private static int collationNameToId(
+          String originalName, String collationName) throws SparkException {
+        // search for the longest locale match because specifiers are designed 
to be different from
+        // script tag and country code, meaning the only valid locale name 
match can be
+        // the longest one
+        int lastPos = -1;
+        for (int i = 1; i <= collationName.length(); i++) {
+          String localeName = collationName.substring(0, i);
+          if (ICULocaleMapUppercase.containsKey(localeName)) {
+            lastPos = i;
+          }
+        }
+        if (lastPos == -1) {
+          throw collationInvalidNameException(originalName);
+        } else {
+          String locale = collationName.substring(0, lastPos);
+          int collationId = 
ICULocaleToId.get(ICULocaleMapUppercase.get(locale));
+
+          // try all combinations of AS/AI and CS/CI
+          CaseSensitivity caseSensitivity;
+          AccentSensitivity accentSensitivity;
+          if (collationName.equals(locale) ||
+              collationName.equals(locale + "_AS") ||

Review Comment:
   The name is not expected to be in any particular case; we convert it to 
uppercase 
[here](https://github.com/apache/spark/pull/46180/files#diff-640c14aa5d7473df79b2435ce5a327dffcc16ca29354b153956b4f8d19fdb16cR278)
 before performing any comparisons.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

Reply via email to