This is an automated email from the ASF dual-hosted git repository. stoty pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/phoenix.git
commit e4861a3fe9e972a85cd7bd431ab886d19168a15f Author: Mate Szalay-Beko <sy...@apache.org> AuthorDate: Thu Nov 10 16:11:49 2022 +0100 PHOENIX-6818 Remove dependency on the i18n-util library i18n-util is not maintained anymore, but uses icu4j dependencies having CVE issues. To avoid these problems, I copied the relevant code from i18n-util and used the latest icu4j version. --- dev/release_files/LICENSE | 2 +- phoenix-core/pom.xml | 8 +- .../expression/function/CollationKeyFunction.java | 12 +- .../phoenix/expression/function/LowerFunction.java | 3 +- .../phoenix/expression/function/UpperFunction.java | 21 +- .../apache/phoenix/util/DeferredStringBuilder.java | 135 +++ .../apache/phoenix/util/i18n/LinguisticSort.java | 1172 ++++++++++++++++++++ .../org/apache/phoenix/util/i18n/LocaleUtils.java | 86 ++ .../org/apache/phoenix/util/i18n/OracleUpper.java | 82 ++ .../apache/phoenix/util/i18n/OracleUpperTable.java | 337 ++++++ .../org/apache/phoenix/util/i18n/package-info.java | 27 + .../phoenix/util/i18n/LinguisticSortTest.java | 650 +++++++++++ .../util/i18n/OracleUpperTableGeneratorTest.java | 391 +++++++ pom.xml | 13 +- 14 files changed, 2912 insertions(+), 27 deletions(-) diff --git a/dev/release_files/LICENSE b/dev/release_files/LICENSE index 4577518c7a..c3c68268f8 100644 --- a/dev/release_files/LICENSE +++ b/dev/release_files/LICENSE @@ -254,7 +254,7 @@ Janino Compiler (https://github.com/janino-compiler/janino) Hamcrest-core 1.3 (http://www.hamcrest.org) Copyright (c) 2000-2006, www.hamcrest.org -i18n-util 1.0.1 (https://github.com/salesforce/i18n-util) Copyright (c) 2017, Salesforce.com, Inc. All rights reserved. +icu4j (https://github.com/unicode-org/icu) Copyright (c) 2016 and later Unicode, Inc. and others. All Rights Reserved. 
--- diff --git a/phoenix-core/pom.xml b/phoenix-core/pom.xml index 3df5ef68a2..f1079105d3 100644 --- a/phoenix-core/pom.xml +++ b/phoenix-core/pom.xml @@ -549,8 +549,12 @@ <artifactId>stream</artifactId> </dependency> <dependency> - <groupId>com.salesforce.i18n</groupId> - <artifactId>i18n-util</artifactId> + <groupId>com.ibm.icu</groupId> + <artifactId>icu4j</artifactId> + </dependency> + <dependency> + <groupId>com.ibm.icu</groupId> + <artifactId>icu4j-localespi</artifactId> </dependency> <dependency> <groupId>com.lmax</groupId> diff --git a/phoenix-core/src/main/java/org/apache/phoenix/expression/function/CollationKeyFunction.java b/phoenix-core/src/main/java/org/apache/phoenix/expression/function/CollationKeyFunction.java index f5cbdc4557..676b6460df 100644 --- a/phoenix-core/src/main/java/org/apache/phoenix/expression/function/CollationKeyFunction.java +++ b/phoenix-core/src/main/java/org/apache/phoenix/expression/function/CollationKeyFunction.java @@ -35,11 +35,11 @@ import org.apache.phoenix.schema.types.PInteger; import org.apache.phoenix.schema.types.PVarbinary; import org.apache.phoenix.schema.types.PVarchar; import org.apache.phoenix.util.VarBinaryFormatter; +import org.apache.phoenix.util.i18n.LinguisticSort; +import org.apache.phoenix.util.i18n.LocaleUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.force.db.i18n.LinguisticSort; -import com.force.i18n.LocaleUtils; /** * A Phoenix Function that calculates a collation key for an input string based @@ -51,10 +51,12 @@ import com.force.i18n.LocaleUtils; * are all valid locale representations. Note the language code, country code * and variant are used as arguments to the constructor of java.util.Locale. * - * This function uses the open-source i18n-util package to obtain the collators - * it needs from the provided locale. + * This function originally used the open-source i18n-util package to obtain the + * collators it needs from the provided locale. 
As i18n-util is not maintained + * anymore, the relevant parts from it were copied into Phoenix. + * See: https://issues.apache.org/jira/browse/PHOENIX-6818 * - * The LinguisticSort implementation in i18n-util encapsulates sort-related + * The LinguisticSort implementation from i18n-util encapsulates sort-related * functionality for a substantive list of locales. For each locale, it provides * a collator and an Oracle-specific database function that can be used to sort * strings according to the natural language rules of that locale. diff --git a/phoenix-core/src/main/java/org/apache/phoenix/expression/function/LowerFunction.java b/phoenix-core/src/main/java/org/apache/phoenix/expression/function/LowerFunction.java index f444d36b5f..264ebfbb79 100644 --- a/phoenix-core/src/main/java/org/apache/phoenix/expression/function/LowerFunction.java +++ b/phoenix-core/src/main/java/org/apache/phoenix/expression/function/LowerFunction.java @@ -30,8 +30,7 @@ import org.apache.phoenix.parse.FunctionParseNode; import org.apache.phoenix.schema.tuple.Tuple; import org.apache.phoenix.schema.types.PDataType; import org.apache.phoenix.schema.types.PVarchar; - -import com.force.i18n.LocaleUtils; +import org.apache.phoenix.util.i18n.LocaleUtils; @FunctionParseNode.BuiltInFunction(name=LowerFunction.NAME, args={ @FunctionParseNode.Argument(allowedTypes={PVarchar.class}), diff --git a/phoenix-core/src/main/java/org/apache/phoenix/expression/function/UpperFunction.java b/phoenix-core/src/main/java/org/apache/phoenix/expression/function/UpperFunction.java index 0969269ba6..56a228c6dd 100644 --- a/phoenix-core/src/main/java/org/apache/phoenix/expression/function/UpperFunction.java +++ b/phoenix-core/src/main/java/org/apache/phoenix/expression/function/UpperFunction.java @@ -1,11 +1,10 @@ /* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * @@ -15,7 +14,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - package org.apache.phoenix.expression.function; import java.io.DataInput; @@ -25,15 +23,12 @@ import java.util.List; import java.util.Locale; import org.apache.hadoop.hbase.io.ImmutableBytesWritable; - import org.apache.phoenix.expression.Expression; import org.apache.phoenix.parse.FunctionParseNode; +import org.apache.phoenix.schema.tuple.Tuple; import org.apache.phoenix.schema.types.PDataType; import org.apache.phoenix.schema.types.PVarchar; - -import com.force.i18n.LocaleUtils; - -import org.apache.phoenix.schema.tuple.Tuple; +import org.apache.phoenix.util.i18n.LocaleUtils; @FunctionParseNode.BuiltInFunction(name=UpperFunction.NAME, args={ @FunctionParseNode.Argument(allowedTypes={PVarchar.class}), diff --git a/phoenix-core/src/main/java/org/apache/phoenix/util/DeferredStringBuilder.java b/phoenix-core/src/main/java/org/apache/phoenix/util/DeferredStringBuilder.java new file mode 100644 index 0000000000..45dec5c22e --- /dev/null +++ b/phoenix-core/src/main/java/org/apache/phoenix/util/DeferredStringBuilder.java @@ -0,0 +1,135 @@ +/* + * Licensed to the Apache 
Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.phoenix.util; + +/** + * This utility class was partially copied from Salesforce's internationalization utility library + * (com.salesforce.i18n:i18n-util:1.0.4), which was released under the 3-clause BSD License. + * The i18n-util library is not maintained anymore, and it was using vulnerable dependencies. + * For more info, see: https://issues.apache.org/jira/browse/PHOENIX-6818 + * + * This class implements a StringBuilder that is incrementally copied from a source String. + * Actual creation of the new buffer is deferred until a character differs from a character at + * the same position in the source String. This class is useful for reducing garbage creation + * when doing operations like escaping a String, when most Strings are not expected to contain + * any escapable characters. In that case, no additional memory is used (as the original + * String is not actually copied).
+ */ +public final class DeferredStringBuilder implements Appendable, CharSequence { + + // Real buffer; stays null for as long as every appended character matched the source. + private StringBuilder buf; + // Number of leading source characters matched so far; no longer updated once buf exists. + private int pos; + private final CharSequence source; + + /** + * @param source the sequence that appended content is compared against; if null, a real + * buffer is allocated right away and nothing is deferred + */ + public DeferredStringBuilder(CharSequence source) { + if (source == null) { + this.buf = new StringBuilder(16); + } + this.source = source; + } + + /** + * Appends one character. While no buffer exists, a character equal to the next source + * character only advances the match position; the first mismatch allocates the buffer. + */ + @Override + public DeferredStringBuilder append(char c) { + if (this.buf == null) { + if (this.pos < this.source.length() && c == this.source.charAt(this.pos)) { + // characters match - just move ahead + ++this.pos; + } else { + // doh - character mismatch - now we need to allocate a real StringBuilder + this.buf = new StringBuilder(this.source.length() + 16); + this.buf.append(this.source.subSequence(0, this.pos)); + this.buf.append(c); + } + } else { + // we've already got the buf - just add this character + this.buf.append(c); + } + return this; + } + + /** + * Appends the whole sequence. Note that, unlike the general Appendable contract, + * a null argument is ignored instead of appending the string "null". + */ + @Override + public DeferredStringBuilder append(CharSequence csq) { + if (csq == null) { + return this; + } + return append(csq, 0, csq.length()); + } + + /** + * Appends the characters of csq from start (inclusive) to end (exclusive). + * A null csq is ignored. + */ + @Override + public DeferredStringBuilder append(CharSequence csq, int start, int end) { + if (csq != null) { + if (buf == null) { + int chars = end - start; + // For small strings or overflow, do it char by char. + if (chars < 10 || (this.pos + chars > this.source.length())) { + for (int i = start; i < end; ++i) { + append(csq.charAt(i)); + } + } else { + CharSequence subSeq = csq.subSequence(start, end); + // String.equals seems to get optimized a lot quicker than the + // charAt + length + loop method. I don't think this will matter at all, + // but between this and OptimizedURLEncoder, this made these classes + // disappear from my profiler. + // CharSequence does not define equals(), so compare by content; otherwise a + // non-String source or csq would never match and always force a buffer copy. + if (this.source.subSequence(this.pos, this.pos + chars).toString() + .contentEquals(subSeq)) { + this.pos += chars; + } else { + this.buf = new StringBuilder(this.source.length() + 16); + this.buf.append(this.source.subSequence(0, this.pos)); + this.buf.append(subSeq); + } + } + } else { + // We know it's different, so just append the whole string.
+ buf.append(csq, start, end); + } + } + return this; + } + + /** + * Returns the character at the given index of the content appended so far. + * + * @throws StringIndexOutOfBoundsException if no buffer exists and index is not below the + * number of matched characters + */ + @Override + public char charAt(int index) { + if (this.buf != null) { + return this.buf.charAt(index); + } else if (index < pos) { + return this.source.charAt(index); + } else { + throw new StringIndexOutOfBoundsException(index); + } + } + + /** + * Returns a sub-sequence of the content appended so far. + * + * @throws StringIndexOutOfBoundsException if no buffer exists and end exceeds the number of + * matched characters + */ + @Override + public CharSequence subSequence(int start, int end) { + if (this.buf != null) { + return this.buf.subSequence(start, end); + } else if (end <= pos) { + return this.source.subSequence(start, end); + } else { + throw new StringIndexOutOfBoundsException(end); + } + } + + /** + * Returns the appended content. While nothing differed from the source, this is + * source.toString() (source fully matched) or the matched prefix of the source. + */ + @Override + public String toString() { + if (this.buf != null) { + return this.buf.toString(); + } + if (this.pos == this.source.length()) { + return this.source.toString(); + } + return this.source.subSequence(0, this.pos).toString(); + } + + /** + * Returns the number of characters appended so far. + */ + @Override + public int length() { + return this.buf != null ? this.buf.length() : this.pos; + } +} diff --git a/phoenix-core/src/main/java/org/apache/phoenix/util/i18n/LinguisticSort.java b/phoenix-core/src/main/java/org/apache/phoenix/util/i18n/LinguisticSort.java new file mode 100644 index 0000000000..c1881c6440 --- /dev/null +++ b/phoenix-core/src/main/java/org/apache/phoenix/util/i18n/LinguisticSort.java @@ -0,0 +1,1172 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.phoenix.util.i18n; + +import java.text.CollationKey; +import java.text.Collator; +import java.text.MessageFormat; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; + +import org.apache.phoenix.util.DeferredStringBuilder; + +import com.ibm.icu.impl.jdkadapter.CollatorICU; +import com.ibm.icu.text.AlphabeticIndex; +import com.ibm.icu.util.ULocale; + +import edu.umd.cs.findbugs.annotations.SuppressWarnings; + + +/** + * This utility class was partially copied from Salesforce's internationalization utility library + * (com.salesforce.i18n:i18n-util:1.0.4), which was released under the 3-clause BSD License. + * The i18n-util library is not maintained anymore, and it was using vulnerable dependencies. + * For more info, see: https://issues.apache.org/jira/browse/PHOENIX-6818 + * + * Contains all the information about linguistic sorting. + * The intent of this is to provide the SQL changes to the RDBMS to ensure + * that the sorting uses the locale provided in Java, and to make sure that + * the collation in Java will correspond as much as possible to what is in the + * DB. + * + * Rolodex is a feature in alphabetic/syllabary languages to restrict the set + * of rows in a list to those that start with a certain letter. In SQL + * this is usually LIKE 'A%', which will include different letters. + * + * To get the list of valid nls_sorts, run this in oracle + * select value from v$nls_valid_values where parameter='SORT'; + */ +public enum LinguisticSort { + // English: + // Using oracle's upper() function to sort; digits come before letters, + // '[' is the lowest character after 'Z'. 
// balance-] + ENGLISH(Locale.ENGLISH, "[", false, false, LinguisticSort.Alphabets.STRING), // balance-] + + // German: + // Using oracle's nlssort() function to sort; digits come right after letters. + GERMAN(new Locale("de"), LinguisticSort.Alphabets.GERMAN, "0", true, false, + "nlssort({0}, ''nls_sort=xgerman'')"), + + // French: + // Using oracle's nlssort() function to sort; digits come right after letters. + FRENCH(new Locale("fr"), "0", false, false, "nlssort({0}, ''nls_sort=xfrench'')"), + + // Italian: + // Using oracle's nlssort() function to sort; digits come right after letters. + ITALIAN(new Locale("it"), "0", false, false, "nlssort({0}, ''nls_sort=italian'')"), + + // Spanish: + // Using oracle's nlssort() function to sort; digits come right after letters. + // Alphabet consists of A-Z plus N-tilde. However, CH and LL are not considered + // letters, so do not use Oracle's xspanish nlssort. + SPANISH(new Locale("es"), "0", false, false, "nlssort({0}, ''nls_sort=spanish'')"), + + // Catalan: + // Using oracle's nlssort() function to sort; digits come before letters, + // nothing sorts after the last legal catalan character. + CATALAN(new Locale("ca"), LinguisticSort.Alphabets.CATALAN, "0", true, false, + "nlssort({0}, ''nls_sort=catalan'')"), + + // Dutch: + // Using oracle's nlssort() function to sort; digits come right after letters. + DUTCH(new Locale("nl"), "0", false, false, "nlssort({0}, ''nls_sort=dutch'')"), + + // Portuguese: + // Using oracle's nlssort() function to sort; digits come right after letters. + PORTUGUESE(new Locale("pt"), "0", false, false, "nlssort({0}, ''nls_sort=west_european'')"), + + // Danish: + // Alphabet consists of A-Z followed by AE, O-stroke, and A-ring. + // Using oracle's nlssort() function to sort; digits come right after letters. + DANISH(new Locale("da"), "0", false, false, "nlssort({0}, ''nls_sort=danish'')"), + + // Norwegian: + // Alphabet consists of A-Z followed by AE, O-stroke, and A-ring. 
+ // Using oracle's nlssort() function to sort; digits come right after letters. + NORWEGIAN(new Locale("no"), "0", false, false, + "nlssort({0}, ''nls_sort=norwegian'')"), + + // Swedish: + // Alphabet consists of A-Z followed by A-ring, A-diaeresis, and O-diaeresis. + // Using oracle's nlssort() function to sort; digits come before letters, + // nothing sorts after the last legal swedish character. + SWEDISH(new Locale("sv"), null, false, false, + "nlssort({0}, ''nls_sort=swedish'')"), + + // Finnish: + // Alphabet consists of A-Z, minus W, followed by A-ring, A-diaeresis, and O-diaeresis. + // We leave out W so that V's show up properly (bug #151961/W-513969) + // Using oracle's nlssort() function to sort; digits come right after letters. + FINNISH(new Locale("fi"), + new String[] { + "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", + "Q", "R", "S", "T", "U", "V", "X", "Y", "Z", "\u00C5", "\u00C4", "\u00D6" }, + "0", false, false, "nlssort({0}, ''nls_sort=finnish'')"), + + // Czech: + // Alphabet consists of many Czech letters but not all english letters. + // Using oracle's nlssort() function to sort; digits come right after letters. + CZECH(new Locale("cs"), "0", true, false, + "nlssort({0}, ''nls_sort=xczech'')"), + + // Polish: + // Alphabet consists of many Polish letters but not all english letters. + // Using oracle's nlssort() function to sort. + POLISH(new Locale("pl"), "\u00DF", false, false, + "nlssort({0}, ''nls_sort=polish'')"), + + // Turkish: + // Use Turkish alphabet, which also indicates special handling in getUpperCaseValue(). + // Using oracle's nlssort() function to sort. + TURKISH(new Locale("tr"), LinguisticSort.Alphabets.TURKISH, null, false, false, + "nlssort({0}, ''nls_sort=xturkish'')"), + + // Traditional chinese: + // Use English alphabet. Using oracle's nlssort() function to sort by stroke. 
+ CHINESE_HK(new Locale("zh", "HK"), LinguisticSort.Alphabets.ENGLISH, "\u03B1", true, true, + "nlssort({0}, ''nls_sort=tchinese_radical_m'')"), + CHINESE_HK_STROKE(new Locale("zh", "HK", "STROKE"), LinguisticSort.Alphabets.ENGLISH, "\u03B1", + true, true, "nlssort({0}, ''nls_sort=tchinese_stroke_m'')"), + + CHINESE_TW(new Locale("zh", "TW"), LinguisticSort.Alphabets.ENGLISH, "\u03B1", true, true, + "nlssort({0}, ''nls_sort=tchinese_radical_m'')"), + CHINESE_TW_STROKE(new Locale("zh", "TW", "STROKE"), LinguisticSort.Alphabets.ENGLISH, "\u03B1", + true, true, "nlssort({0}, ''nls_sort=tchinese_stroke_m'')"), + + + // Simplified chinese: + // Use English alphabet. Using oracle's nlssort() function to sort by pinyin. + CHINESE(new Locale("zh"), LinguisticSort.Alphabets.ENGLISH, "\u03B1", true, true, + "nlssort({0}, ''nls_sort=schinese_radical_m'')"), + CHINESE_STROKE(new Locale("zh", "", "STROKE"), LinguisticSort.Alphabets.ENGLISH, "\u03B1", + true, true, + "nlssort({0}, ''nls_sort=schinese_stroke_m'')"), + CHINESE_PINYIN(new Locale("zh", "", "PINYIN"), LinguisticSort.Alphabets.ENGLISH, "\u03B1", + true, true, + "nlssort({0}, ''nls_sort=schinese_pinyin_m'')"), + + + // Japanese: + // Japanese alphabet. Using oracle's nlssort() function to sort. Special rolodex handling + JAPANESE(new Locale("ja"), LinguisticSort.Alphabets.JAPANESE, null, true, true, + "nlssort({0}, ''nls_sort=japanese_m'')"), + + // Korean: + // Use English alphabet. Using oracle's nlssort() function to sort. + KOREAN(new Locale("ko"), LinguisticSort.Alphabets.ENGLISH, "\u03B1", true, true, + "nlssort({0}, ''nls_sort=korean_m'')"), + + // Russian: + // Using oracle's nlssort() function to sort. + RUSSIAN(new Locale("ru"), null, false, false, + "nlssort({0}, ''nls_sort=russian'')"), + + // Bulgarian: + // Using oracle's nlssort() function to sort. 
+ BULGARIAN(new Locale("bg"), LinguisticSort.Alphabets.BULGARIAN, null, true, false, + "nlssort({0}, ''nls_sort=bulgarian'')"), + + // Indonesian + // Using oracle's nlssort() function to sort. + INDONESIAN(new Locale("in"), null, true, false, "nlssort({0}, ''nls_sort=indonesian'')"), + + // Romanian: + // Using oracle's nlssort() function to sort. + ROMANIAN(new Locale("ro"), + new String[] { "A", "\u0102", "\u00c2", "B", "C", "D", "E", "F", "G", "H", "I", + "\u00ce", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "\u015e", "T", + "\u0162", "U", "V", "W", "X", "Y", "Z" }, + null, true, false, "nlssort({0}, ''nls_sort=romanian'')"), + + // Vietnamese + // Using oracle's nlssort() function to sort. + VIETNAMESE(new Locale("vi"), + new String[] { + "A", "\u0102", "\u00c2", "B", "C", "D", "\u0110", "E", "\u00ca", "G", "H", + "I", "K", "L", "M", "N", "O", "\u00d4", "\u01a0", "P", "Q", "R", "S", "T", + "U", "\u01af", "V", "X", "Y" }, + null, false, false, "nlssort({0}, ''nls_sort=vietnamese'')"), + + // Ukrainian: + // Using oracle's nlssort() function to sort. + UKRAINIAN(new Locale("uk"), null, false, false, "nlssort({0}, ''nls_sort=ukrainian'')"), + + // Hungarian: + // Using oracle's nlssort() function to sort. + HUNGARIAN(new Locale("hu"), LinguisticSort.Alphabets.HUNGARIAN, null, false, false, + "nlssort({0}, ''nls_sort=xhungarian'')"), + + // Greek: + // Using oracle's nlssort() function to sort. + GREEK(new Locale("el"), null, false, false, "nlssort({0}, ''nls_sort=greek'')"), + + // Hebrew: + // Using oracle's nlssort() function to sort. + HEBREW(new Locale("iw"), null, true, false, "nlssort({0}, ''nls_sort=hebrew'')"), + + // Slovak: + // Using oracle's nlssort() function to sort. 
+ SLOVAK(new Locale("sk"), LinguisticSort.Alphabets.SLOVAK, null, true, false, + "nlssort({0}, ''nls_sort=slovak'')"), + + // Serbian (cyrillic): + // Using oracle's nlssort() function to sort using its default + SERBIAN_CYRILLIC(new Locale("sr"), null, false, false, + "nlssort({0}, ''nls_sort=generic_m'')"), + + // Serbian (latin): + // Using oracle's nlssort() function to sort with the xcroatian sort + SERBIAN_LATIN(new Locale("sh"), LinguisticSort.Alphabets.SERBIAN_LATIN, null, false, false, + "nlssort({0}, ''nls_sort=xcroatian'')"), + + // Bosnian: + // Using oracle's nlssort() function to sort with the xcroatian sort + BOSNIAN(new Locale("bs"), LinguisticSort.Alphabets.SERBIAN_LATIN, null, false, false, + "nlssort({0}, ''nls_sort=xcroatian'')"), + + + // Georgian: + // Using oracle's nlssort() function to sort, even though we're using binary for this. + GEORGIAN(new Locale("ka"), LinguisticSort.Alphabets.GEORGIAN, null, false, false, + "nlssort({0}, ''nls_sort=binary'')"), + + // BASQUE: + // Using oracle's nlssort() function to sort. + BASQUE(new Locale("eu"), LinguisticSort.Alphabets.BASQUE, null, false, false, + "nlssort({0}, ''nls_sort=west_european'')"), + + // MALTESE: + // Using oracle's nlssort() function to sort. + MALTESE(new Locale("mt"), null, false, false, "nlssort({0}, ''nls_sort=west_european'')"), + + // ROMANSH: + // Using oracle's nlssort() function to sort. + ROMANSH(new Locale("rm"), null, false, false, "nlssort({0}, ''nls_sort=west_european'')"), + + // LUXEMBOURGISH: + // Using oracle's nlssort() function to sort. + LUXEMBOURGISH(new Locale("lb"), LinguisticSort.Alphabets.LUXEMBOURGISH, null, false, false, + "nlssort({0}, ''nls_sort=west_european'')"), + + // IRISH: + // Using oracle's nlssort() function to sort. + IRISH(new Locale("ga"), null, false, false, "nlssort({0}, ''nls_sort=west_european'')"), + + // Slovenian: + // Using oracle's nlssort() function to sort.
+ SLOVENE(new Locale("sl"), LinguisticSort.Alphabets.SLOVENE, null, false, false, + "nlssort({0}, ''nls_sort=xslovenian'')"), + + // Croatian: + // Using oracle's nlssort() function to sort. + CROATIAN(new Locale("hr"), LinguisticSort.Alphabets.SERBIAN_LATIN, null, false, false, + "nlssort({0}, ''nls_sort=xcroatian'')"), + + // Malay + // Using oracle's nlssort() function to sort. + // We're assuming people are using the english alphabet, + // and not the arabic one (Bahasa Melayu) + MALAY(new Locale("ms"), null, true, false, "nlssort({0}, ''nls_sort=malay'')"), + + // Arabic: + // Using oracle's nlssort() function to sort. + ARABIC(new Locale("ar"), null, false, false, "nlssort({0}, ''nls_sort=arabic'')"), + + // Estonian: + // Using oracle's nlssort() function to sort. + ESTONIAN(new Locale("et"), LinguisticSort.Alphabets.ESTONIAN, null, true, false, + "nlssort({0}, ''nls_sort=estonian'')"), + + // Icelandic: + // Using oracle's nlssort() function to sort. + ICELANDIC(new Locale("is"), LinguisticSort.Alphabets.ICELANDIC, null, true, false, + "nlssort({0}, ''nls_sort=icelandic'')"), + + // Latvian: + // Using oracle's nlssort() function to sort. + LATVIAN(new Locale("lv"), LinguisticSort.Alphabets.LATVIAN, null, false, false, + "nlssort({0}, ''nls_sort=latvian'')"), + + // Lithuanian: + // Using oracle's nlssort() function to sort. + LITHUANIAN(new Locale("lt"), LinguisticSort.Alphabets.LITHUANIAN, null, false, false, + "nlssort({0}, ''nls_sort=lithuanian'')"), + + + // Languages not supported fully. 
+ KYRGYZ(new Locale("ky"), LinguisticSort.Alphabets.KYRGYZ, null, true, false, + "nlssort({0}, ''nls_sort=binary'')"), + + KAZAKH(new Locale("kk"), LinguisticSort.Alphabets.KAZAKH, null, true, false, + "nlssort({0}, ''nls_sort=binary'')"), + + TAJIK(new Locale("tg"), LinguisticSort.Alphabets.TAJIK, null, true, false, + "nlssort({0}, ''nls_sort=russian'')"), + + BELARUSIAN(new Locale("be"), null, true, false, "nlssort({0}, ''nls_sort=russian'')"), + + TURKMEN(new Locale("tk"), LinguisticSort.Alphabets.TURKISH, null, false, false, + "nlssort({0}, ''nls_sort=xturkish'')"), + + AZERBAIJANI(new Locale("az"), LinguisticSort.Alphabets.AZERBAIJANI, null, false, false, + "nlssort({0}, ''nls_sort=xturkish'')"), + + ARMENIAN(new Locale("hy"), null, true, false, "nlssort({0}, ''nls_sort=binary'')"), + + THAI(new Locale("th"), null, true, false, "nlssort({0}, ''nls_sort=thai_dictionary'')"), + + // Binary? really + HINDI(new Locale("hi"), null, true, false, "nlssort({0}, ''nls_sort=binary'')"), + + URDU(new Locale("ur"), LinguisticSort.Alphabets.URDU, null, false, false, + "nlssort({0}, ''nls_sort=arabic'')"), + + // Bengali + BENGALI(new Locale("bn"), LinguisticSort.Alphabets.BENGALI, null, true, false, + "nlssort({0}, ''nls_sort=bengali'')"), + + TAMIL(new Locale("ta"), LinguisticSort.Alphabets.TAMIL, null, true, false, + "nlssort({0}, ''nls_sort=binary'')"), + + // Unused language for testing; Alphabet and sorting defaults to English + ESPERANTO(new Locale("eo"), LinguisticSort.Alphabets.ENGLISH, "[", false, false, + LinguisticSort.Alphabets.STRING); + + private static final Map<Locale, LinguisticSort> BY_LOCALE = getByLocaleInfo(); + + /** + * Create the map that will be stuffed into BY_LOCALE. We have to fully create an object + * THEN stuff into a final field in a constructor (as unmodifiableMap does below) in order + * to get a proper guarantee from Java's memory model. 
+ * + * See http://jeremymanson.blogspot.com/2008/07/immutability-in-java-part-2.html + */ + private static Map<Locale, LinguisticSort> getByLocaleInfo() { + final Map<Locale, LinguisticSort> byLocaleInfo = new HashMap<Locale, LinguisticSort>(64); + for (LinguisticSort sort : values()) { + LinguisticSort duplicated = byLocaleInfo.put(sort.getLocale(), sort); + assert duplicated == null : "Two linguistic sorts with the same locale: " + + sort.getLocale(); + } + return Collections.unmodifiableMap(byLocaleInfo); + } + + /** + * Get sorting info for the given locale. + * + * Resolution order: exact match in BY_LOCALE; if the locale has a variant, retry without it + * (a Chinese locale that also has a country is first retried as language + variant, so that + * e.g. zh_SG_STROKE resolves like zh__STROKE); if the locale has a country, retry with the + * bare language. Returns ENGLISH when nothing matches or IS_MULTI_LINGUAL is false. + * + * @param locale the locale to look up + * @return the LinguisticSort to use for the locale + */ + public static LinguisticSort get(Locale locale) { + // For non-UTF8 dbs, we always interpret everything as English. (We did not set + // the page encoding to UTF-8, and thus we may have incorrectly encoded data.) + // On all other instances, look for the language of the user's locale. This should + // succeed because every language we support is listed in data. But just in case, + // default to english also. + if (IS_MULTI_LINGUAL /*|| TestContext.isRunningTests()*/) { + LinguisticSort sort = BY_LOCALE.get(locale); + if (sort != null) { + return sort; + } + if (locale.getVariant().length() > 0) { + if ("zh".equals(locale.getLanguage())) { + // TW and HK are handled above, this handles SG + // The check must be on the country: testing the language here is always true + // and makes a zh locale with an unknown variant recurse on itself forever. + if (!"".equals(locale.getCountry())) { + // This means it's standard. + return get(new Locale(locale.getLanguage(), "", locale.getVariant())); + } + } + // Drop the variant and retry with language + country only. + return get(new Locale(locale.getLanguage(), locale.getCountry())); + } + if (locale.getCountry().length() > 0) { + sort = BY_LOCALE.get(new Locale(locale.getLanguage())); + if (sort != null) { + return sort; + } + } + } + return ENGLISH; + } + + /** + * The locale for this LinguisticSort instance. + */ + private final Locale locale; + + /** + * Collator for this LinguisticSort instance. This may be different than the + * default collator for its locale. This is to better match Oracle's nls sort + * ordering.
+ */ + private final Collator collator; + + /** + * Array of letters (Strings) to show in the rolodex. An empty array for + * alphabet means that the rolodex is not supported for the locale. + */ + private final String[] alphabet; + + /** + * An optional String that sorts higher than all letters in the alphabet. + * Used when generating the rolodex sql for the last letter. + */ + private final String highValue; + + /** + * True if normal secondary sorting is reversed, i.e., if lower case letters + * are sorted before upper case. + */ + private final boolean reverseSecondary; + + /** + * True if the locale has double width alphabet, number or symbols, + * so we use Oracle's to_single_byte to convert into the half width letter. + */ + private final boolean hasDoubleWidth; + + /** + * A MessageFormat pattern for generating an oracle sql expression returning the + * collation key for sorting a sql expression. Not used by postgres. + */ + private final String collationKeySql; + + /** + * For upper-casing Java values and generating SQL to generate the same. Not used by postgres. + */ + private final OracleUpperTable upper; + + /** + * Constructor only used when building static data, where ICU should be used to derive the + * value for the alphabet. Delegates to the full constructor, passing the result of + * getAlphabetFromICU(locale) as the alphabet. + */ + LinguisticSort(Locale locale, String highValue, boolean reverseSecondary, + boolean hasDoubleWidth, String collationKeySql) { + this(locale, getAlphabetFromICU(locale), highValue, reverseSecondary, + hasDoubleWidth, collationKeySql); + } + + /** + * Mapping for locales and ULocale language tags to use for constructing an ICU4J collator. + * javac complains if we attempt to refer to a static defined inside the same class as an enum, + * so we need to use an inner class to have such a constant mapping.
+ */ + private static final class Icu4jCollatorOverrides { + static final Map<Locale, String> OVERRIDES = getIcu4jCollatorOverrides(); + + /** + * ICU4J collator overrides + */ + private static Map<Locale, String> getIcu4jCollatorOverrides() { + // Map between a Locale and a BCP47 language tag to use when calling ICU4J's + // Collator.getInstance(ULocale.forLanguageTag()). + Map<Locale, String> overrides = new HashMap<Locale, String>(7); + + // Built-in JDK collators for Chinese are behind the Unicode standard, so we need to + // override them. See discussion at + // https://stackoverflow.com/questions/33672422 + // /wrong-sorting-with-collator-using-locale-simplified-chinese + // Also see the following JDK collator bugs: + // https://bugs.openjdk.java.net/browse/JDK-6415666 + // https://bugs.openjdk.java.net/browse/JDK-2143916 + // https://bugs.openjdk.java.net/browse/JDK-6411864 + + // CHINESE_HK: + overrides.put(new Locale("zh", "HK"), "zh-HK-u-co-unihan"); + // CHINESE_HK_STROKE: + overrides.put(new Locale("zh", "HK", "STROKE"), "zh-HK-u-co-stroke"); + // CHINESE_TW: + overrides.put(new Locale("zh", "TW"), "zh-TW-u-co-unihan"); + // CHINESE_TW_STROKE: + overrides.put(new Locale("zh", "TW", "STROKE"), "zh-TW-u-co-stroke"); + // CHINESE: + overrides.put(new Locale("zh"), "zh-CN-u-co-unihan"); + // CHINESE_STROKE: + overrides.put(new Locale("zh", "", "STROKE"), "zh-CN-u-co-stroke"); + // CHINESE_PINYIN: + overrides.put(new Locale("zh", "", "PINYIN"), "zh-CN-u-co-pinyin"); + + return Collections.unmodifiableMap(overrides); + } + } + + /** + * Constructor only used when building static data + */ + LinguisticSort(Locale locale, String[] alphabet, String highValue, boolean reverseSecondary, + boolean hasDoubleWidth, String collationKeySql) { + this.locale = locale; + this.alphabet = alphabet; + this.highValue = highValue; + assert this.highValue == null || this.highValue.length() == 1; + this.reverseSecondary = reverseSecondary; + this.hasDoubleWidth = hasDoubleWidth; 
+ this.collationKeySql = collationKeySql; + // Construct collator for this locale + if (LinguisticSort.Icu4jCollatorOverrides.OVERRIDES.containsKey(this.locale)) { + // Force ICU4J collators for specific locales so they match Oracle sort + this.collator = CollatorICU.wrap(com.ibm.icu.text.Collator.getInstance( + ULocale.forLanguageTag(LinguisticSort + .Icu4jCollatorOverrides.OVERRIDES.get(this.locale)))); + } else if (this.locale.getVariant().length() > 0) { + // If there's a variant, use ICU4J to figure it out. + this.collator = CollatorICU.wrap(com.ibm.icu.text.Collator.getInstance( + ULocale.forLocale(this.locale))); + } else { + this.collator = Collator.getInstance(this.locale); + } + this.collator.setStrength(Collator.SECONDARY); + this.upper = OracleUpperTable.forLinguisticSort(name()); + } + + /** + * @return a new collator for this LinguisticSort instance. + */ + public Collator getCollator() { + // Since RuleBasedCollator.compare() is synchronized, it is not nice to return + // this.collator here, because that would mean requests for the same language + // will be waiting for each other. Instead, return a clone. And, cloning + // RuleBasedCollator instances is much more efficient than creating one from + // the rules. + return (Collator) this.collator.clone(); + } + + /** + * @return a new collator for this LinguisticSort instance that is guaranteed to be + * case-insensitive. Danish collation, unfortunately, is a little odd, in that "v" + * and "w" are considered to be the same character. To make up for this, they made + * "v" and "V" a secondary difference, which makes Enum comparisons in FilterItem + * a little wonky. 
http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4174436 + */ + public Collator getGuaranteedCaseInsensitiveCollator() { + Collator collator = getCollator(); + if ("da".equals(this.locale.getLanguage())) { + collator.setStrength(Collator.PRIMARY); + } + return collator; + } + + Locale getLocale() { + return this.locale; + } + + /** + * @return a new comparator for strings for this LinguisticSort instance. + */ + @SuppressWarnings("unchecked") + // Converting from Comparator<Object> to Comparator<String> + public Comparator<String> getNonCachingComparator() { + return (Comparator<String>) this.collator.clone(); + } + + /** + * @return a new comparator for strings for this LinguisticSort instance. + * @param size the number of elements to compare (default is 16). + */ + public Comparator<String> getComparator(int size) { + return new LinguisticSort.CollatingComparator(getCollator(), size); + } + + /** + * A String comparator that uses the current collation + */ + static class CollatingComparator implements Comparator<String> { + private final Collator collator; + private final Map<String, CollationKey> cKeyMap; + + CollatingComparator(Collator collator) { + this(collator, 16); + } + + CollatingComparator(Collator collator, int defaultSize) { + this.collator = collator; + cKeyMap = new HashMap<>(defaultSize); + } + + @SuppressWarnings( + value = "ES_COMPARING_PARAMETER_STRING_WITH_EQ", + justification = "Reference comparison used for performance improvement.") + public int compare(String o1, String o2) { + if (o1 == o2) { + return 0; + } else if (o2 == null) { + return 1; + } else if (o1 == null) { + return -1; + } + + return getCollationKey(o1).compareTo(getCollationKey(o2)); + } + + private CollationKey getCollationKey(String comp) { + CollationKey key = cKeyMap.get(comp); + if (key == null) { + key = collator.getCollationKey(comp); + cKeyMap.put(comp, key); + } + return key; + } + } + + /** + * Returns the number of letters to show in the rolodex. 
+ */ + public int getAlphabetLength() { + return this.alphabet.length; + } + + /** + * Returns the n-th of letter in the rolodex. Note, a 'letter' + * in a language be composed of more than one unicode characters, + * for example, letter 'ch' in Czech. + */ + public String getAlphabet(int index) { + return this.alphabet[index]; + } + + // Used only for test code + String[] getAlphabet() { + return this.alphabet; + } + + /** + * Return the rolodexIndex for a string. + * + * @param searchTerm Must be a 1-char string + * @return the rolodexIndex, including Other (i.e. getAlphabetLength) if it doesn't + * fall into a bucket. If this language doesn't have a rolodex (e.g. Arabic, + * Latvian, etc.) return -1 + * @throws IllegalArgumentException if the string is null or not of length 1 + */ + public int getRolodexIndexForChar(String searchTerm) { + if (searchTerm == null || searchTerm.length() != 1) { + throw new IllegalArgumentException("Must be a one-length string"); + } + + if (this.getAlphabetLength() == 0) { + return -1; + } + + for (int i = 0; i < this.getAlphabetLength(); i++) { + int comparison = this.collator.compare(searchTerm, this.getAlphabet(i)); + + if (comparison < 0) { + //If it's less than 'a', return Other + //Otherwise, it's less than the current index, but it wasn't 0 on the + // previous comparison, so return the previous rolodex letter. + return i == 0 ? this.getAlphabetLength() : (i - 1); + } else if (comparison == 0) { + return i; + } + } + return this.getAlphabetLength(); + } + + /** + * Returns the sql expression to convert the given sql expression to upper case. + */ + public String getUpperCaseSql(String expr, boolean isPostgres) { + if (isPostgres) { + return "icu_upper(" + expr + ",'" + this.locale.toString() + "')"; + } else { + return upper.getSql(expr); + } + } + + /** + * @return true if sql UPPER() is used in getUpperCaseSql(). 
Note that this is always false + * for postgres because postgres always use the icu_upper() function for all languages. + */ + public boolean usesUpperToGetUpperCase(boolean isPostgres) { + return !isPostgres && "upper(x)".equals(upper.getSql("x")); + } + + /** + * Returns the upper case value of the given value, or what would be the result + * of applying the sql expression in getUpperCaseSql() to the given value. + */ + public String getUpperCaseValue(String value, boolean isPostgres) { + String singleWidth = value; + if (this.hasDoubleWidth) { + singleWidth = toSingleWidth(value); + } + if (isPostgres) { + return singleWidth.toUpperCase(this.locale); + } else { + return upper.toUpperCase(singleWidth); + } + } + + private static final char[][] DOUBLE_TO_SINGLE = new char[256][]; + static { + DOUBLE_TO_SINGLE[0x20] = new char[256]; + DOUBLE_TO_SINGLE[0x20][0x18] = '`'; + DOUBLE_TO_SINGLE[0x20][0x19] = '\''; + DOUBLE_TO_SINGLE[0x20][0x1D] = '"'; + + DOUBLE_TO_SINGLE[0x22] = new char[256]; + DOUBLE_TO_SINGLE[0x22][0x3C] = '~'; + + DOUBLE_TO_SINGLE[0x30] = new char[256]; + DOUBLE_TO_SINGLE[0x30][0x00] = ' '; + + DOUBLE_TO_SINGLE[0xFE] = new char[256]; + DOUBLE_TO_SINGLE[0xFE][0x3F] = '^'; + + DOUBLE_TO_SINGLE[0xFF] = new char[256]; + DOUBLE_TO_SINGLE[0xFF][0x01] = '!'; + DOUBLE_TO_SINGLE[0xFF][0x03] = '#'; + DOUBLE_TO_SINGLE[0xFF][0x04] = '$'; + DOUBLE_TO_SINGLE[0xFF][0x05] = '%'; + DOUBLE_TO_SINGLE[0xFF][0x06] = '&'; + DOUBLE_TO_SINGLE[0xFF][0x08] = '('; + DOUBLE_TO_SINGLE[0xFF][0x09] = ')'; + DOUBLE_TO_SINGLE[0xFF][0x0A] = '*'; + DOUBLE_TO_SINGLE[0xFF][0x0B] = '+'; + DOUBLE_TO_SINGLE[0xFF][0x0C] = ','; + DOUBLE_TO_SINGLE[0xFF][0x0D] = '-'; + DOUBLE_TO_SINGLE[0xFF][0x0E] = '.'; + DOUBLE_TO_SINGLE[0xFF][0x0F] = '/'; + DOUBLE_TO_SINGLE[0xFF][0x10] = '0'; + DOUBLE_TO_SINGLE[0xFF][0x11] = '1'; + DOUBLE_TO_SINGLE[0xFF][0x12] = '2'; + DOUBLE_TO_SINGLE[0xFF][0x13] = '3'; + DOUBLE_TO_SINGLE[0xFF][0x14] = '4'; + DOUBLE_TO_SINGLE[0xFF][0x15] = '5'; + 
DOUBLE_TO_SINGLE[0xFF][0x16] = '6'; + DOUBLE_TO_SINGLE[0xFF][0x17] = '7'; + DOUBLE_TO_SINGLE[0xFF][0x18] = '8'; + DOUBLE_TO_SINGLE[0xFF][0x19] = '9'; + DOUBLE_TO_SINGLE[0xFF][0x1A] = ':'; + DOUBLE_TO_SINGLE[0xFF][0x1B] = ';'; + DOUBLE_TO_SINGLE[0xFF][0x1C] = '<'; + DOUBLE_TO_SINGLE[0xFF][0x1D] = '='; + DOUBLE_TO_SINGLE[0xFF][0x1E] = '>'; + DOUBLE_TO_SINGLE[0xFF][0x1F] = '?'; + DOUBLE_TO_SINGLE[0xFF][0x20] = '@'; + DOUBLE_TO_SINGLE[0xFF][0x21] = 'A'; + DOUBLE_TO_SINGLE[0xFF][0x22] = 'B'; + DOUBLE_TO_SINGLE[0xFF][0x23] = 'C'; + DOUBLE_TO_SINGLE[0xFF][0x24] = 'D'; + DOUBLE_TO_SINGLE[0xFF][0x25] = 'E'; + DOUBLE_TO_SINGLE[0xFF][0x26] = 'F'; + DOUBLE_TO_SINGLE[0xFF][0x27] = 'G'; + DOUBLE_TO_SINGLE[0xFF][0x28] = 'H'; + DOUBLE_TO_SINGLE[0xFF][0x29] = 'I'; + DOUBLE_TO_SINGLE[0xFF][0x2A] = 'J'; + DOUBLE_TO_SINGLE[0xFF][0x2B] = 'K'; + DOUBLE_TO_SINGLE[0xFF][0x2C] = 'L'; + DOUBLE_TO_SINGLE[0xFF][0x2D] = 'M'; + DOUBLE_TO_SINGLE[0xFF][0x2E] = 'N'; + DOUBLE_TO_SINGLE[0xFF][0x2F] = 'O'; + DOUBLE_TO_SINGLE[0xFF][0x30] = 'P'; + DOUBLE_TO_SINGLE[0xFF][0x31] = 'Q'; + DOUBLE_TO_SINGLE[0xFF][0x32] = 'R'; + DOUBLE_TO_SINGLE[0xFF][0x33] = 'S'; + DOUBLE_TO_SINGLE[0xFF][0x34] = 'T'; + DOUBLE_TO_SINGLE[0xFF][0x35] = 'U'; + DOUBLE_TO_SINGLE[0xFF][0x36] = 'V'; + DOUBLE_TO_SINGLE[0xFF][0x37] = 'W'; + DOUBLE_TO_SINGLE[0xFF][0x38] = 'X'; + DOUBLE_TO_SINGLE[0xFF][0x39] = 'Y'; + DOUBLE_TO_SINGLE[0xFF][0x3A] = 'Z'; + DOUBLE_TO_SINGLE[0xFF][0x3B] = '['; + DOUBLE_TO_SINGLE[0xFF][0x3C] = '\\'; + DOUBLE_TO_SINGLE[0xFF][0x3D] = ']'; + DOUBLE_TO_SINGLE[0xFF][0x3F] = '_'; + DOUBLE_TO_SINGLE[0xFF][0x41] = 'a'; + DOUBLE_TO_SINGLE[0xFF][0x42] = 'b'; + DOUBLE_TO_SINGLE[0xFF][0x43] = 'c'; + DOUBLE_TO_SINGLE[0xFF][0x44] = 'd'; + DOUBLE_TO_SINGLE[0xFF][0x45] = 'e'; + DOUBLE_TO_SINGLE[0xFF][0x46] = 'f'; + DOUBLE_TO_SINGLE[0xFF][0x47] = 'g'; + DOUBLE_TO_SINGLE[0xFF][0x48] = 'h'; + DOUBLE_TO_SINGLE[0xFF][0x49] = 'i'; + DOUBLE_TO_SINGLE[0xFF][0x4A] = 'j'; + DOUBLE_TO_SINGLE[0xFF][0x4B] = 'k'; + 
DOUBLE_TO_SINGLE[0xFF][0x4C] = 'l'; + DOUBLE_TO_SINGLE[0xFF][0x4D] = 'm'; + DOUBLE_TO_SINGLE[0xFF][0x4E] = 'n'; + DOUBLE_TO_SINGLE[0xFF][0x4F] = 'o'; + DOUBLE_TO_SINGLE[0xFF][0x50] = 'p'; + DOUBLE_TO_SINGLE[0xFF][0x51] = 'q'; + DOUBLE_TO_SINGLE[0xFF][0x52] = 'r'; + DOUBLE_TO_SINGLE[0xFF][0x53] = 's'; + DOUBLE_TO_SINGLE[0xFF][0x54] = 't'; + DOUBLE_TO_SINGLE[0xFF][0x55] = 'u'; + DOUBLE_TO_SINGLE[0xFF][0x56] = 'v'; + DOUBLE_TO_SINGLE[0xFF][0x57] = 'w'; + DOUBLE_TO_SINGLE[0xFF][0x58] = 'x'; + DOUBLE_TO_SINGLE[0xFF][0x59] = 'y'; + DOUBLE_TO_SINGLE[0xFF][0x5A] = 'z'; + DOUBLE_TO_SINGLE[0xFF][0x5B] = '{'; + DOUBLE_TO_SINGLE[0xFF][0x5C] = '|'; + DOUBLE_TO_SINGLE[0xFF][0x5D] = '}'; + } + + public static char toSingleWidth(char c) { + // The high byte of the char selects the bucket in char[][] + char[] cBucket = DOUBLE_TO_SINGLE[c >> 8]; + // If no bucket, then no translation so just use original char + if (cBucket == null) { + return c; + } + // The low byte of the char is the index into that bucket + char cSingle = cBucket[c & 0x00ff]; + // If char at that index is zero, then no translation so just use original char + if (cSingle == 0) { + return c; + } + return cSingle; + } + + /** + * Convert double width ascii characters to single width. + * This is the equivalent of Oracle's to_single_byte(). + */ + public static String toSingleWidth(String value) { + int n = value.length(); + DeferredStringBuilder buf = new DeferredStringBuilder(value); + + for (int i = 0; i < n; i++) { + char c = value.charAt(i); + buf.append(toSingleWidth(c)); + } + return buf.toString(); + } + + /** + * Returns the sql expression to compute the linguistic sort collation key for the + * given sql expression. This supports sorting in the database, where sort order + * of different upper and lower cases are handled linguistically.
+ */ + public String getCollationKeySql(String expr, boolean isPostgres) { + if (isPostgres) { + return "icu_sortkey(" + expr + ",'" + this.locale.toString() + "')::text"; + } else { + return MessageFormat.format(this.collationKeySql, new Object[] { expr }); + } + } + + /** + * Returns the sql expression to compute the linguistic sort collation key for the + * upper case of given sql expression. This supports case-insensitive filtering + * in the database. + */ + public String getUpperCollationKeySql(String expr, boolean isPostgres) { + if (!isPostgres && String.format(upper.getSqlFormatString(), "{0}") + .equals(this.collationKeySql)) { + return getCollationKeySql(expr, false); + } + return getCollationKeySql(getUpperCaseSql(expr, isPostgres), isPostgres); + } + + private String formatLetter(String letter, boolean isPostgres) { + return getCollationKeySql('\'' + letter + '\'', isPostgres); + } + + // + // Private Data + // + + // TODO: Make this an environment variable. + private static final boolean IS_MULTI_LINGUAL = true; /*(SfdcEnvProvider.getEnv() == null || + SfdcEnvProvider.getEnv().getIniFile().getString("Pages", "encoding").length() > 0);*/ + + static String[] getAlphabetFromICU(Locale locale) { + AlphabeticIndex<?> index = new AlphabeticIndex<String>(locale); + List<String> alphabet = index.getBucketLabels(); + if (alphabet.size() > 6) { + // Strip off first and last (which are ...) 
+ List<String> alphabetWithoutEllipses = alphabet.subList(1, alphabet.size() - 1); + return alphabetWithoutEllipses.toArray(new String[alphabetWithoutEllipses.size()]); + } else { + return new String[0]; + } + } + + /** + * You can't refer to a static defined inside the same class as an enum, so you need an + * inner class to have such constants + * These are the alphabets that cannot be auto-derived from ICU's CLDR information + */ + static final class Alphabets { + static final String[] ENGLISH = { "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", + "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z" }; + static final String[] CATALAN = { "A", "B", "C", "\u00C7", "D", "E", "F", "G", "H", "I", + "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z" }; + static final String[] BASQUE = { "A", "B", "C", "\u00C7", "D", "E", "F", "G", "H", "I", + "J", "K", "L", "M", "N", "\u00D1", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", + "Y", "Z" }; + static final String[] JAPANESE = { "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", + "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "\u30A2", + "\u30AB", "\u30B5", "\u30BF", "\u30CA", "\u30CF", "\u30DE", "\u30E4", "\u30E9", + "\u30EF" }; + + // A, B, C, Cs, D, E, F, G, Gy, H, I, J, K, L, Ly, M, N, Ny, O, Ö, P, Q, R, S, Sz, T, + // Ty, U, Ü, V, W, X, Y, Z, Zs + static final String[] HUNGARIAN = { "A", "B", "C", "Cs", "D", "E", "F", "G", "Gy", "H", + "I", "J", "K", "L", "Ly", "M", "N", "Ny", "O", "\u00d6", "P", "Q", "R", "S", "Sz", + "T", "Ty", "U", "\u00dc", "V", "W", "X", "Y", "Z", "Zs" }; + + static final String[] TURKISH = { "A", "B", "C", "\u00C7", "D", "E", "F", "G", "\u011E", + "H", "I", "\u0130", "J", "K", "L", "M", "N", "O", "\u00D6", "P", "R", "S", "\u015E", + "T", "U", "\u00DC", "V", "Y", "Z" }; + + // A, B, C, Ç, D, E, Ə, F, G, Ğ, H, X, I, İ, J, K, Q, L, M, N, O, Ö, P, R, S, Ş, T, + // U, Ü, V, Y, Z + static final String[] 
AZERBAIJANI = { "A", "B", "C", "\u00C7", "D", "E", "\u018F", "F", + "G", "\u011E", "H", "X", "I", "\u0130", "J", "K", "Q", "L", "M", "N", "O", "\u00D6", + "P", "R", "S", "\u015E", "T", "U", "\u00DC", "V", "Y", "Z" }; + + // Russian without Ё, Ы, Э + static final String[] BULGARIAN = { "\u0410", "\u0411", "\u0412", "\u0413", "\u0414", + "\u0415", "\u0416", "\u0417", "\u0418", "\u0419", "\u041a", "\u041b", "\u041c", + "\u041d", "\u041e", "\u041f", "\u0420", "\u0421", "\u0422", "\u0423", "\u0424", + "\u0425", "\u0426", "\u0427", "\u0428", "\u0429", "\u042a", "\u042c", "\u042e", + "\u042f" }; + + // A B C Č Ć D Đ Dž E F G H I J K L Lj M N Nj O P R S Š T U V Z Ž + static final String[] SERBIAN_LATIN = { "A", "B", "C", "\u010c", "\u0106", "D", "\u0110", + "D\u017e", "E", "F", "G", "H", "I", "J", "K", "L", "Lj", "M", "N", "Nj", "O", "P", "R", + "S", "\u0160", "T", "U", "V", "Z", "\u017d" }; + + // A Á Ä B C Č D Ď DZ DŽ E É F G H CH I Í J K L Ĺ Ľ M N Ň O Ó Ô P Q R Ŕ S Š T Ť U Ú V W + // X Y Ý Z Ž + static final String[] SLOVAK = { "A", "\u00c1", "\u00c4", "B", "C", "\u010c", "D", + "\u010e", "DZ", "D\u017d", "E", "\u00c9", "F", "G", "H", "CH", "I", "\u00cd", "J", + "K", "L", "\u0139", "\u013d", "M", "N", "\u0147", "O", "\u00d3", "\u00d4", "P", "Q", + "R", "\u0154", "S", "\u0160", "T", "\u0164", "U", "\u00da", "V", "W", "X", "Y", + "\u00dd", "Z", "\u017d" }; + + // ა ბ გ დ ე ვ ზ თ ი კ ლ მ ნ ო პ ჟ რ ს ტ უ ფ ქ ღ .ყ შ ჩ ც ძ წ ჭ ხ ჯ ჰ + static final String[] GEORGIAN = { "\u10d0", "\u10d1", "\u10d2", "\u10d3", "\u10d4", + "\u10d5", "\u10d6", "\u10d7", "\u10d8", "\u10d9", "\u10da", "\u10db", "\u10dc", + "\u10dd", "\u10de", "\u10df", "\u10e0", "\u10e1", "\u10e2", "\u10e3", "\u10e4", + "\u10e5", "\u10e6", "\u10e7", "\u10e8", "\u10e9", "\u10ea", "\u10eb", "\u10ec", + "\u10ed", "\u10ee", "\u10ef", "\u10f0" }; + + // A B C D E F G H I J K L M N O P Q R S Š Z Ž T U V W Õ Ä Ö Ü X Y + static final String[] ESTONIAN = { "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", + "L", "M", 
"N", "O", "P", "Q", "R", "S", "\u0160", "Z", "\u017d", "T", "U", "V", "W", + "\u00d5", "\u00c4", "\u00d6", "\u00dc", "X", "Y" }; + + // A Á B D Ð E É F G H I Í J K L M N O Ó P R S T U Ú V X Y Ý Þ Æ Ö + static final String[] ICELANDIC = { "A", "\u00c1", "B", "D", "\u00d0", "E", "\u00c9", "F", + "G", "H", "I", "\u00cd", "J", "K", "L", "M", "N", "O", "\u00d3", "P", "R", "S", "T", + "U", "\u00da", "V", "X", "Y", "\u00dd", "\u00de", "\u00c6", "\u00d6" }; + + // A Ā B C Č D E Ē F G Ģ H I Ī J K Ķ L Ļ M N Ņ O P R S Š T U Ū V Z Ž + static final String[] LATVIAN = { "A", "\u0100", "B", "C", "\u010c", "D", "E", "\u0112", + "F", "G", "\u0122", "H", "I", "\u012a", "J", "K", "\u0136", "L", "\u013b", "M", "N", + "\u0145", "O", "P", "R", "S", "\u0160", "T", "U", "\u016a", "V", "Z", "\u017d" }; + + // A \u0104 B C \u010c D E \u0118 \u0116 F G H I \u012e Y J K L M N O P R S \u0160 T U + // \u0172 \u016a V Z \u017d + static final String[] LUXEMBOURGISH = { "A", "B", "C", "D", "E", "F", "G", "H", "I", + "J", "K", "L", "M", "N", "O", "P", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", + "Ä", "Ë", "É" }; + + // Russian with Ң, Ө, Ү + static final String[] KYRGYZ = { "\u0410", "\u0411", "\u0412", "\u0413", "\u0414", + "\u0415", "\u0401", "\u0416", "\u0417", "\u0418", "\u0419", "\u041a", "\u041b", + "\u041c", "\u041d", "\u04a2", "\u041e", "\u04e8", "\u041f", "\u0420", "\u0421", + "\u0422", "\u0423", "\u04ae", "\u0424", "\u0425", "\u0426", "\u0427", "\u0428", + "\u0429", "\u042a", "\u042b", "\u042c", "\u042d", "\u042e", "\u042f" }; + + // Kyrgyz with Ә, Ғ, Ұ, Һ, І (ICU4J doesn't have some of these characters for sorting...) 
+ static final String[] KAZAKH = { "\u0410", "\u04d8", "\u0411", "\u0412", "\u0413", + "\u0492", "\u0414", "\u0415", "\u0401", "\u0416", "\u0417", "\u0418", "\u0419", + "\u041a", "\u049a", "\u041b", "\u041c", "\u041d", "\u04a2", "\u041e", "\u04e8", + "\u041f", "\u0420", "\u0421", "\u0422", "\u0423", "\u04b0", "\u04ae", "\u0424", + "\u0425", "\u04ba", "\u0426", "\u0427", "\u0428", "\u0429", "\u042a", "\u042b", + "\u0406", "\u042c", "\u042d", "\u042e", "\u042f" }; + + // Cyrillic Variant + static final String[] TAJIK = { "\u0410", "\u0411", "\u0412", "\u0413", "\u0492", "\u0414", + "\u0415", "\u0401", "\u0416", "\u0417", "\u0418", "\u04e2", "\u0419", "\u041a", + "\u049a", "\u041b", "\u041c", "\u041d", "\u041e", "\u041f", "\u0420", "\u0421", + "\u0422", "\u0423", "\u04ee", "\u0424", "\u0425", "\u04b2", "\u0427", "\u04b6", + "\u0428", "\u042a", "\u042d", "\u042e", "\u042f" }; + + // اآبپتٹثجچحخدڈذرڑزژسشصضطظعغفقکگلمنوەھ۶ىے + static final String[] URDU = new String[] {"\u0627", "\u0622", "\u0628", "\u067e", + "\u062a", "\u0679", "\u062b", "\u062c", "\u0686", "\u062d", "\u062e", "\u062f", + "\u0688", "\u0630", "\u0631", "\u0691", "\u0632", "\u0698", "\u0633", "\u0634", + "\u0635", "\u0636", "\u0637", "\u0638", "\u0639", "\u063a", "\u0641", "\u0642", + "\u06a9", "\u06af", "\u0644", "\u0645", "\u0646", "\u0648", "\u06d5", "\u06be", + "\u06f6", "\u0649", "\u06d2" }; + + // W-1308726: removed Ö and Ü; oracle treats them as the same characters as O and U. 
+ // A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, ß, T, U, V, W, X, Y, Z + static final String[] GERMAN = { "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", + "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z" }; + + // ক,খ,গ,ঘ,ঙ,চ,ছ,জ,ঝ,ঞ,ট,ঠ,ড,ঢ,ণ,ত,দ,ধ,ন,প,ফ,ব,ভ,ম,য,র,ল,শ,ষ,স,হ,য়,ড়,ঢ,অ, + // আ,ই,ঈ,উ,ঊ,ঋ,ৠ,এ,ঐ,ও,ঔ + static final String[] BENGALI = { "\u0995", "\u0996", "\u0997", "\u0998", "\u0999", + "\u099a", "\u099b", "\u099c", "\u099d", "\u099e", "\u099f", "\u09a0", "\u09a1", + "\u09a2", "\u09a3", "\u09a4", "\u09a6", "\u09a7", "\u09a8", "\u09aa", "\u09ab", + "\u09ac", "\u09ad", "\u09ae", "\u09af", "\u09b0", "\u09b2", "\u09b6", "\u09b7", + "\u09b8", "\u09b9", "\u09af\u09bc", "\u09a1\u09bc", "\u09a2", "\u0985", "\u0986", + "\u0987", "\u0988", "\u0989", "\u098a", "\u098b", "\u09e0", "\u098f", "\u0990", + "\u0993", "\u0994" }; + + // A, Ą, B, C, Č, D, E, Ę, Ė, F, G, H, I, Į, Y, J, K, L, M, N, O, P, R, S, Š, T, U, Ų, + // Ū, V, Z, Ž + static final String[] LITHUANIAN = { "A", "\u0104", "B", "C", "\u010c", "D", "E", "\u0118", + "\u0116", "F", "G", "H", "I", "\u012e", "Y", "J", "K", "L", "M", "N", "O", "P", "R", + "S", "\u0160", "T", "U", "\u0172", "\u016a", "V", "Z", "\u017d" }; + + // A, B, C, Č, D, E, F, G, H, I, J, K, L, M, N, O, P, R, S, Š, T, U, V, Z, Ž + static final String[] SLOVENE = { "A", "B", "C", "\u010c", "D", "E", "F", "G", "H", "I", + "J", "K", "L", "M", "N", "O", "P", "R", "S", "\u0160", "T", "U", "V", "Z", "\u017d" }; + + // Contains "TAMIL LETTER"s from http://www.unicode.org/charts/PDF/U0B80.pdf + //அ, ஆ, இ, ஈ, உ, ஊ, எ, ஏ, ஐ, ஒ, ஓ, ஔ, க, ங, ச, ஜ, ஞ, + //ட, ண, த, ந, ன, ப, ம, ய, ர, ற, ல, ள, ழ, வ, ஶ, ஷ, ஸ, ஹ + static final String[] TAMIL = { "\u0B85", "\u0B86", "\u0B87", "\u0B88", "\u0B89", "\u0B8A", + "\u0B8E", "\u0B8F", "\u0B90", "\u0B92", "\u0B93", "\u0B94", "\u0B95", "\u0B99", + "\u0B9A", "\u0B9C", "\u0B9E", "\u0B9F", "\u0BA3", "\u0BA4", "\u0BA8", "\u0BA9", + "\u0BAA", "\u0BAE", "\u0BAF", "\u0BB0", 
"\u0BB1", "\u0BB2", "\u0BB3", "\u0BB4", + "\u0BB5", "\u0BB6", "\u0BB7", "\u0BB8", "\u0BB9" }; + + static final String STRING = "upper({0})"; + + static final String[] JAPANESE_ROLODEX = { + // Notes: unistr('\xxxx') is the Oracle sql expression to get unicode + // character by code point. + // Two backslashes are converted to one backslash by java compiler. + /* 'A' */"unistr('\\3041')", + /* 'Ka' */"unistr('\\30F5')", + /* 'Sa' */"unistr('\\3055')", + /* 'Ta' */"unistr('\\305F')", + /* 'Na' */"unistr('\\306A')", + /* 'Ha' */"unistr('\\306F')", + /* 'Ma' */"unistr('\\307E')", + /* 'Ya' */"unistr('\\3084')", + /* 'Ra' */"unistr('\\3089')", + /* 'Wa' */"unistr('\\308E')", "unistr('\\309D')" }; + + // Notes: unistr('\xxxx') is the Oracle sql expression to get unicode character + // by code point. Two backslashes are converted to one backslash by java compiler. + static final String[] JAPANESE_ROLODEX_JAVA = { + /* 'A' */"\u3041", + /* 'Ka' */"\u30F5", + /* 'Sa' */"\u3055", + /* 'Ta' */"\u305F", + /* 'Na" */"\u306A", + /* 'Ha' */"\u306F", + /* 'Ma' */"\u307E", + /* 'Ya' */"\u3084", + /* 'Ra' */"\u3089", + /* 'Wa' */"\u308E", + "\u3001" // this is the first character after the last valid kana in java + }; + } + + /** + * Apex and possibly other things collate based on upper case versions of strings. + * Always upper casing and then comparing is slow, though, so this method is intended + * to return a collator that is consistent with uppper-case-then-compare while perhaps + * doing something more efficient + */ + public Collator getUpperCaseCollator(final boolean isPostgres) { + final Collator innerCollator = getCollator(); + + // so far, the best I've been able to do that doesn't break sort order is to special + // case the english locale and scan for non-ascii characters before deciding how to + // proceed. With some work the same basic idea would work in many other locales but + // it would be very nice to find a more general and faster approach. 
The challenge + // is that upper casing effectively "normalizes" strings in a way that is very hard + // to replicate - for instance, western ligatures tend to get expanded by upper casing + // but Hangul ones don't. Even when that's all sorted out there's the issue that the + // built in collation rules for various locales are fairly narrowly focused. So, for + // instance, the English locale doesn't have rules for sorting Greek. With a case + // insensitive compare in the English locale, lower case Greek letters sort + // differently from upper case Greek letters but the English locale does upper case + // Greek letters. + if (!isPostgres && Locale.ENGLISH.equals(getLocale())) { + innerCollator.setStrength(Collator.SECONDARY); + return new Collator() { + @Override + public int compare(String source, String target) { + // upper case only strings where the SECONDARY strength comparison + // (case insensitive comparison) is possibly different for upper + // cased and non upper cased strings + return innerCollator.compare(getUpperCaseIfNeeded(source), + getUpperCaseIfNeeded(target)); + } + + /** + * Upper cases on any non-ascii character + */ + private String getUpperCaseIfNeeded(String string) { + for (int i = 0; i < string.length(); i++) { + final char ch = string.charAt(i); + if (ch > 127) { + // non-ascii character, bail and use the upper case version + return getUpperCaseValue(string, false); + } + } + // no non-ascii characters found, we don't need to upper case + // - sorting with strength SECONDARY is equivalent.
+ return string; + } + + @Override + public CollationKey getCollationKey(String source) { + return innerCollator.getCollationKey(getUpperCaseIfNeeded(source)); + } + + @Override + public int hashCode() { + return LinguisticSort.this.hashCode(); + } + + @Override + public boolean equals(Object that) { + return super.equals(that); + } + }; + } else { + return new Collator() { + @Override + public int compare(String source, String target) { + return innerCollator.compare(getUpperCaseValue(source, isPostgres), + getUpperCaseValue(target, isPostgres)); + } + + @Override + public CollationKey getCollationKey(String source) { + return innerCollator.getCollationKey(getUpperCaseValue(source, isPostgres)); + } + + @Override + public int hashCode() { + return LinguisticSort.this.hashCode(); + } + + @Override + public boolean equals(Object that) { + return super.equals(that); + } + }; + } + } +} diff --git a/phoenix-core/src/main/java/org/apache/phoenix/util/i18n/LocaleUtils.java b/phoenix-core/src/main/java/org/apache/phoenix/util/i18n/LocaleUtils.java new file mode 100644 index 0000000000..b07e5b6620 --- /dev/null +++ b/phoenix-core/src/main/java/org/apache/phoenix/util/i18n/LocaleUtils.java @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.phoenix.util.i18n; + +import java.util.List; +import java.util.Locale; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; + +import org.apache.phoenix.thirdparty.com.google.common.base.Splitter; +import org.apache.phoenix.thirdparty.com.google.common.collect.Lists; + +/** + * This utility class was partially copied from Salesforce's internationalization utility library + * (com.salesforce.i18n:i18n-util:1.0.4), which was released under the 3-clause BSD License. + * The i18n-util library is not maintained anymore, and it was using vulnerable dependencies. + * For more info, see: https://issues.apache.org/jira/browse/PHOENIX-6818 + * + * A collection of utilities for dealing with Locales. + */ +public enum LocaleUtils { + INSTANCE; + + public static LocaleUtils get() { + return INSTANCE; + } + + // TODO: The number of locales in the system is rather small, + // but we should probably use a ConcurrentLruMap just in case. + private static final ConcurrentMap<Locale, Locale> UNIQUE_LOCALE_MAP = + new ConcurrentHashMap<>(64, .75f, 2); + + /** + * Returns a locale for language-only ("en"), language/country ("en_GB") or + * underscore-separated language_country_variant iso codes; null input yields null. + */ + public Locale getLocaleByIsoCode(String isoCode) { + if (isoCode == null) { + return null; + } + if (isoCode.length() == 2) { + return uniqueifyLocale(new Locale(isoCode)); + } else if (isoCode.length() == 5) { + String countryIsoCode = isoCode.substring(3, 5); + String langIsoCode = isoCode.substring(0, 2); + return uniqueifyLocale(new Locale(langIsoCode, countryIsoCode)); + } else { + List<String> split = Lists.newArrayList(Splitter.on('_').split(isoCode)); + String language = split.get(0); + String country = split.size() > 1 ? split.get(1) : ""; + String variant = split.size() > 2 ?
split.get(2) : ""; + return uniqueifyLocale(new Locale(language, country, variant)); + } + } + + /** + * Canonicalizes a locale so that equal locales share a single cached instance. + * @param value the locale to uniquify, may be null + * @return the canonical instance equal to value, or null if value is null + */ + static Locale uniqueifyLocale(Locale value) { + if (value == null) { + return null; + } + Locale oldValue = UNIQUE_LOCALE_MAP.get(value); + if (oldValue == null) { + // Atomic insert; if another thread won the race, adopt its instance + oldValue = UNIQUE_LOCALE_MAP.putIfAbsent(value, value); + } + return oldValue != null ? oldValue : value; + } +} diff --git a/phoenix-core/src/main/java/org/apache/phoenix/util/i18n/OracleUpper.java b/phoenix-core/src/main/java/org/apache/phoenix/util/i18n/OracleUpper.java new file mode 100644 index 0000000000..128990d180 --- /dev/null +++ b/phoenix-core/src/main/java/org/apache/phoenix/util/i18n/OracleUpper.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.phoenix.util.i18n; + +import org.apache.commons.lang3.StringUtils; + +/** + * This utility class was partially copied from Salesforce's internationalization utility library + * (com.salesforce.i18n:i18n-util:1.0.4), which was released under the 3-clause BSD License.
+ * The i18n-util library is not maintained anymore, and it was using vulnerable dependencies. + * For more info, see: https://issues.apache.org/jira/browse/PHOENIX-6818 + * + * OracleUpper is used in combination with OracleUpperTable to generate upper-case output + * consistent with particular chosen Oracle expressions. + * + * @see OracleUpperTable + */ +public class OracleUpper { + + private OracleUpper() { + // HideUtilityClassConstructor + } + + /** + * Upper-case {@code value}, using the information in {@code t} to produce a result + * consistent with the PL/SQL expression used to generate t. + */ + public static String toUpperCase(OracleUpperTable t, String value) { + // Oracle's upper or nls_upper are known to disagree with Java on some particulars. + // We search for known exceptional characters and if found take measures to adjust + // Java's String.toUpperCase. In the average case we incur just a single relatively + // fast scan of the string. In typical bad cases we'll incur two extra String copies + // (one copy into the buffer, one out -- this on top of whatever's required by + // toUpperCase). Note that we have to match Oracle even for characters outside the + // language's alphabet since we still want to return records containing those characters. + char[] exceptions = t.getUpperCaseExceptions(); + if (exceptions.length > 0) { + // Prefer to use String.indexOf in the case of a single search char; it's faster by + // virtue of not requiring two loops and being intrinsic. + int nextExceptionIndex = (exceptions.length == 1) + ? value.indexOf(exceptions[0]) : StringUtils.indexOfAny(value, exceptions); + + if (nextExceptionIndex >= 0) { + // Annoying case: we have found a character that we know Oracle handles differently + // than Java and we must adjust appropriately.
+ StringBuilder result = new StringBuilder(value.length()); + String rem = value; + do { + char nextException = rem.charAt(nextExceptionIndex); + + result.append(rem.substring(0, nextExceptionIndex).toUpperCase(t.getLocale())); + result.append(t.getUpperCaseExceptionMapping(nextException)); + + rem = rem.substring(nextExceptionIndex + 1); + nextExceptionIndex = (exceptions.length == 1) + ? rem.indexOf(exceptions[0]) : StringUtils.indexOfAny(rem, exceptions); + } while (nextExceptionIndex >= 0); + result.append(rem.toUpperCase(t.getLocale())); + + return result.toString(); + } + } + + // Nice case: we know of no reason that Oracle and Java wouldn't agree when converting + // to upper case. + return value.toUpperCase(t.getLocale()); + } +} diff --git a/phoenix-core/src/main/java/org/apache/phoenix/util/i18n/OracleUpperTable.java b/phoenix-core/src/main/java/org/apache/phoenix/util/i18n/OracleUpperTable.java new file mode 100644 index 0000000000..b453a1bbd5 --- /dev/null +++ b/phoenix-core/src/main/java/org/apache/phoenix/util/i18n/OracleUpperTable.java @@ -0,0 +1,337 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.phoenix.util.i18n; + +import java.util.Locale; + +import edu.umd.cs.findbugs.annotations.SuppressWarnings; + +/** + * This utility class was partially copied from Salesforce's internationalization utility library + * (com.salesforce.i18n:i18n-util:1.0.4), which was released under the 3-clause BSD License. + * The i18n-util library is not maintained anymore, and it was using vulnerable dependencies. + * For more info, see: https://issues.apache.org/jira/browse/PHOENIX-6818 + * + * Generated by i18n.OracleUpperTableGeneratorTest + * <p> + * An instance of this enum codifies the difference between executing a + * {@link #getSqlFormatString() particular PL/SQL expression} in Oracle and executing + * {@link String#toUpperCase(Locale)} for a {@link #getLocale() particular locale} in Java. These + * differences (also called exceptions) are expressed by the output of + * {@link #getUpperCaseExceptions()} and {@link #getUpperCaseExceptionMapping(char)}. + * <p> + * The tables are generated by testing a particular set of characters that are known to contain + * exceptions and {@link #toUpperCase(String) may be used} to compensate for exceptions found and + * generate output in Java that will be consistent with Oracle for the given (sql expression, + * locale) pair over all tested values. 
+ * <p> + * Characters tested: + * <ul> + * <li>U+0069 i</li> + * <li>U+00df ß</li> + * <li>U+0386 Ά</li> + * <li>U+0388 Έ</li> + * <li>U+0389 Ή</li> + * <li>U+038a Ί</li> + * <li>U+038c Ό</li> + * <li>U+038e Ύ</li> + * <li>U+038f Ώ</li> + * <li>U+03ac ά</li> + * <li>U+03ad έ</li> + * <li>U+03ae ή</li> + * <li>U+03af ί</li> + * <li>U+03cc ό</li> + * <li>U+03cd ύ</li> + * <li>U+03ce ώ</li> + * </ul> + * + * @see OracleUpper + */ +public enum OracleUpperTable { + ENGLISH("upper(%s)", "en", "ß"), + GERMAN("nls_upper(%s, 'nls_sort=xgerman')", "de", ""), + FRENCH("nls_upper(%s, 'nls_sort=xfrench')", "fr", "ß"), + ITALIAN("nls_upper(%s, 'nls_sort=italian')", "it", "ß"), + SPANISH("nls_upper(%s, 'nls_sort=spanish')", "es", "ß"), + CATALAN("nls_upper(%s, 'nls_sort=catalan')", "ca", "ß"), + DUTCH("nls_upper(%s, 'nls_sort=dutch')", "nl", "ß"), + PORTUGUESE("nls_upper(%s, 'nls_sort=west_european')", "pt", "ß"), + DANISH("nls_upper(%s, 'nls_sort=danish')", "da", "ß"), + NORWEGIAN("nls_upper(%s, 'nls_sort=norwegian')", "no", "ß"), + SWEDISH("nls_upper(%s, 'nls_sort=swedish')", "sv", "ß"), + FINNISH("nls_upper(%s, 'nls_sort=finnish')", "fi", "ß"), + CZECH("nls_upper(%s, 'nls_sort=xczech')", "cs", "ß"), + POLISH("nls_upper(%s, 'nls_sort=polish')", "pl", "ß"), + TURKISH("nls_upper(translate(%s,'i','İ'), 'nls_sort=xturkish')", "tr", "ß"), + CHINESE_HK("nls_upper(to_single_byte(%s), 'nls_sort=tchinese_radical_m')", "zh", ""), + CHINESE_HK_STROKE("nls_upper(to_single_byte(%s), 'nls_sort=tchinese_stroke_m')", "zh", ""), + CHINESE_TW("nls_upper(to_single_byte(%s), 'nls_sort=tchinese_radical_m')", "zh", ""), + CHINESE_TW_STROKE("nls_upper(to_single_byte(%s), 'nls_sort=tchinese_stroke_m')", "zh", ""), + CHINESE("nls_upper(to_single_byte(%s), 'nls_sort=schinese_radical_m')", "zh", ""), + CHINESE_STROKE("nls_upper(to_single_byte(%s), 'nls_sort=schinese_stroke_m')", "zh", ""), + CHINESE_PINYIN("nls_upper(to_single_byte(%s), 'nls_sort=schinese_pinyin_m')", "zh", ""), + 
JAPANESE("nls_upper(to_single_byte(%s), 'nls_sort=japanese_m')", "ja", ""), + KOREAN("nls_upper(to_single_byte(%s), 'nls_sort=korean_m')", "ko", ""), + RUSSIAN("nls_upper(%s, 'nls_sort=russian')", "ru", "ß"), + BULGARIAN("nls_upper(%s, 'nls_sort=bulgarian')", "bg", "ß"), + INDONESIAN("nls_upper(%s, 'nls_sort=indonesian')", "in", "ß"), + ROMANIAN("nls_upper(%s, 'nls_sort=romanian')", "ro", "ß"), + VIETNAMESE("nls_upper(%s, 'nls_sort=vietnamese')", "vi", "ß"), + UKRAINIAN("nls_upper(%s, 'nls_sort=ukrainian')", "uk", "ß"), + HUNGARIAN("nls_upper(%s, 'nls_sort=xhungarian')", "hu", ""), + GREEK("nls_upper(%s, 'nls_sort=greek')", "el", "ßΆΈΉΊΌΎΏάέήίόύώ"), + HEBREW("nls_upper(%s, 'nls_sort=hebrew')", "iw", "ß"), + SLOVAK("nls_upper(%s, 'nls_sort=slovak')", "sk", "ß"), + SERBIAN_CYRILLIC("nls_upper(%s, 'nls_sort=generic_m')", "sr", ""), + SERBIAN_LATIN("nls_upper(%s, 'nls_sort=xcroatian')", "sh", "ß"), + BOSNIAN("nls_upper(%s, 'nls_sort=xcroatian')", "bs", "ß"), + GEORGIAN("nls_upper(%s, 'nls_sort=binary')", "ka", "ß"), + BASQUE("nls_upper(%s, 'nls_sort=west_european')", "eu", "ß"), + MALTESE("nls_upper(%s, 'nls_sort=west_european')", "mt", "ß"), + ROMANSH("nls_upper(%s, 'nls_sort=west_european')", "rm", "ß"), + LUXEMBOURGISH("nls_upper(%s, 'nls_sort=west_european')", "lb", "ß"), + IRISH("nls_upper(%s, 'nls_sort=west_european')", "ga", "ß"), + SLOVENE("nls_upper(%s, 'nls_sort=xslovenian')", "sl", "ß"), + CROATIAN("nls_upper(%s, 'nls_sort=xcroatian')", "hr", "ß"), + MALAY("nls_upper(%s, 'nls_sort=malay')", "ms", "ß"), + ARABIC("nls_upper(%s, 'nls_sort=arabic')", "ar", "ß"), + ESTONIAN("nls_upper(%s, 'nls_sort=estonian')", "et", "ß"), + ICELANDIC("nls_upper(%s, 'nls_sort=icelandic')", "is", "ß"), + LATVIAN("nls_upper(%s, 'nls_sort=latvian')", "lv", "ß"), + LITHUANIAN("nls_upper(%s, 'nls_sort=lithuanian')", "lt", "ß"), + KYRGYZ("nls_upper(%s, 'nls_sort=binary')", "ky", "ß"), + KAZAKH("nls_upper(%s, 'nls_sort=binary')", "kk", "ß"), + TAJIK("nls_upper(%s, 'nls_sort=russian')", 
"tg", "ß"), + BELARUSIAN("nls_upper(%s, 'nls_sort=russian')", "be", "ß"), + TURKMEN("nls_upper(translate(%s,'i','İ'), 'nls_sort=xturkish')", "tk", "iß"), + AZERBAIJANI("nls_upper(translate(%s,'i','İ'), 'nls_sort=xturkish')", "az", "ß"), + ARMENIAN("nls_upper(%s, 'nls_sort=binary')", "hy", "ß"), + THAI("nls_upper(%s, 'nls_sort=thai_dictionary')", "th", "ß"), + HINDI("nls_upper(%s, 'nls_sort=binary')", "hi", "ß"), + URDU("nls_upper(%s, 'nls_sort=arabic')", "ur", "ß"), + BENGALI("nls_upper(%s, 'nls_sort=bengali')", "bn", "ß"), + TAMIL("nls_upper(%s, 'nls_sort=binary')", "ta", "ß"), + ESPERANTO("upper(%s)", "eo", ""), + XWEST_EUROPEAN("NLS_UPPER(%s,'NLS_SORT=xwest_european')", "en", ""); + + private final String sql; + private final Locale locale; + private final char[] exceptionChars; + + OracleUpperTable(String sql, String lang, String exceptionChars) { + this.sql = sql; + this.locale = new Locale(lang); + this.exceptionChars = exceptionChars.toCharArray(); + } + + /** + * Return an array containing characters for which Java's String.toUpperCase method is known + * to deviate from the result of Oracle evaluating {@link #getSql(String) this expression}. + * + * @return an array containing all exceptional characters. + */ + final char[] getUpperCaseExceptions() { + return exceptionChars; + } + + /** + * For a character, {@code exception}, contained in the String returned from + * {@link #getUpperCaseExceptions()}, this method returns the anticipated result of + * upper-casing the character in Oracle when evaluating + * {@link #getSql(String) this expression}. + * + * @return the upper case of {@code exception}, according to what Oracle would do. + * @throws IllegalArgumentException + * if the character is not contained in the String returned by + * {@link #getUpperCaseExceptions()}. 
+ */ + final String getUpperCaseExceptionMapping(char exception) { + switch (exception) { + case 'i': + switch (this) { + case TURKMEN: return "İ"; // I + default: // fall out + } + break; + case 'ß': + switch (this) { + case ENGLISH: return "ß"; // SS + case FRENCH: return "ß"; // SS + case ITALIAN: return "ß"; // SS + case SPANISH: return "ß"; // SS + case CATALAN: return "ß"; // SS + case DUTCH: return "ß"; // SS + case PORTUGUESE: return "ß"; // SS + case DANISH: return "ß"; // SS + case NORWEGIAN: return "ß"; // SS + case SWEDISH: return "ß"; // SS + case FINNISH: return "ß"; // SS + case CZECH: return "ß"; // SS + case POLISH: return "ß"; // SS + case TURKISH: return "ß"; // SS + case RUSSIAN: return "ß"; // SS + case BULGARIAN: return "ß"; // SS + case INDONESIAN: return "ß"; // SS + case ROMANIAN: return "ß"; // SS + case VIETNAMESE: return "ß"; // SS + case UKRAINIAN: return "ß"; // SS + case GREEK: return "ß"; // SS + case HEBREW: return "ß"; // SS + case SLOVAK: return "ß"; // SS + case SERBIAN_LATIN: return "ß"; // SS + case BOSNIAN: return "ß"; // SS + case GEORGIAN: return "ß"; // SS + case BASQUE: return "ß"; // SS + case MALTESE: return "ß"; // SS + case ROMANSH: return "ß"; // SS + case LUXEMBOURGISH: return "ß"; // SS + case IRISH: return "ß"; // SS + case SLOVENE: return "ß"; // SS + case CROATIAN: return "ß"; // SS + case MALAY: return "ß"; // SS + case ARABIC: return "ß"; // SS + case ESTONIAN: return "ß"; // SS + case ICELANDIC: return "ß"; // SS + case LATVIAN: return "ß"; // SS + case LITHUANIAN: return "ß"; // SS + case KYRGYZ: return "ß"; // SS + case KAZAKH: return "ß"; // SS + case TAJIK: return "ß"; // SS + case BELARUSIAN: return "ß"; // SS + case TURKMEN: return "ß"; // SS + case AZERBAIJANI: return "ß"; // SS + case ARMENIAN: return "ß"; // SS + case THAI: return "ß"; // SS + case HINDI: return "ß"; // SS + case URDU: return "ß"; // SS + case BENGALI: return "ß"; // SS + case TAMIL: return "ß"; // SS + default: // fall out + } + 
break; + case 'Ά': + switch (this) { + case GREEK: return "Α"; // Ά + default: // fall out + } + break; + case 'Έ': + switch (this) { + case GREEK: return "Ε"; // Έ + default: // fall out + } + break; + case 'Ή': + switch (this) { + case GREEK: return "Η"; // Ή + default: // fall out + } + break; + case 'Ί': + switch (this) { + case GREEK: return "Ι"; // Ί + default: // fall out + } + break; + case 'Ό': + switch (this) { + case GREEK: return "Ο"; // Ό + default: // fall out + } + break; + case 'Ύ': + switch (this) { + case GREEK: return "Υ"; // Ύ + default: // fall out + } + break; + case 'Ώ': + switch (this) { + case GREEK: return "Ω"; // Ώ + default: // fall out + } + break; + case 'ά': + switch (this) { + case GREEK: return "Α"; // Ά + default: // fall out + } + break; + case 'έ': + switch (this) { + case GREEK: return "Ε"; // Έ + default: // fall out + } + break; + case 'ή': + switch (this) { + case GREEK: return "Η"; // Ή + default: // fall out + } + break; + case 'ί': + switch (this) { + case GREEK: return "Ι"; // Ί + default: // fall out + } + break; + case 'ό': + switch (this) { + case GREEK: return "Ο"; // Ό + default: // fall out + } + break; + case 'ύ': + switch (this) { + case GREEK: return "Υ"; // Ύ + default: // fall out + } + break; + case 'ώ': + switch (this) { + case GREEK: return "Ω"; // Ώ + default: // fall out + } + break; + } + throw new IllegalArgumentException( + "No upper case mapping for char=" + exception + + " and this=" + this); + } + + @SuppressWarnings(value = "EI_EXPOSE_REP", justification = "By design.") + public final Locale getLocale() { + return locale; + } + + public String getSqlFormatString() { + return sql; + } + + public String getSql(String expr) { + return String.format(sql, expr); + } + + public String toUpperCase(String value) { + return OracleUpper.toUpperCase(this, value); + } + + public static OracleUpperTable forLinguisticSort(String sort) { + return Enum.valueOf(OracleUpperTable.class, sort); + } +} + diff --git 
a/phoenix-core/src/main/java/org/apache/phoenix/util/i18n/package-info.java b/phoenix-core/src/main/java/org/apache/phoenix/util/i18n/package-info.java new file mode 100644 index 0000000000..3878a7c082 --- /dev/null +++ b/phoenix-core/src/main/java/org/apache/phoenix/util/i18n/package-info.java @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/** + * This package contains utility classes partially copied from Salesforce's + * internationalization utility library (com.salesforce.i18n:i18n-util:1.0.4), which was + * released under the 3-clause BSD License. + * + * The i18n-util library is not maintained anymore, and it was using vulnerable dependencies. 
+ * For more info, see: https://issues.apache.org/jira/browse/PHOENIX-6818 + */ +package org.apache.phoenix.util.i18n; diff --git a/phoenix-core/src/test/java/org/apache/phoenix/util/i18n/LinguisticSortTest.java b/phoenix-core/src/test/java/org/apache/phoenix/util/i18n/LinguisticSortTest.java new file mode 100644 index 0000000000..7603b4d5b7 --- /dev/null +++ b/phoenix-core/src/test/java/org/apache/phoenix/util/i18n/LinguisticSortTest.java @@ -0,0 +1,650 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.phoenix.util.i18n; + +import static org.apache.phoenix.util.i18n.LinguisticSort.AZERBAIJANI; +import static org.apache.phoenix.util.i18n.LinguisticSort.BASQUE; +import static org.apache.phoenix.util.i18n.LinguisticSort.BENGALI; +import static org.apache.phoenix.util.i18n.LinguisticSort.BOSNIAN; +import static org.apache.phoenix.util.i18n.LinguisticSort.BULGARIAN; +import static org.apache.phoenix.util.i18n.LinguisticSort.CATALAN; +import static org.apache.phoenix.util.i18n.LinguisticSort.CHINESE_HK; +import static org.apache.phoenix.util.i18n.LinguisticSort.CHINESE_HK_STROKE; +import static org.apache.phoenix.util.i18n.LinguisticSort.CHINESE_TW; +import static org.apache.phoenix.util.i18n.LinguisticSort.CHINESE_TW_STROKE; +import static org.apache.phoenix.util.i18n.LinguisticSort.CROATIAN; +import static org.apache.phoenix.util.i18n.LinguisticSort.ESTONIAN; +import static org.apache.phoenix.util.i18n.LinguisticSort.FINNISH; +import static org.apache.phoenix.util.i18n.LinguisticSort.HUNGARIAN; +import static org.apache.phoenix.util.i18n.LinguisticSort.ICELANDIC; +import static org.apache.phoenix.util.i18n.LinguisticSort.JAPANESE; +import static org.apache.phoenix.util.i18n.LinguisticSort.KOREAN; +import static org.apache.phoenix.util.i18n.LinguisticSort.LATVIAN; +import static org.apache.phoenix.util.i18n.LinguisticSort.LITHUANIAN; +import static org.apache.phoenix.util.i18n.LinguisticSort.ROMANIAN; +import static org.apache.phoenix.util.i18n.LinguisticSort.SERBIAN_LATIN; +import static org.apache.phoenix.util.i18n.LinguisticSort.SLOVAK; +import static org.apache.phoenix.util.i18n.LinguisticSort.SLOVENE; +import static org.apache.phoenix.util.i18n.LinguisticSort.TAJIK; +import static org.apache.phoenix.util.i18n.LinguisticSort.TURKISH; +import static org.apache.phoenix.util.i18n.LinguisticSort.TURKMEN; +import static org.apache.phoenix.util.i18n.LinguisticSort.VIETNAMESE; +import static 
org.apache.phoenix.util.i18n.LinguisticSort.LUXEMBOURGISH; +import static org.apache.phoenix.util.i18n.LinguisticSort.URDU; +import static org.apache.phoenix.util.i18n.LinguisticSort.TAMIL; +import static org.apache.phoenix.util.i18n.LinguisticSort.ESPERANTO; + +import com.ibm.icu.text.Normalizer2; + +import java.text.CollationKey; +import java.text.Collator; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.EnumSet; +import java.util.List; +import java.util.Locale; +import java.util.Random; +import java.util.Set; + +import org.apache.phoenix.thirdparty.com.google.common.collect.ImmutableList; +import org.apache.phoenix.thirdparty.com.google.common.collect.Ordering; + +import junit.framework.TestCase; + +/** + * This test class was partially copied from Salesforce's internationalization utility library + * (com.salesforce.i18n:i18n-util:1.0.4), which was released under the 3-clause BSD License. + * The i18n-util library is not maintained anymore, and it was using vulnerable dependencies. + * For more info, see: https://issues.apache.org/jira/browse/PHOENIX-6818 + * + * This could be expanded significantly. 
+ */ +public class LinguisticSortTest extends TestCase { + + public LinguisticSortTest(String name) { + super(name); + } + + public void testThaiBasicSorting() { + Locale thaiLoc = new Locale("th"); + + LinguisticSort thaiSort = LinguisticSort.get(thaiLoc); + + // basic sanity check on thai collator comparisons + ImmutableList<String> unsorted = + ImmutableList.of("azw", "Ac", "ab", "21", "zaa", "b\u0e40k", "bk"); + ImmutableList<String> sorted = + ImmutableList.of("21", "ab", "Ac", "azw", "bk", "b\u0e40k", "zaa"); + + assertEquals(sorted, + Ordering.from(thaiSort.getNonCachingComparator()).sortedCopy(unsorted)); + assertEquals(sorted, + Ordering.from(thaiSort.getComparator(16)).sortedCopy(unsorted)); + } + + public void testThaiCharactersOfDeath() { + // This is the original bug report + Collator c = Collator.getInstance(new Locale("th")); + String s = "\u0e40"; + // any one of \u0e40, \u0e41, \u0e42, \u0e43, or \u0e44 will do + System.out.println(c.compare(s, s)); // In JDK6: runs forever + + + // Here's the "real" test + Locale thaiLoc = new Locale("th"); + + LinguisticSort thaiSort = LinguisticSort.get(thaiLoc); + Collator thaiColl = thaiSort.getCollator(); + + String [] oomStrings = { + "\u0e3f", "\u0e45", "\u0e40k", "\u0e44", "\u0e43", "\u0e42", "\u0e41", "\u0e40" + }; + String [] srcStrings = oomStrings; + // Deprecated Patched collator adds space after problematic characters at end of string + // (because of http://bugs.sun.com/view_bug.do?bug_id=5047314) + // Otherwise unpatched collator would OOM on these strings + // String [] srcStrings = { + // "\u0e3f", "\u0e45", "\u0e40k", "\u0e44 ", "\u0e43 ", "\u0e42 ", "\u0e41 ", "\u0e40 " + // }; + + for (int i=0; i<oomStrings.length;i++) { + String oomString = oomStrings[i]; + CollationKey key = thaiColl.getCollationKey(oomString); + assertEquals("string #"+i, srcStrings[i], key.getSourceString()); + } + } + + public void testRolodexIndexByChar() throws Exception{ + LinguisticSort englishSort = 
LinguisticSort.ENGLISH; + + assertEquals(0, englishSort.getRolodexIndexForChar("a")); + assertEquals(0, englishSort.getRolodexIndexForChar("Á")); + assertEquals(1, englishSort.getRolodexIndexForChar("b")); + assertEquals(13, englishSort.getRolodexIndexForChar("N")); + assertEquals(13, englishSort.getRolodexIndexForChar("Ñ")); + assertEquals(25, englishSort.getRolodexIndexForChar("z")); + //А below is the Cyrillic А + assertOther(Arrays.asList("А", "こ"), englishSort); + + //Spanish + LinguisticSort spanishSort = LinguisticSort.SPANISH; + assertEquals(0, spanishSort.getRolodexIndexForChar("a")); + assertEquals(0, spanishSort.getRolodexIndexForChar("Á")); + assertEquals(1, spanishSort.getRolodexIndexForChar("b")); + assertEquals(13, spanishSort.getRolodexIndexForChar("N")); + assertEquals(14, spanishSort.getRolodexIndexForChar("Ñ")); + assertEquals(26, spanishSort.getRolodexIndexForChar("z")); + //А below is the Cyrillic А + assertOther(Arrays.asList("А", "こ"), spanishSort); + + //Japanese + LinguisticSort japaneseSort = LinguisticSort.JAPANESE; + assertEquals(0, japaneseSort.getRolodexIndexForChar("a")); + assertEquals(0, japaneseSort.getRolodexIndexForChar("Á")); + assertEquals(1, japaneseSort.getRolodexIndexForChar("b")); + assertEquals(13, japaneseSort.getRolodexIndexForChar("N")); + assertEquals(13, japaneseSort.getRolodexIndexForChar("Ñ")); + assertEquals(25, japaneseSort.getRolodexIndexForChar("z")); + assertEquals(27, japaneseSort.getRolodexIndexForChar("こ")); + assertEquals(27, japaneseSort.getRolodexIndexForChar("く")); + assertEquals(31, japaneseSort.getRolodexIndexForChar("ふ")); + //А below is the Cyrillic А + assertOther(Arrays.asList("\u0410"), spanishSort); // А + + //Malay has a rolodex + LinguisticSort malaySort = LinguisticSort.MALAY; + assertEquals(0, malaySort.getRolodexIndexForChar("a")); + assertEquals(25, malaySort.getRolodexIndexForChar("z")); + assertOther(Arrays.asList("\u0410", "\u304f"), malaySort); // "А", "く" + + // Thai has a rolodex, all 
of these should be "other" + // (Thai has 44 chars, so other is 46) + LinguisticSort thaiSort = LinguisticSort.THAI; + assertConstant(Arrays.asList("A", "Á", "b", "\u304f", "\u0410"), + thaiSort, 46, "had a rolodex index."); + + } + + public void testRolodexComparedToIcu() { + Set<LinguisticSort> knownDifferences = EnumSet.of( + CATALAN, FINNISH, TURKISH, CHINESE_HK, CHINESE_HK_STROKE, CHINESE_TW, + CHINESE_TW_STROKE, JAPANESE, KOREAN, BULGARIAN, ROMANIAN, VIETNAMESE, + HUNGARIAN, SLOVAK, SERBIAN_LATIN, BOSNIAN, BASQUE, LUXEMBOURGISH, SLOVENE, + CROATIAN, ESTONIAN, ICELANDIC, LATVIAN, LITHUANIAN, TAJIK, TURKMEN, AZERBAIJANI, + URDU, BENGALI, TAMIL, ESPERANTO); + + for (LinguisticSort sort : LinguisticSort.values()) { + if (knownDifferences.contains(sort)) { + continue; + } + + String[] alphabet = sort.getAlphabet(); + String[] icuAlphabet = LinguisticSort.getAlphabetFromICU(sort.getLocale()); + String alphaAsString = Arrays.toString(alphabet); + String icuAlphaAsString = Arrays.toString(icuAlphabet); + + assertEquals("LinguisticSort for " + sort + " doesn't match", + icuAlphaAsString, alphaAsString); + if (!icuAlphaAsString.equals(alphaAsString)) { + System.out.println(sort + "\n" + icuAlphaAsString + "\n" + alphaAsString); + } else { + //System.out.println(sort + ":SAME"); + } + } + } + + private void assertOther(Collection<String> chars, LinguisticSort sort){ + assertConstant(chars, sort, sort.getAlphabetLength(), "wasn't in 'Other' category"); + } + + private void assertConstant(Collection<String> chars, LinguisticSort sort, + int constant, String message) { + for (String c : chars){ + assertEquals(c + " " + message, constant, sort.getRolodexIndexForChar(c)); + } + } + + /** + * Make sure the upper case collator works equivalently to upper-casing then collating + */ + public void testUpperCaseCollator() { + // bump these up for performance testing + final int repeatTimes = 1; + final int testSize = 1000; + + testUpperCaseCollator(true, repeatTimes, testSize); + 
testUpperCaseCollator(false, repeatTimes, testSize); + } + + /** + * Implementation of the testUpperCaseCollator that allows breaking out an ascii only + * test from a general string test + */ + private void testUpperCaseCollator(boolean asciiOnly, int repeatTimes, int testSize) { + final LinguisticSort sort = LinguisticSort.ENGLISH; + final Collator collator = sort.getCollator(); + + final Collator ucCollator = sort.getUpperCaseCollator(false); + + final Random r = new Random(); + final int maxLength = 100; + for (int iteration = 0; iteration < repeatTimes; iteration++) { + final boolean lastTime = iteration == repeatTimes - 1; + final String[] originals = new String[testSize]; + for (int i = 0; i < testSize; i++) { + switch (i) { + case 0: + originals[i] = "abß"; + break; + case 1: + originals[i] = "abSS"; + break; + case 2: + originals[i] = "abß"; + break; + case 3: + originals[i] = "ffo"; + break; + case 4: + originals[i] = "ffi"; + break; + case 5: + originals[i] = "FFI"; + break; + case 6: + originals[i] = "fred"; + break; + case 7: + originals[i] = "FRED"; + break; + case 8: + originals[i] = "FREE"; + break; + case 9: + originals[i] = "剫"; + break; + case 10: + originals[i] = "뻎"; + break; + case 11: + originals[i] = "\u1fe3"; + break; + case 12: + originals[i] = "\u05d7"; + break; + case 13: + originals[i] = "\u1fd3"; + break; + case 14: + originals[i] = "\u1441"; + break; + case 15: + originals[i] = "\ub9fe"; + break; + case 16: + originals[i] = "\u0398"; + break; + case 17: + originals[i] = "\u0399"; + break; + case 18: + originals[i] = "\u039a"; + break; + case 19: + originals[i] = "\u4371"; + break; + case 20: + originals[i] = "\ufb06"; + break; + default : + originals[i] = randomString(r, maxLength, asciiOnly); + } + } + + final int[] upperResults = new int[testSize]; + { + final long start = System.currentTimeMillis(); + for (int i = 0; i < testSize; i++) { + final int next = i + 1 == testSize ? 
0 : i + 1; + upperResults[i] = collator.compare(sort.getUpperCaseValue(originals[i], false), + sort.getUpperCaseValue(originals[next], false)); + } + if (lastTime) { + final long time = System.currentTimeMillis() - start; + System.out.println("Compared " + testSize + " " + (asciiOnly ? "ascii " : "") + + "strings with upper casing in " + time + "ms"); + } + } + + final int[] caseResults = new int[testSize]; + { + final long start = System.currentTimeMillis(); + for (int i = 0; i < testSize; i++) { + final int next = i + 1 == testSize ? 0 : i + 1; + caseResults[i] = ucCollator.compare(originals[i], originals[next]); + } + if (lastTime) { + final long time = System.currentTimeMillis() - start; + System.out.println("Compared " + testSize + " " + (asciiOnly ? "ascii " : "") + + "strings with upper case collator comparison in " + time + "ms"); + } + } + + final int[] keyResults = new int[testSize]; + { + final long start = System.currentTimeMillis(); + for (int i = 0; i < testSize; i++) { + final int next = i + 1 == testSize ? 0 : i + 1; + keyResults[i] = ucCollator.getCollationKey(originals[i]) + .compareTo(ucCollator.getCollationKey(originals[next])); + } + if (lastTime) { + final long time = System.currentTimeMillis() - start; + System.out.println("Compared " + testSize + " " + (asciiOnly ? "ascii " : "") + + "strings with collation keys in " + time + "ms"); + } + } + + if (lastTime) { + System.out.println(); + } + + if (lastTime) { + // normalizing helps see why strings don't compare the same when upper-cased + final Normalizer2 normalizer = Normalizer2.getNFKDInstance(); + for (int i = 0; i < testSize; i++) { + final int next = i + 1 == testSize ? 0 : i + 1; + final boolean caseOk = upperResults[i] == caseResults[i]; + final boolean keyOk = upperResults[i] == keyResults[i]; + if (!caseOk || !keyOk) { + final String message = + "Did not get expected result when comparing string " + i + " " + + (caseOk ? 
"" : "using upper case collator comparison ") + + (caseOk || keyOk ? "" : "or ") + + (keyOk ? "" : "using collation key comparison ") + + "\n" + + "'" + escape(originals[i]) + "'\n" + + "(" + escape(sort.getUpperCaseValue(originals[i], false)) + ")\n" + + "<" + escape(normalizer.normalize(originals[i])) + "> " + + "with string " + next + " \n" + + "'" + escape(originals[next]) + "'\n" + + "(" + escape(sort.getUpperCaseValue(originals[next], false)) + + ")\n " + + "<" + escape(normalizer.normalize(originals[next])) + ">"; + assertEquals(message, upperResults[i], caseResults[i]); + } + } + } + } + } + + /** + * For diagnosis of mismatched strings, dumps a string using standard Java notation + * for escaping non-printable or non-ascii characters + */ + private String escape(String string) { + final StringBuilder sb = new StringBuilder(string.length() * 2); + int index = 0; + while (index < string.length()) { + final int ch = string.codePointAt(index); + index += Character.charCount(ch); + + escapeCodePoint(sb, ch); + } + return sb.toString(); + } + + /** + * Escapes a single code point so that non-ascii and non-printable characters use + * their standard Java escape + */ + private void escapeCodePoint(final StringBuilder sb, final int ch) { + switch(ch) { + case '\b' : sb.append("\\b"); + break; + case '\t' : sb.append("\\t"); + break; + case '\n' : sb.append("\\n"); + break; + case '\r' : sb.append("\\r"); + break; + case '\f' : sb.append("\\f"); + break; + case '\"' : sb.append("\\\""); + break; + case '\\' : sb.append("\\\\"); + break; + default: + if (ch < 0x20 || ch > 0x7E) { + sb.append(String.format("\\u%04x", ch)); + } else { + sb.appendCodePoint(ch); + } + } + } + + /** + * Generates a random string with between 0 and maxLength characters + */ + private String randomString(Random r, int maxLength, boolean asciiOnly) { + final int length = r.nextInt(maxLength); + return randomFixedLengthString(r, length, asciiOnly); + } + + + /** + * Generates a random string 
of the given length + */ + private String randomFixedLengthString(Random r, int length, boolean asciiOnly) { + final StringBuilder sb = new StringBuilder(); + for (int i = 0; i < length; i++) { + char c = 0; + while (!Character.isDefined(c) || Character.isISOControl(c)) { + c = (char)(asciiOnly ? r.nextInt(128) : r.nextInt()); + } + sb.append(c); + } + return sb.toString(); + } + + public void testUpperCaseExceptionChars() { + // Sharp s in English + String[][] enCases = new String[][] { + // { input, expected output } + new String[] { "ß", "ß" }, + new String[] { "ßß", "ßß" }, + new String[] { "ßßß", "ßßß" }, + new String[] { "aß", "Aß" }, + new String[] { "aaaß", "AAAß" }, + new String[] { "ßa", "ßA" }, + new String[] { "ßaaa", "ßAAA" }, + new String[] { "aßb", "AßB" }, + new String[] { "aaaßbbb", "AAAßBBB" }, + new String[] { "ßaß", "ßAß" }, + new String[] { "ßaaaß", "ßAAAß" }, + new String[] { "aßbßc", "AßBßC" }, + new String[] { "aaaßbbbßccc", "AAAßBBBßCCC" }, + new String[] { "aßßc", "AßßC" }, + new String[] { "aaaßßccc", "AAAßßCCC" }, + }; + + for (String[] c : enCases) { + assertEquals(c[1], LinguisticSort.ENGLISH.getUpperCaseValue(c[0], false)); + } + + // Omicron in Greek + String[][] greekCases = new String[][] { + new String[] { "\u039f", "\u039f" }, // capital omicron + new String[] { "Ό", "\u039f" } + + }; + + for (String[] c : greekCases) { + assertEquals(c[1], LinguisticSort.GREEK.getUpperCaseValue(c[0], false)); + } + } + + public void testUsesUpper() { + assertTrue(LinguisticSort.ENGLISH.usesUpperToGetUpperCase(false)); + assertTrue(LinguisticSort.ESPERANTO.usesUpperToGetUpperCase(false)); + assertTrue(!LinguisticSort.GERMAN.usesUpperToGetUpperCase(false)); + } + + public void testGetUpperCaseCollationKey() { + assertEquals(LinguisticSort.ENGLISH.getUpperCaseSql("x", false), + LinguisticSort.ENGLISH.getUpperCollationKeySql("x", false)); + } + + /** + * I wanted to see the perf impact of doing special-case logic in the EN locale for the German + * 
sharp s, ß. Rename this test (remove the leading _) to run it, e.g. in Eclipse. + * <p> + * This method generates two sets of 1000 randomish Strings, one with sharp s and one without. + * Then it runs 1 million uppercase operations on each bank of strings, using the EN locale + * (with the special-case logic) and a test locale -- EO, Esperanto -- which does not have + * any special-case logic. + * <p> + * For posterity, when I run this on my machine, I see results like this + * (averages rounded to nearest 10ms): + * <p> + * <table> + * <tr><td></td><td>ENGLSIH</td><td>ESPERANTO</td><td>GREEK</td></tr> + * <tr><td>with sharp s</td><td>330ms</td><td>260ms</td><td>370ms</td></tr> + * <tr><td>without sharp s</td><td>150ms</td><td>130ms</td><td>213ms</td></tr> + * </table> + */ + public void _testUpperCasePerf() { + String[] withSharpS = genStrings(1000, true); + String[] withoutSharpS = genStrings(1000, false); + + System.out.println("ENGLISH, with ß:"); + runUpperCase(LinguisticSort.ENGLISH, withSharpS); + System.out.println("ENGLISH, without ß:"); + runUpperCase(LinguisticSort.ENGLISH, withoutSharpS); + + System.out.println("ESPERANTO, with ß:"); + runUpperCase(LinguisticSort.ESPERANTO, withSharpS); + System.out.println("ESPERANTO, without ß:"); + runUpperCase(LinguisticSort.ESPERANTO, withoutSharpS); + + // Interesting for having a lot of exceptions. 
+ System.out.println("GREEK, with ß:"); + runUpperCase(LinguisticSort.GREEK, withSharpS); + System.out.println("GREEK, without ß:"); + runUpperCase(LinguisticSort.GREEK, withoutSharpS); + } + + private void runUpperCase(LinguisticSort sort, String[] inputs) { + // Warm up + for (int i = 0; i < 10000; i++) { + sort.getUpperCaseValue(inputs[i % inputs.length], false); + } + + // Run experiment + for (int i = 0; i < 3; i++) { + long start = System.currentTimeMillis(); + for (int j = 0; j < 1000000; j++) { + sort.getUpperCaseValue(inputs[j % inputs.length], false); + } + + System.out.println("[" + (i + 1) + "] Complete in " + + (System.currentTimeMillis() - start) + "ms."); + } + } + + /** + * Return n randomly generated strings, each containing at least + * one sharp s if useSharpS is true. + * */ + private String[] genStrings(int n, boolean useSharpS) { + Random r = new Random(); + + String[] inputs = new String[n]; + for (int i = 0; i < inputs.length; i++) { + inputs[i] = randomString(r, r.nextInt(12) + 1, r.nextBoolean()) + + (useSharpS? "ß" : "") + + (r.nextBoolean() ? + randomString(r, r.nextInt(12) + 1, r.nextBoolean()) + (useSharpS? "ß" : "") + : "") + + (randomString(r, r.nextInt(12) + 1, r.nextBoolean())); + + if (!useSharpS) assertFalse(inputs[i].contains("ß")); + } + return inputs; + } + + private List<String> cloneAndSort(LinguisticSort sort, List<String> source) { + List<String> result = new ArrayList<String>(source); + Collections.sort(result, sort.getCollator()); + return result; + } + + /** + * Validate that the sorting of the linguistic sorts for various locales is "correct" + * The toSort below is in this order. 
+ * 阿嗄阾啊 : āáǎa + * 仈㶚 : bā bà + * 齑: ji + */ + public void testChineseSorting() { + final List<String> toSort = ImmutableList.of("\u963f", "\u55c4", "\u963e", + "\u554a", "\u4ec8", "\u3d9a", "\u9f51"); + assertEquals(ImmutableList.of("\u4ec8", "\u554a", "\u55c4", "\u3d9a", "\u963e", + "\u963f", "\u9f51"), cloneAndSort(LinguisticSort.CHINESE, toSort)); + assertEquals(ImmutableList.of("\u4ec8", "\u554a", "\u55c4", "\u3d9a", "\u963e", + "\u963f", "\u9f51"), cloneAndSort(LinguisticSort.CHINESE_HK, toSort)); + assertEquals(ImmutableList.of("\u4ec8", "\u554a", "\u55c4", "\u3d9a", "\u963e", + "\u963f", "\u9f51"), cloneAndSort(LinguisticSort.CHINESE_TW, toSort)); + assertEquals(ImmutableList.of("\u4ec8", "\u963e", "\u963f", "\u554a", "\u55c4", + "\u9f51", "\u3d9a"), cloneAndSort(LinguisticSort.CHINESE_STROKE, toSort)); + assertEquals(ImmutableList.of("\u4ec8", "\u963e", "\u963f", "\u554a", "\u55c4", + "\u9f51", "\u3d9a"), cloneAndSort(LinguisticSort.CHINESE_HK_STROKE, toSort)); + assertEquals(ImmutableList.of("\u4ec8", "\u963e", "\u963f", "\u554a", "\u55c4", + "\u9f51", "\u3d9a"), cloneAndSort(LinguisticSort.CHINESE_TW_STROKE, toSort)); + assertEquals(ImmutableList.of("\u963f", "\u55c4", "\u554a", "\u4ec8", "\u9f51", + "\u963e", "\u3d9a"), cloneAndSort(LinguisticSort.CHINESE_PINYIN, toSort)); + } + + public void testChineseLocaleMapping() { + assertEquals(LinguisticSort.CHINESE, + LinguisticSort.get(new Locale("zh"))); + assertEquals(LinguisticSort.CHINESE_TW, + LinguisticSort.get(new Locale("zh","TW"))); + assertEquals(LinguisticSort.CHINESE, + LinguisticSort.get(new Locale("zh","SG"))); + assertEquals(LinguisticSort.CHINESE_HK, + LinguisticSort.get(new Locale("zh","HK"))); + assertEquals(LinguisticSort.CHINESE_TW_STROKE, + LinguisticSort.get(new Locale("zh","TW","STROKE"))); + assertEquals(LinguisticSort.CHINESE_HK_STROKE, + LinguisticSort.get(new Locale("zh","HK","STROKE"))); + assertEquals(LinguisticSort.CHINESE_STROKE, + LinguisticSort.get(new 
Locale("zh","CN","STROKE"))); + assertEquals(LinguisticSort.CHINESE_STROKE, + LinguisticSort.get(new Locale("zh","SG","STROKE"))); + assertEquals(LinguisticSort.CHINESE_STROKE, + LinguisticSort.get(new Locale("zh","","STROKE"))); + assertEquals(LinguisticSort.CHINESE_PINYIN, + LinguisticSort.get(new Locale("zh","CN","PINYIN"))); + assertEquals(LinguisticSort.CHINESE_PINYIN, + LinguisticSort.get(new Locale("zh","SG","PINYIN"))); + assertEquals(LinguisticSort.CHINESE_PINYIN, + LinguisticSort.get(new Locale("zh","","PINYIN"))); + } +} diff --git a/phoenix-core/src/test/java/org/apache/phoenix/util/i18n/OracleUpperTableGeneratorTest.java b/phoenix-core/src/test/java/org/apache/phoenix/util/i18n/OracleUpperTableGeneratorTest.java new file mode 100644 index 0000000000..2e101cf78d --- /dev/null +++ b/phoenix-core/src/test/java/org/apache/phoenix/util/i18n/OracleUpperTableGeneratorTest.java @@ -0,0 +1,391 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.phoenix.util.i18n; + +import junit.framework.TestCase; + +import java.io.PrintWriter; +import java.io.StringWriter; +import java.util.Locale; + +/** + * This test class was partially copied from Salesforce's internationalization utility library + * (com.salesforce.i18n:i18n-util:1.0.4), which was released under the 3-clause BSD License. + * The i18n-util library is not maintained anymore, and it was using vulnerable dependencies. + * For more info, see: https://issues.apache.org/jira/browse/PHOENIX-6818 + * + * A generator for OracleUpperTable.java. This generator creates an OracleUpperTable for each + * of a number of {@link UpperExpr PL/SQL expressions}, which simply tabulates the these + * differences, allowing them to be compensated for. + * <p> + * May be run as a JUnit test or as a stand-alone Java application. Run the output in Oracle + * to generate the source for OracleUpperTable.java. + * + * @see OracleUpper + * @see OracleUpperTable + */ +public class OracleUpperTableGeneratorTest extends TestCase { + + private static final char[] charsToTest = new char[] { + // i may be messed up for Turkic languages where it's supposed to upper-case + // to dotted I. + 'i', + // Sharp s may upper-case to SS or itself, depending on the details. + 'ß', + // Oracle removes tonos from all of these when upper-casing. + 'Ά', 'Έ', 'Ή', 'Ί', 'Ό', 'Ύ','Ώ','ά','έ','ή','ί','ό','ύ','ώ' + }; + + /** + * Most of these were just generated from the LinguisticSort enum: + * + * <pre><code> + * public static void generateValuesFromLinguisticSort() { + * for (LinguisticSort s : LinguisticSort.values()) { + * System.out.println(String.format("%1$s(\"%2$s\", \"%3$s\"),", + * s.name(), s.getUpperSqlFormatString(), s.getLocale().getLanguage())); + * } + * } + * </code></pre> + * + * Each value is a PL/SQL upper case expression that may return different results than + * Java's String.toUpperCase method for the given language. 
+ */ + private enum UpperExpr { + ENGLISH("upper(%s)", "en"), + GERMAN("nls_upper(%s, 'nls_sort=xgerman')", "de"), + FRENCH("nls_upper(%s, 'nls_sort=xfrench')", "fr"), + ITALIAN("nls_upper(%s, 'nls_sort=italian')", "it"), + SPANISH("nls_upper(%s, 'nls_sort=spanish')", "es"), + CATALAN("nls_upper(%s, 'nls_sort=catalan')", "ca"), + DUTCH("nls_upper(%s, 'nls_sort=dutch')", "nl"), + PORTUGUESE("nls_upper(%s, 'nls_sort=west_european')", "pt"), + DANISH("nls_upper(%s, 'nls_sort=danish')", "da"), + NORWEGIAN("nls_upper(%s, 'nls_sort=norwegian')", "no"), + SWEDISH("nls_upper(%s, 'nls_sort=swedish')", "sv"), + FINNISH("nls_upper(%s, 'nls_sort=finnish')", "fi"), + CZECH("nls_upper(%s, 'nls_sort=xczech')", "cs"), + POLISH("nls_upper(%s, 'nls_sort=polish')", "pl"), + TURKISH("nls_upper(translate(%s,'i','İ'), 'nls_sort=xturkish')", "tr"), + CHINESE_HK("nls_upper(to_single_byte(%s), 'nls_sort=tchinese_radical_m')", "zh"), + CHINESE_TW("nls_upper(to_single_byte(%s), 'nls_sort=tchinese_radical_m')", "zh"), + CHINESE("nls_upper(to_single_byte(%s), 'nls_sort=schinese_radical_m')", "zh"), + JAPANESE("nls_upper(to_single_byte(%s), 'nls_sort=japanese_m')", "ja"), + KOREAN("nls_upper(to_single_byte(%s), 'nls_sort=korean_m')", "ko"), + RUSSIAN("nls_upper(%s, 'nls_sort=russian')", "ru"), + BULGARIAN("nls_upper(%s, 'nls_sort=bulgarian')", "bg"), + INDONESIAN("nls_upper(%s, 'nls_sort=indonesian')", "in"), + ROMANIAN("nls_upper(%s, 'nls_sort=romanian')", "ro"), + VIETNAMESE("nls_upper(%s, 'nls_sort=vietnamese')", "vi"), + UKRAINIAN("nls_upper(%s, 'nls_sort=ukrainian')", "uk"), + HUNGARIAN("nls_upper(%s, 'nls_sort=xhungarian')", "hu"), + GREEK("nls_upper(%s, 'nls_sort=greek')", "el"), + HEBREW("nls_upper(%s, 'nls_sort=hebrew')", "iw"), + SLOVAK("nls_upper(%s, 'nls_sort=slovak')", "sk"), + SERBIAN_CYRILLIC("nls_upper(%s, 'nls_sort=generic_m')", "sr"), + SERBIAN_LATIN("nls_upper(%s, 'nls_sort=xcroatian')", "sh"), + BOSNIAN("nls_upper(%s, 'nls_sort=xcroatian')", "bs"), + GEORGIAN("nls_upper(%s, 
'nls_sort=binary')", "ka"), + BASQUE("nls_upper(%s, 'nls_sort=west_european')", "eu"), + MALTESE("nls_upper(%s, 'nls_sort=west_european')", "mt"), + ROMANSH("nls_upper(%s, 'nls_sort=west_european')", "rm"), + LUXEMBOURGISH("nls_upper(%s, 'nls_sort=west_european')", "lb"), + IRISH("nls_upper(%s, 'nls_sort=west_european')", "ga"), + SLOVENE("nls_upper(%s, 'nls_sort=xslovenian')", "sl"), + CROATIAN("nls_upper(%s, 'nls_sort=xcroatian')", "hr"), + MALAY("nls_upper(%s, 'nls_sort=malay')", "ms"), + ARABIC("nls_upper(%s, 'nls_sort=arabic')", "ar"), + ESTONIAN("nls_upper(%s, 'nls_sort=estonian')", "et"), + ICELANDIC("nls_upper(%s, 'nls_sort=icelandic')", "is"), + LATVIAN("nls_upper(%s, 'nls_sort=latvian')", "lv"), + LITHUANIAN("nls_upper(%s, 'nls_sort=lithuanian')", "lt"), + KYRGYZ("nls_upper(%s, 'nls_sort=binary')", "ky"), + KAZAKH("nls_upper(%s, 'nls_sort=binary')", "kk"), + TAJIK("nls_upper(%s, 'nls_sort=russian')", "tg"), + BELARUSIAN("nls_upper(%s, 'nls_sort=russian')", "be"), + TURKMEN("nls_upper(translate(%s,'i','İ'), 'nls_sort=xturkish')", "tk"), + AZERBAIJANI("nls_upper(translate(%s,'i','İ'), 'nls_sort=xturkish')", "az"), + ARMENIAN("nls_upper(%s, 'nls_sort=binary')", "hy"), + THAI("nls_upper(%s, 'nls_sort=thai_dictionary')", "th"), + HINDI("nls_upper(%s, 'nls_sort=binary')", "hi"), + URDU("nls_upper(%s, 'nls_sort=arabic')", "ur"), + BENGALI("nls_upper(%s, 'nls_sort=bengali')", "bn"), + TAMIL("nls_upper(%s, 'nls_sort=binary')", "ta"), + ESPERANTO("upper(%s)", "eo"), + + // for formulas + XWEST_EUROPEAN("NLS_UPPER(%s,'NLS_SORT=xwest_european')", "en"); + + + private final String expr; + private final Locale locale; + + /** + * @param expr the PL/SQL expression with %s wildcards for the single string input. + * @param langCode ISO code for the language to use, as in + * <code> str.toUpperCase(new Locale(langCode))<code>. 
+ */ + private UpperExpr(String expr, String langCode) { + this.expr = expr; + this.locale = new Locale(langCode); + } + + private String getSql(char value) { + return String.format(expr, "unistr('\\" + hexCodePoint(value) + "')"); + } + + private String getJava(char value) { + return Character.toString(value).toUpperCase(locale); + } + } + + /** + * This method generates some anonymous PL/SQL routines which, when run, will generate an + * OracleUpperTable value for each {@code UpperExpr}. Each table is created by comparing + * the result of {@link String#toUpperCase(Locale)} against a + * {@link UpperExpr#getSql(char) PL/SQL expression}. The table contains all deviations from + * Oracle for each character in a {@link #charsToTest given set} that we know are fussy. + */ + public static void generateUpperCaseExceptions(PrintWriter out) { + + out.println("set serveroutput on;"); + out.println("set define off;"); // So we don't have to escape ampersands. + out.println("/"); + out.println("BEGIN"); + + putLine(out, "/*"); + putLine(out, " * Licensed to the Apache Software Foundation (ASF) under one or more"); + putLine(out, " * contributor license agreements. See the NOTICE file distributed with"); + putLine(out, " * this work for additional information regarding copyright ownership."); + putLine(out, " * The ASF licenses this file to you under the Apache License, Version 2.0"); + putLine(out, " * (the \"License\"); you may not use this file except in compliance with"); + putLine(out, " * the License. 
You may obtain a copy of the License at"); + putLine(out, " *"); + putLine(out, " * http://www.apache.org/licenses/LICENSE-2.0"); + putLine(out, " *"); + putLine(out, " * Unless required by applicable law or agreed to in writing, software"); + putLine(out, " * distributed under the License is distributed on an \"AS IS\" BASIS,"); + putLine(out, " * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied."); + putLine(out, " * See the License for the specific language governing permissions and"); + putLine(out, " * limitations under the License."); + putLine(out, " */"); + + putLine(out, "package i18n;"); + putLine(out, ""); + putLine(out, "import java.util.Locale;"); + putLine(out, "import edu.umd.cs.findbugs.annotations.NonNull;"); + putLine(out, ""); + putLine(out, "/**"); + putLine(out, " * Generated by " + OracleUpperTableGeneratorTest.class.getCanonicalName()); + putLine(out, " * <p>"); + putLine(out, " * An instance of this enum codifies the difference between executing a " + + "{@link #getSqlFormatString() particular PL/SQL"); + putLine(out, " * expression} in Oracle and executing {@link String#toUpperCase(Locale)} " + + "for a {@link #getLocale() particular locale}"); + putLine(out, " * in Java. 
These differences (also called exceptions) are expressed by " + + "the output of {@link #getUpperCaseExceptions()}"); + putLine(out, " * and {@link #getUpperCaseExceptionMapping(char)}."); + putLine(out, " * <p>"); + putLine(out, " * The tables are generated by testing a particular set of characters " + + "that are known to contain exceptions and"); + putLine(out, " * {@link #toUpperCase(String) may be used} to compensate for exceptions " + + "found and generate output in Java that will be"); + putLine(out, " * consistent with Oracle for the given (sql expression, locale) pair " + + "over all tested values."); + putLine(out, " * <p>"); + putLine(out, " * Characters tested:"); + putLine(out, " * <ul>"); + for (char c : charsToTest) { + putLine(out, " * <li>U+%1$s &#x%1$s</li>", hexCodePoint(c)); + } + putLine(out, " * </ul>"); + putLine(out, " *"); + putLine(out, " * @see OracleUpper"); + putLine(out, " */"); + putLine(out, "public enum OracleUpperTable {"); + + for (UpperExpr u : UpperExpr.values()) { + put(out, " %s(\"%s\", \"%s\", \"", u.name(), u.expr, u.locale.getLanguage()); + + // Don't generate any exceptions for EO, it's a test value and + // I wanna use it as a baseline. 
+ if (u != UpperExpr.ESPERANTO) { + for (char c : charsToTest) { + String template = "IF %1$s <> '%2$s' THEN dbms_output.put(unistr('\\%3$s')); END IF;"; + out.println(String.format(template, u.getSql(c), u.getJava(c), hexCodePoint(c))); + } + } + + putLine(out, "\"),"); + } + + putLine(out, " ;"); + putLine(out, ""); + putLine(out, " private final String sql;"); + putLine(out, " private final Locale locale;"); + putLine(out, " private final char[] exceptionChars;"); + putLine(out, ""); + putLine(out, " private OracleUpperTable(String sql, String lang, " + + "String exceptionChars) {"); + putLine(out, " this.sql = sql;"); + putLine(out, " this.locale = new Locale(lang);"); + putLine(out, " this.exceptionChars = exceptionChars.toCharArray();"); + putLine(out, " }"); + putLine(out, ""); + putLine(out, " /**"); + putLine(out, " * Return an array containing characters for which Java's " + + "String.toUpperCase method is known to"); + putLine(out, " * deviate from the result of Oracle evaluating {@link #getSql(String) " + + "this expression}."); + putLine(out, " *"); + putLine(out, " * @return an array containing all exceptional characters."); + putLine(out, " */"); + putLine(out, " final @NonNull char[] getUpperCaseExceptions() {"); + putLine(out, " return exceptionChars;"); + putLine(out, " }"); + putLine(out, ""); + putLine(out, " /**"); + putLine(out, " * For a character, {@code exception}, contained in the String " + + "returned from"); + putLine(out, " * {@link #getUpperCaseExceptions()}, this method returns the " + + "anticipated result of upper-casing"); + putLine(out, " * the character in Oracle when evaluating {@link #getSql(String) " + + "this expression}."); + putLine(out, " *"); + putLine(out, " * @return the upper case of {@code exception}, according to what " + + "Oracle would do."); + putLine(out, " * @throws IllegalArgumentException"); + putLine(out, " * if the character is not contained in the String returned"); + putLine(out, " * by {@link 
#getUpperCaseExceptions()}."); + putLine(out, " */"); + putLine(out, " final String getUpperCaseExceptionMapping(char exception) {"); + + putLine(out, " switch (exception) {"); + for (char c : charsToTest){ + putLine(out, " case '%s':", "" + c); + putLine(out, " switch (this) {"); + for (UpperExpr u : UpperExpr.values()) { + if (u == UpperExpr.ESPERANTO) { + continue; + } + String template = "IF %1$s <> '%2$s' THEN dbms_output.put_line(' " + + "case %3$s: return ' || '\"' || %1$s || '\"; // %2$s'); END IF;"; + out.println(String.format(template, + u.getSql(c), + u.getJava(c), + u.name())); + } + putLine(out, " default: // fall out"); + putLine(out, " }"); + putLine(out, " break;"); + } + putLine(out, " }"); + + putLine(out, " throw new IllegalArgumentException("); + putLine(out, " \"No upper case mapping for char=\" + exception"); + putLine(out, " + \" and this=\" + this);"); + putLine(out, " }"); + putLine(out, ""); + + putLine(out, " public final Locale getLocale() {"); + putLine(out, " return locale;"); + putLine(out, " }"); + putLine(out, ""); + + putLine(out, " public String getSqlFormatString() {"); + putLine(out, " return sql;"); + putLine(out, " }"); + putLine(out, ""); + + putLine(out, " public String getSql(String expr) {"); + putLine(out, " return String.format(sql, expr);"); + putLine(out, " }"); + putLine(out, ""); + + putLine(out, " public String toUpperCase(String value) {"); + putLine(out, " return OracleUpper.toUpperCase(this, value);"); + putLine(out, " }"); + putLine(out, ""); + + putLine(out, " public static final OracleUpperTable forLinguisticSort(String sort) {"); + putLine(out, " return Enum.valueOf(OracleUpperTable.class, sort);"); + putLine(out, " }"); + putLine(out, "}"); + + out.println("END;"); + } + + /** Escape single quotes by doubling them up (i.e. two single quotes in a row). 
*/ + private static String sqlEscape(String str) { + //return TextUtil.replaceChar(str, '\'', "''"); + return str.replace("'", "''"); + } + + /** Return four hex digits of the character's codepoint. */ + private static String hexCodePoint(char c) { + String cp = Integer.toHexString(c); + while (cp.length() < 4) { + cp = "0" + cp; + } + return cp; + } + + /** Send to standard output a dbms_output.put_line call that will emit the result of + * {@link String#format(String, Object...) formatting} {@code str} with {@code args}. + * + * @param str a format string + * @param args optional format arguments. + */ + private static void put(PrintWriter out, String str, String... args) { + out.println("dbms_output.put('" + format(str, args) + "');"); + } + + /** Send to standard output a dbms_output.put call that will emit the result of + * {@link #format(String, String...) formatting} {@code str} with {@code args}. + * + * @param str a format string + * @param args optional format arguments. + */ + private static void putLine(PrintWriter out, String str, String... args) { + out.println("dbms_output.put_line('" + format(str, args) + "');"); + } + + /** + * Both {@code str} and {@code args} will be {@link #sqlEscape(String) sql escaped}, + * and then {@code str} will be {@link String#format(String, Object...) formatted} + * using {@code args}. + */ + private static String format(String str, String... 
args) { + str = sqlEscape(str); + if (args != null && args.length > 0) { + for (int i = 0; i < args.length; i++) { + args[i] = sqlEscape(args[i]); + } + str = String.format(str, (Object[])args); + } + return str; + } + + public static void main(String[] args) { + generateUpperCaseExceptions(new PrintWriter(System.out)); + } + + public void testGenerateUpperCaseExceptions() { + // Don't bother logging it, just see if there's an exception + generateUpperCaseExceptions(new PrintWriter(new StringWriter())); + } +} diff --git a/pom.xml b/pom.xml index 3cac0e074a..ef99d0de18 100644 --- a/pom.xml +++ b/pom.xml @@ -123,7 +123,7 @@ <joni.version>2.1.31</joni.version> <omid.version>1.1.0</omid.version> <stream.version>2.9.5</stream.version> - <i18n-util.version>1.0.4</i18n-util.version> + <icu4j.version>72.1</icu4j.version> <guice.version>4.0</guice.version> <zookeeper.version>3.5.7</zookeeper.version> <curator.version>4.2.0</curator.version> @@ -1428,9 +1428,14 @@ <version>${stream.version}</version> </dependency> <dependency> - <groupId>com.salesforce.i18n</groupId> - <artifactId>i18n-util</artifactId> - <version>${i18n-util.version}</version> + <groupId>com.ibm.icu</groupId> + <artifactId>icu4j</artifactId> + <version>${icu4j.version}</version> + </dependency> + <dependency> + <groupId>com.ibm.icu</groupId> + <artifactId>icu4j-localespi</artifactId> + <version>${icu4j.version}</version> </dependency> <dependency> <groupId>com.lmax</groupId>