[ https://issues.apache.org/jira/browse/PHOENIX-4237?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16209670#comment-16209670 ]
ASF GitHub Bot commented on PHOENIX-4237: ----------------------------------------- Github user JamesRTaylor commented on a diff in the pull request: https://github.com/apache/phoenix/pull/275#discussion_r145474350 --- Diff: phoenix-core/src/main/java/org/apache/phoenix/expression/function/CollationKeyFunction.java --- @@ -0,0 +1,230 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.phoenix.expression.function; + +import java.io.DataInput; +import java.io.IOException; +import java.sql.SQLException; +import java.text.Collator; +import java.util.List; +import java.util.Locale; + +import org.apache.commons.lang.BooleanUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hbase.io.ImmutableBytesWritable; +import org.apache.phoenix.expression.Expression; +import org.apache.phoenix.expression.LiteralExpression; +import org.apache.phoenix.parse.FunctionParseNode; +import org.apache.phoenix.schema.tuple.Tuple; +import org.apache.phoenix.schema.types.PBoolean; +import org.apache.phoenix.schema.types.PDataType; +import org.apache.phoenix.schema.types.PInteger; +import org.apache.phoenix.schema.types.PVarbinary; +import org.apache.phoenix.schema.types.PVarchar; +import org.apache.phoenix.util.VarBinaryFormatter; + +import com.force.db.i18n.LinguisticSort; +import com.force.i18n.LocaleUtils; + +/** + * A Phoenix Function that calculates a collation key for an input + * string based on a caller-provided locale and collator strength and + * decomposition settings. + * + * The locale should be specified as xx_yy_variant where xx is the ISO + * 639-1 2-letter language code, yy is the ISO 3166 2-letter + * country code. Both countryCode and variant are optional. For + * example, zh_TW_STROKE, zh_TW and zh are all valid locale + * representations. Note the language code, country code and variant + * are used as arguments to the constructor of java.util.Locale. + * + * This function uses the open-source grammaticus and i18n-util + * packages to obtain the collators it needs from the provided locale. + * + * The LinguisticSort implementation in i18n-util encapsulates + * sort-related functionality for a substantive list of locales. 
For + * each locale, it provides a collator and an Oracle-specific database + * function that can be used to sort strings according to the natural + * language rules of that locale. + * + * This function uses the collator returned by + * LinguisticSort.getCollator to produce a collation key for its input + * string. A user can expect that the sorting semantics of this + * function for a given locale is equivalent to the sorting behaviour + * of an Oracle query that is constructed using the Oracle functions + * returned by LinguisticSort for that locale. + * + * The optional third argument to the function is a boolean that + * specifies whether to use the upper-case collator (case-insensitive) + * returned by LinguisticSort.getUpperCaseCollator. + * + * The optional fourth and fifth arguments are used to set + * respectively the strength and decomposition of the collator returned + * by LinguisticSort using the setStrength and setDecomposition + * methods of java.text.Collator. + * + * @author snakhoda-sfdc + * + */ +@FunctionParseNode.BuiltInFunction(name = CollationKeyFunction.NAME, args = { + // input string + @FunctionParseNode.Argument(allowedTypes = { PVarchar.class }), + // ISO Code for Locale + @FunctionParseNode.Argument(allowedTypes = { PVarchar.class }, isConstant = true), + // whether to use special upper case collator + @FunctionParseNode.Argument(allowedTypes = { PBoolean.class }, defaultValue = "false", isConstant = true), + // collator strength + @FunctionParseNode.Argument(allowedTypes = { PInteger.class }, defaultValue = "null", isConstant = true), + // collator decomposition + @FunctionParseNode.Argument(allowedTypes = { PInteger.class }, defaultValue = "null", isConstant = true) }) +public class CollationKeyFunction extends ScalarFunction { + + private static final Log LOG = LogFactory.getLog(CollationKeyFunction.class); + + public static final String NAME = "COLLATION_KEY"; + + private Collator collator; + + public CollationKeyFunction() { + } + + 
public CollationKeyFunction(List<Expression> children) throws SQLException { + super(children); + initialize(); + } + + @Override + public void readFields(DataInput input) throws IOException { + super.readFields(input); + initialize(); + } + + @Override + public boolean evaluate(Tuple tuple, ImmutableBytesWritable ptr) { + try { + String inputValue = getInputString(tuple, ptr); + byte[] collationKeyByteArray = collator.getCollationKey(inputValue).toByteArray(); + + if (LOG.isDebugEnabled()) { + LOG.debug("Collation key bytes: " + VarBinaryFormatter.INSTANCE.format(collationKeyByteArray)); + } + + ptr.set(collationKeyByteArray); + return true; + } catch (ExpressionEvaluationException e) { + LOG.debug("ExpressionEvaluationException caught: " + e.getMessage()); + return false; + } + } + + private void initialize() { + String localeISOCode = getLiteralValue(1, String.class); + Boolean useSpecialUpperCaseCollator = getLiteralValue(2, Boolean.class); + Integer collatorStrength = getLiteralValue(3, Integer.class); + Integer collatorDecomposition = getLiteralValue(4, Integer.class); + + if (LOG.isDebugEnabled()) { + StringBuilder logInputsMessage = new StringBuilder(); + logInputsMessage.append("Input (literal) arguments:").append("localeISOCode: " + localeISOCode) + .append(", useSpecialUpperCaseCollator: " + useSpecialUpperCaseCollator) + .append(", collatorStrength: " + collatorStrength) + .append(", collatorDecomposition: " + collatorDecomposition); + LOG.debug(logInputsMessage); + } + + Locale locale = LocaleUtils.get().getLocaleByIsoCode(localeISOCode); + + if (LOG.isDebugEnabled()) { + LOG.debug(String.format("Locale: " + locale.toLanguageTag())); + } + + LinguisticSort linguisticSort = LinguisticSort.get(locale); + + collator = BooleanUtils.isTrue(useSpecialUpperCaseCollator) ? 
linguisticSort.getUpperCaseCollator(false) + : linguisticSort.getCollator(); + + if (collatorStrength != null) { + collator.setStrength(collatorStrength); + } + + if (collatorDecomposition != null) { + collator.setDecomposition(collatorDecomposition); + } + + if (LOG.isDebugEnabled()) { + LOG.debug(String.format("Collator: [strength: %d, decomposition: %d], Special-Upper-Case: %s", + collator.getStrength(), collator.getDecomposition(), + BooleanUtils.isTrue(useSpecialUpperCaseCollator))); + } + } --- End diff -- As of PHOENIX-4294 (just make sure you've pulled the latest), you can declare your function as not being thread safe by adding the following method override: @Override public boolean isThreadSafe() { return false; } > Allow sorting on (Java) collation keys for non-English locales > -------------------------------------------------------------- > > Key: PHOENIX-4237 > URL: https://issues.apache.org/jira/browse/PHOENIX-4237 > Project: Phoenix > Issue Type: Improvement > Reporter: Shehzaad Nakhoda > Fix For: 4.12.0 > > > Strings stored via Phoenix can be composed from a subset of the entire set of > Unicode characters. The natural sort order for strings for different > languages often differs from the order dictated by the binary representation > of the characters of these strings. Java provides the idea of a Collator > which given an input string and a (language) locale can generate a Collation > Key which can then be used to compare strings in that natural order. > Salesforce has recently open-sourced grammaticus. IBM has open-sourced ICU4J > some time ago. These technologies can be combined to provide a robust new > Phoenix function that can be used in an ORDER BY clause to sort strings > according to the user's locale. -- This message was sent by Atlassian JIRA (v6.4.14#64029)