[ https://issues.apache.org/jira/browse/PHOENIX-4237?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16208271#comment-16208271 ]
ASF GitHub Bot commented on PHOENIX-4237: ----------------------------------------- Github user snakhoda-sfdc commented on a diff in the pull request: https://github.com/apache/phoenix/pull/275#discussion_r145245002 --- Diff: phoenix-core/src/main/java/org/apache/phoenix/expression/function/CollationKeyFunction.java --- @@ -0,0 +1,221 @@ +package org.apache.phoenix.expression.function; + +import java.sql.SQLException; +import java.text.Collator; +import java.util.Arrays; +import java.util.List; +import java.util.Locale; + +import org.apache.commons.lang.BooleanUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hbase.io.ImmutableBytesWritable; +import org.apache.phoenix.expression.Expression; +import org.apache.phoenix.parse.FunctionParseNode; +import org.apache.phoenix.schema.tuple.Tuple; +import org.apache.phoenix.schema.types.PBoolean; +import org.apache.phoenix.schema.types.PDataType; +import org.apache.phoenix.schema.types.PInteger; +import org.apache.phoenix.schema.types.PIntegerArray; +import org.apache.phoenix.schema.types.PUnsignedIntArray; +import org.apache.phoenix.schema.types.PVarbinary; +import org.apache.phoenix.schema.types.PVarchar; +import org.apache.phoenix.schema.types.PhoenixArray; +import org.apache.phoenix.util.VarBinaryFormatter; + +import com.force.db.i18n.LinguisticSort; +import com.force.i18n.LocaleUtils; + +import com.ibm.icu.impl.jdkadapter.CollatorICU; +import com.ibm.icu.util.ULocale; + +/** + * A Phoenix Function that calculates a collation key for an input string based + * on a caller-provided locale and collator strength and decomposition settings. + * + * It uses the open-source grammaticus and i18n packages to obtain the collators + * it needs. + * + * @author snakhoda + * + */ +@FunctionParseNode.BuiltInFunction(name = CollationKeyFunction.NAME, args = { + // input string + @FunctionParseNode.Argument(allowedTypes = { PVarchar.class }), + // ISO Code for Locale + @FunctionParseNode.Argument(allowedTypes = { PVarchar.class }, isConstant = true), + // whether to use special upper case collator + @FunctionParseNode.Argument(allowedTypes = { PBoolean.class }, defaultValue = "false", isConstant = true), + // collator strength + @FunctionParseNode.Argument(allowedTypes = { PInteger.class }, defaultValue = "null", isConstant = true), + // collator decomposition + @FunctionParseNode.Argument(allowedTypes = { PInteger.class }, defaultValue = "null", isConstant = true) }) +public class CollationKeyFunction extends ScalarFunction { + + private static final Log LOG = LogFactory.getLog(CollationKeyFunction.class); + + public static final String NAME = "COLLKEY"; + + public CollationKeyFunction() { + } + + public CollationKeyFunction(List<Expression> children) throws SQLException { + super(children); + } + + @Override + public boolean evaluate(Tuple tuple, ImmutableBytesWritable ptr) { + try { + String inputValue = getInputValue(tuple, ptr); --- End diff -- @JamesRTaylor Won't that require that the collator be thread-safe? Or will the CollationKeyFunction not be shared across threads? (Maybe the tweak you were mentioning is for this purpose?) > Allow sorting on (Java) collation keys for non-English locales > -------------------------------------------------------------- > > Key: PHOENIX-4237 > URL: https://issues.apache.org/jira/browse/PHOENIX-4237 > Project: Phoenix > Issue Type: Improvement > Reporter: Shehzaad Nakhoda > Fix For: 4.12.0 > > > Strings stored via Phoenix can be composed from a subset of the entire set of > Unicode characters. The natural sort order for strings for different > languages often differs from the order dictated by the binary representation > of the characters of these strings. Java provides the idea of a Collator > which given an input string and a (language) locale can generate a Collation > Key which can then be used to compare strings in that natural order. > Salesforce has recently open-sourced grammaticus. IBM has open-sourced ICU4J > some time ago. These technologies can be combined to provide a robust new > Phoenix function that can be used in an ORDER BY clause to sort strings > according to the user's locale. -- This message was sent by Atlassian JIRA (v6.4.14#64029)