[ https://issues.apache.org/jira/browse/DRILL-4119?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15022713#comment-15022713 ]
ASF GitHub Bot commented on DRILL-4119: --------------------------------------- Github user mehant commented on a diff in the pull request: https://github.com/apache/drill/pull/279#discussion_r45643009 --- Diff: exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/Hash32AsDouble.java --- @@ -0,0 +1,340 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.drill.exec.expr.fn.impl; + +import org.apache.drill.exec.expr.DrillSimpleFunc; +import org.apache.drill.exec.expr.annotations.FunctionTemplate; +import org.apache.drill.exec.expr.annotations.Output; +import org.apache.drill.exec.expr.annotations.Param; +import org.apache.drill.exec.expr.annotations.FunctionTemplate.FunctionScope; +import org.apache.drill.exec.expr.holders.BigIntHolder; +import org.apache.drill.exec.expr.holders.Decimal18Holder; +import org.apache.drill.exec.expr.holders.Decimal28SparseHolder; +import org.apache.drill.exec.expr.holders.Decimal38SparseHolder; +import org.apache.drill.exec.expr.holders.Decimal9Holder; +import org.apache.drill.exec.expr.holders.Float4Holder; +import org.apache.drill.exec.expr.holders.Float8Holder; +import org.apache.drill.exec.expr.holders.IntHolder; +import org.apache.drill.exec.expr.holders.NullableBigIntHolder; +import org.apache.drill.exec.expr.holders.NullableDecimal18Holder; +import org.apache.drill.exec.expr.holders.NullableDecimal28SparseHolder; +import org.apache.drill.exec.expr.holders.NullableDecimal38SparseHolder; +import org.apache.drill.exec.expr.holders.NullableDecimal9Holder; +import org.apache.drill.exec.expr.holders.NullableFloat4Holder; +import org.apache.drill.exec.expr.holders.NullableFloat8Holder; +import org.apache.drill.exec.expr.holders.NullableIntHolder; + +/* + * Class contains hash function definitions for different data types. + * + * NOTE: These functions are used internally by Drill to perform hash distribution and in hash join. For + * numeric data types we would like to apply implicit casts in the join method however for this to work + * as expected we would need to hash the same value represented in different data types (int, bigint, float etc) + * to hash to the same node, this is why we cast all numeric values to double before performing the actual hash. 
+ */ +public class Hash32AsDouble { + @FunctionTemplate(name = "hash32AsDouble", scope = FunctionScope.SIMPLE, nulls = FunctionTemplate.NullHandling.INTERNAL) + public static class NullableFloatHash implements DrillSimpleFunc { + + @Param + NullableFloat4Holder in; + @Output + IntHolder out; + + public void setup() { + } + + public void eval() { + if (in.isSet == 0) { + out.value = 0; + } else { + out.value = org.apache.drill.exec.expr.fn.impl.XXHash.hash32((double) in.value, 0); + } + } + } + + @FunctionTemplate(name = "hash32AsDouble", scope = FunctionScope.SIMPLE, nulls = FunctionTemplate.NullHandling.INTERNAL) + public static class FloatHash implements DrillSimpleFunc { + + @Param + Float4Holder in; + @Output + IntHolder out; + + public void setup() { + } + + public void eval() { + out.value = org.apache.drill.exec.expr.fn.impl.XXHash.hash32((double) in.value, 0); + } + } + + @FunctionTemplate(name = "hash32AsDouble", scope = FunctionScope.SIMPLE, nulls = FunctionTemplate.NullHandling.INTERNAL) + public static class NullableDoubleHash implements DrillSimpleFunc { + + @Param + NullableFloat8Holder in; + @Output + IntHolder out; + + public void setup() { + } + + public void eval() { + if (in.isSet == 0) { + out.value = 0; + } else { + out.value = org.apache.drill.exec.expr.fn.impl.XXHash.hash32((double) in.value, 0); + } + } + } + + @FunctionTemplate(name = "hash32AsDouble", scope = FunctionScope.SIMPLE, nulls = FunctionTemplate.NullHandling.INTERNAL) + public static class DoubleHash implements DrillSimpleFunc { + + @Param + Float8Holder in; + @Output + IntHolder out; + + public void setup() { + } + + public void eval() { + out.value = org.apache.drill.exec.expr.fn.impl.XXHash.hash32(in.value, 0); + } + } + + @FunctionTemplate(name = "hash32AsDouble", scope = FunctionScope.SIMPLE, nulls = FunctionTemplate.NullHandling.INTERNAL) + public static class NullableBigIntHash implements DrillSimpleFunc { + + @Param + NullableBigIntHolder in; + @Output + IntHolder out; + 
+ public void setup() { + } + + public void eval() { + if (in.isSet == 0) { + out.value = 0; + } else { + out.value = org.apache.drill.exec.expr.fn.impl.XXHash.hash32((double) in.value, 0); + } + } + } + + @FunctionTemplate(name = "hash32AsDouble", scope = FunctionScope.SIMPLE, nulls = FunctionTemplate.NullHandling.INTERNAL) + public static class NullableIntHash implements DrillSimpleFunc { + @Param + NullableIntHolder in; + @Output + IntHolder out; + + public void setup() { + } + + public void eval() { + if (in.isSet == 0) { + out.value = 0; + } else { + out.value = org.apache.drill.exec.expr.fn.impl.XXHash.hash32((double) in.value, 0); + } + } + } + + @FunctionTemplate(name = "hash32AsDouble", scope = FunctionScope.SIMPLE, nulls = FunctionTemplate.NullHandling.INTERNAL) + public static class HashBigInt implements DrillSimpleFunc { --- End diff -- Can you rename this to BigIntHash, to be consistent with the rest of the classes in this file? > Skew in hash distribution for varchar (and possibly other) types of data > ------------------------------------------------------------------------ > > Key: DRILL-4119 > URL: https://issues.apache.org/jira/browse/DRILL-4119 > Project: Apache Drill > Issue Type: Bug > Components: Functions - Drill > Affects Versions: 1.3.0 > Reporter: Aman Sinha > Assignee: Mehant Baid > Fix For: 1.4.0 > > > We are seeing substantial skew for an Id column that contains varchar data of > length 32. It is easily reproducible by a group-by query: > {noformat} > Explain plan for SELECT SomeId From table GROUP BY SomeId; > ... 
> 01-02 HashAgg(group=[{0}]) > 01-03 Project(SomeId=[$0]) > 01-04 HashToRandomExchange(dist0=[[$0]]) > 02-01 UnorderedMuxExchange > 03-01 Project(SomeId=[$0], > E_X_P_R_H_A_S_H_F_I_E_L_D=[castInt(hash64AsDouble($0))]) > 03-02 HashAgg(group=[{0}]) > 03-03 Project(SomeId=[$0]) > {noformat} > The string id happens to be of the following type: > {noformat} > e4b4388e8865819126cb0e4dcaa7261d > {noformat} -- This message was sent by Atlassian JIRA (v6.3.4#6332)