[GitHub] spark pull request #15047: [SPARK-17495] [SQL] Add Hash capability semantica...

hvanhovell Tue, 27 Sep 2016 16:07:29 -0700

Github user hvanhovell commented on a diff in the pull request:

    https://github.com/apache/spark/pull/15047#discussion_r80814933
  
    --- Diff: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala
 ---
    @@ -559,3 +607,219 @@ case class CurrentDatabase() extends LeafExpression 
with Unevaluable {
       override def foldable: Boolean = true
       override def nullable: Boolean = false
     }
    +
    +/**
    + * Simulates Hive's hashing function at
    + * 
org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils#hashcode() 
in Hive
    + *
    + * We should use this hash function for both shuffle and bucket of Hive 
tables, so that
    + * we can guarantee shuffle and bucketing have same data distribution
    + *
    + * TODO: Support Decimal and date related types
    + */
    +@ExpressionDescription(
    +  usage = "_FUNC_(a1, a2, ...) - Returns a hash value of the arguments.")
    +case class HiveHash(children: Seq[Expression]) extends HashExpression[Int] 
{
    +  override val seed = 0
    +
    +  override def dataType: DataType = IntegerType
    +
    +  override def prettyName: String = "hive-hash"
    +
    +  override protected def hasherClassName: String = 
classOf[HiveHasher].getName
    +
    +  override protected def computeHash(value: Any, dataType: DataType, seed: 
Int): Int = {
    +    HiveHashFunction.hash(value, dataType, seed).toInt
    +  }
    +
    +  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    +    ev.isNull = "false"
    +    val childHash = ctx.freshName("childHash")
    +    val childrenHash = children.map { child =>
    +      val childGen = child.genCode(ctx)
    +      childGen.code + ctx.nullSafeExec(child.nullable, childGen.isNull) {
    +        computeHash(childGen.value, child.dataType, childHash, ctx)
    +      } + s"${ev.value} = (31 * ${ev.value}) + $childHash;"
    +    }.mkString(s"int $childHash = 0;", s"\n$childHash = 0;\n", "")
    +
    +    ev.copy(code = s"""
    +      ${ctx.javaType(dataType)} ${ev.value} = $seed;
    +      $childrenHash""")
    +  }
    +
    +  @tailrec
    +  private def computeHash(
    +      input: String,
    +      dataType: DataType,
    +      result: String,
    +      ctx: CodegenContext): String = {
    +    val hasher = hasherClassName
    +
    +    dataType match {
    +      case NullType => ""
    +      case BooleanType => genHashBoolean(input, hasher, result)
    +      case ByteType | ShortType | IntegerType | DateType => 
genHashInt(input, hasher, result)
    +      case LongType | TimestampType => genHashLong(input, hasher, result)
    +      case FloatType => genHashFloat(input, hasher, result)
    +      case DoubleType => genHashDouble(input, hasher, result)
    +      case d: DecimalType => genHashDecimal(ctx, d, input, hasher, result)
    +      case CalendarIntervalType => genHashCalendarInterval(input, hasher, 
result)
    +      case BinaryType => genHashBytes(input, hasher, result)
    +      case StringType => genHashString(input, hasher, result)
    +      case ArrayType(et, containsNull) => genHashForArray(ctx, input, 
result, et, containsNull)
    +      case MapType(kt, vt, valueContainsNull) =>
    +        genHashForMap(ctx, input, result, kt, vt, valueContainsNull)
    +      case StructType(fields) => genHashForStruct(ctx, input, result, 
fields)
    +      case udt: UserDefinedType[_] => computeHash(input, udt.sqlType, 
result, ctx)
    +    }
    +  }
    +
    +  override def eval(input: InternalRow): Int = {
    +    var hash = seed
    +    var i = 0
    +    val len = children.length
    +    while (i < len) {
    +      hash = (31 * hash) + computeHash(children(i).eval(input), 
children(i).dataType, hash)
    +      i += 1
    +    }
    +    hash
    +  }
    +
    +  override protected def genHashInt(i: String, hasher: String, result: 
String): String =
    +    s"$result = $hasher.hashInt($i, 0);"
    --- End diff --
    
    Why not do the multiplication by 31 inside the `genHash*` methods?



---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and would like it, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

[GitHub] spark pull request #15047: [SPARK-17495] [SQL] Add Hash capability semantica...

Reply via email to