[GitHub] spark pull request #16043: [SPARK-18601][SQL] Simplify Create/Get complex ex...

hvanhovell Fri, 27 Jan 2017 03:41:22 -0800

Github user hvanhovell commented on a diff in the pull request:

    https://github.com/apache/spark/pull/16043#discussion_r98186441
  
    --- Diff: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ComplexTypes.scala
 ---
    @@ -0,0 +1,128 @@
    +/*
    + * Licensed to the Apache Software Foundation (ASF) under one or more
    + * contributor license agreements.  See the NOTICE file distributed with
    + * this work for additional information regarding copyright ownership.
    + * The ASF licenses this file to You under the Apache License, Version 2.0
    + * (the "License"); you may not use this file except in compliance with
    + * the License.  You may obtain a copy of the License at
    + *
    + *    http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +package org.apache.spark.sql.catalyst.optimizer
    +
    +import org.apache.spark.sql.catalyst.expressions._
    +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
    +import org.apache.spark.sql.catalyst.rules.Rule
    +
    +/**
    +* push down operations into [[CreateNamedStructLike]].
    +*/
    +object SimplifyCreateStructOps extends Rule[LogicalPlan] {
    +  override def apply(plan: LogicalPlan): LogicalPlan = {
    +    plan.transformExpressionsUp {
    +      // push down field extraction
    +      case GetStructField(createNamedStructLike: CreateNamedStructLike, 
ordinal, _) =>
    +        createNamedStructLike.valExprs(ordinal)
    +    }
    +  }
    +}
    +
    +/**
    +* push down operations into [[CreateArray]].
    +*/
    +object SimplifyCreateArrayOps extends Rule[LogicalPlan] {
    +  override def apply(plan: LogicalPlan): LogicalPlan = {
    +    plan.transformExpressionsUp {
    +      // push down field selection (array of structs)
    +      case GetArrayStructFields(CreateArray(elems), field, ordinal, 
numFields, containsNull) =>
    +        CreateArray(elems.map(GetStructField(_, ordinal, 
Some(field.name))))
    +      // push down item selection.
    +      case ga @ GetArrayItem(CreateArray(elems), IntegerLiteral(idx)) =>
    +        if (idx >= 0 && idx < elems.size) {
    +          elems(idx)
    +        } else {
    +          Cast(Literal(null), ga.dataType)
    +        }
    +    }
    +  }
    +}
    +
    +/**
    +* push down operations into [[CreateMap]].
    +*/
    +object SimplifyCreateMapOps extends Rule[LogicalPlan] {
    +  object ComparisonResult extends Enumeration {
    +    val PositiveMatch = Value
    +    val NegativeMatch = Value
    +    val UnDetermined = Value
    +  }
    +
    +  def compareKeys(k1 : Expression, k2 : Expression) : 
ComparisonResult.Value = {
    +    (k1, k2) match {
    +      case (x, y) if x.semanticEquals(y) => ComparisonResult.PositiveMatch
    +      // make surethis is null safe, especially when datatypes differ
    +      // is this even possible?
    +      case (_ : Literal, _ : Literal) => ComparisonResult.NegativeMatch
    +      case _ => ComparisonResult.UnDetermined
    +    }
    +  }
    +
    +  case class ClassifiedEntries(
    +    undetermined : Seq[Expression],
    +    nullable : Boolean,
    +    firstPositive : Option[Expression]) {
    +    def normalize(k : Expression) : ClassifiedEntries = this match {
    +      /**
    +      * when we have undetermined matches that might bproduce a null value,
    +      * we can't separate a positive match and use [[Coalesce]] to choose 
the final result.
    +      * so we 'hide' the positive match as an undetermined match.
    +      */
    +      case ClassifiedEntries(u, true, Some(p)) if u.nonEmpty =>
    +        ClassifiedEntries(u ++ Seq(k, p), true, None)
    +      case _ => this
    +    }
    +  }
    +
    +  def classifyEntries(mapEntries : Seq[(Expression, Expression)],
    +                      requestedKey : Expression) : ClassifiedEntries = {
    +    val res1 = mapEntries.foldLeft(ClassifiedEntries(Seq.empty, nullable = 
false, None)) {
    +      case (prev @ ClassifiedEntries(_, _, Some(_)), _) => prev
    +      case (ClassifiedEntries(prev, nullable, None), (k, v)) =>
    +        compareKeys(k, requestedKey) match {
    +          case ComparisonResult.UnDetermined =>
    +            val vIsNullable = v.nullable
    +            val nextNullbale = nullable || vIsNullable
    +            ClassifiedEntries(prev ++ Seq(k, v), nullable = nextNullbale, 
None)
    +          case ComparisonResult.NegativeMatch => ClassifiedEntries(prev, 
nullable, None)
    +          case ComparisonResult.PositiveMatch => ClassifiedEntries(prev, 
nullable, Some(v))
    +        }
    +    }
    +    res1.normalize(requestedKey)
    +  }
    +
    +  override def apply(plan: LogicalPlan): LogicalPlan = {
    +    plan.transformExpressionsUp {
    +      // attempt to unfold 'constant' key extraction,
    +      // this enables other optimizations to take place.
    +      case gmv @ GetMapValue(cm @ CreateMap(elems), key) =>
    --- End diff --
    
    If I understand this correctly, this does the following. The rule scans the 
map for potential matches, and the following scenarios apply:
    
    1. No matches are found & no potential (undetermined) matches are found. We 
return a null value.
    2. An undetermined match is found first. We prune the map and add all 
undetermined and positive matches to the given map, and wrap this `CreateMap` 
with a `GetMapValue` expression.
    3. A positive match is found first. We return the positive match.
    
    So why not write the following (I have not compiled this):
    ```scala
    case gmv @ GetMapValue(CreateMap(elems), key) =>
      // Tag each key/value pair with a potential match result.
      val taggedKvs = elems.grouped(2).map {
        case kv @ (k, _) => compareKeys(k, key) -> kv
      }
      // Filter out negative results.
      val prunedTaggedKvs = kvs.filterNot(_._1 == 
ComparisonResult.NegativeMatch)
      prunedKvs.headOption match {
        case Some((ComparisonResult.PositiveMatch, (_, v)) => v
        case Some((ComparisonResult.Undetermined, _)) =>
          val prunedKvs = prunedTaggedKvs.flatmap {
            case (_, (k, v)) => Seq(k, v)
          }
          GetMapValue(CreateMap(prunedKvs.map(_._2)), key)
        case None => Literal.create(null, gmv.dataType)
      }
    ```
    That could save a lot of code.



---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

[GitHub] spark pull request #16043: [SPARK-18601][SQL] Simplify Create/Get complex ex...

Reply via email to