[GitHub] spark pull request: [SPARK-8638] [SQL] Window Function Performance...

yhuai Sat, 18 Jul 2015 23:49:13 -0700

Github user yhuai commented on a diff in the pull request:

    https://github.com/apache/spark/pull/7057#discussion_r34955556
  
    --- Diff: 
sql/core/src/main/scala/org/apache/spark/sql/execution/Window.scala ---
    @@ -38,443 +84,667 @@ case class Window(
         child: SparkPlan)
       extends UnaryNode {
     
    -  override def output: Seq[Attribute] =
    -    (projectList ++ windowExpression).map(_.toAttribute)
    +  override def output: Seq[Attribute] = projectList ++ 
windowExpression.map(_.toAttribute)
     
    -  override def requiredChildDistribution: Seq[Distribution] =
    +  override def requiredChildDistribution: Seq[Distribution] = {
         if (windowSpec.partitionSpec.isEmpty) {
    -      // This operator will be very expensive.
    +      // Only show warning when the number of bytes is larger than 100 MB?
    +      logWarning("No Partition Defined for Window operation! Moving all 
data to a single "
    +        + "partition, this can cause serious performance degradation.")
           AllTuples :: Nil
    -    } else {
    -      ClusteredDistribution(windowSpec.partitionSpec) :: Nil
    -    }
    -
    -  // Since window functions are adding columns to the input rows, the 
child's outputPartitioning
    -  // is preserved.
    -  override def outputPartitioning: Partitioning = child.outputPartitioning
    -
    -  override def requiredChildOrdering: Seq[Seq[SortOrder]] = {
    -    // The required child ordering has two parts.
    -    // The first part is the expressions in the partition specification.
    -    // We add these expressions to the required ordering to make sure 
input rows are grouped
    -    // based on the partition specification. So, we only need to process a 
single partition
    -    // at a time.
    -    // The second part is the expressions specified in the ORDER BY cluase.
    -    // Basically, we first use sort to group rows based on partition 
specifications and then sort
    -    // Rows in a group based on the order specification.
    -    (windowSpec.partitionSpec.map(SortOrder(_, Ascending)) ++ 
windowSpec.orderSpec) :: Nil
    +    } else ClusteredDistribution(windowSpec.partitionSpec) :: Nil
       }
     
    -  // Since window functions basically add columns to input rows, this 
operator
    -  // will not change the ordering of input rows.
    +  override def requiredChildOrdering: Seq[Seq[SortOrder]] =
    +    Seq(windowSpec.partitionSpec.map(SortOrder(_, Ascending)) ++ 
windowSpec.orderSpec)
    +
       override def outputOrdering: Seq[SortOrder] = child.outputOrdering
     
    -  case class ComputedWindow(
    -    unbound: WindowExpression,
    -    windowFunction: WindowFunction,
    -    resultAttribute: AttributeReference)
    -
    -  // A list of window functions that need to be computed for each group.
    -  private[this] val computedWindowExpressions = windowExpression.flatMap { 
window =>
    -    window.collect {
    -      case w: WindowExpression =>
    -        ComputedWindow(
    -          w,
    -          BindReferences.bindReference(w.windowFunction, child.output),
    -          AttributeReference(s"windowResult:$w", w.dataType, w.nullable)())
    +  /**
    +   * Create a bound ordering object for a given frame type and offset. A 
bound ordering object is
    +   * used to determine which input row lies within the frame boundaries of 
an output row.
    +   *
    +   * This method uses Code Generation. It can only be used on the executor 
side.
    +   *
    +   * @param frameType to evaluate. This can either be Row or Range based.
    +   * @param offset with respect to the row.
    +   * @return a bound ordering object.
    +   */
    +  private[this] def createBoundOrdering(frameType: FrameType, offset: 
Int): BoundOrdering = {
    +    frameType match {
    +      case RangeFrame =>
    +        val (exprs, current, bound) = if (offset == 0) {
    +          // Use the entire order expression when the offset is 0.
    +          val exprs = windowSpec.orderSpec.map(_.child)
    +          val projection = newMutableProjection(exprs, child.output)
    +          (windowSpec.orderSpec, projection(), projection())
    +        }
    +        else if (windowSpec.orderSpec.size == 1) {
    +          // Use only the first order expression when the offset is 
non-null.
    +          val sortExpr = windowSpec.orderSpec.head
    +          val expr = sortExpr.child
    +          // Create the projection which returns the current 'value'.
    +          val current = newMutableProjection(expr :: Nil, child.output)()
    +          // Flip the sign of the offset when processing the order is 
descending
    +          val boundOffset = if (sortExpr.direction == Descending) -offset
    +          else offset
    --- End diff --
    
    ```
    val boundOffset =
      if (sortExpr.direction == Descending)
        -offset
      else
        offset
    ```



---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and would like it to be, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

[GitHub] spark pull request: [SPARK-8638] [SQL] Window Function Performance...

Reply via email to