amaliujia commented on code in PR #40070:
URL: https://github.com/apache/spark/pull/40070#discussion_r1110359583


##########
connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala:
##########
@@ -109,44 +109,131 @@ class RelationalGroupedDataset protected[sql] (
     agg(exprs.asScala.toMap)
   }
 
-  private[this] def strToExpr(expr: String, inputExpr: proto.Expression): proto.Expression = {
+  private[this] def strToExpr(expr: String, columnName: String): proto.Expression = {
     val builder = proto.Expression.newBuilder()
 
     expr.toLowerCase(Locale.ROOT) match {
      // We special handle a few cases that have alias that are not in function registry.
       case "avg" | "average" | "mean" =>
-        builder.getUnresolvedFunctionBuilder
-          .setFunctionName("avg")
-          .addArguments(inputExpr)
-          .setIsDistinct(false)
+        functions.avg(columnName)
       case "stddev" | "std" =>
-        builder.getUnresolvedFunctionBuilder
-          .setFunctionName("stddev")
-          .addArguments(inputExpr)
-          .setIsDistinct(false)
+        functions.stddev(columnName)
       // Also special handle count because we need to take care count(*).
       case "count" | "size" =>
-        // Turn count(*) into count(1)
-        inputExpr match {
-          case s if s.hasUnresolvedStar =>
-            val exprBuilder = proto.Expression.newBuilder
-            exprBuilder.getLiteralBuilder.setInteger(1)
-            builder.getUnresolvedFunctionBuilder
-              .setFunctionName("count")
-              .addArguments(exprBuilder)
-              .setIsDistinct(false)
-          case _ =>
-            builder.getUnresolvedFunctionBuilder
-              .setFunctionName("count")
-              .addArguments(inputExpr)
-              .setIsDistinct(false)
-        }
+        functions.col(columnName)
       case name =>
         builder.getUnresolvedFunctionBuilder
           .setFunctionName(name)
-          .addArguments(inputExpr)
+          .addArguments(df(columnName).expr)
           .setIsDistinct(false)
     }
     builder.build()
   }
+
+  /**
+   * Compute aggregates by specifying a series of aggregate columns. Note that this function by
+   * default retains the grouping columns in its output. To not retain grouping columns, set
+   * `spark.sql.retainGroupColumns` to false.
+   *
+   * The available aggregate methods are defined in [[org.apache.spark.sql.functions]].
+   *
+   * {{{
+   *   // Selects the age of the oldest employee and the aggregate expense for each department
+   *
+   *   // Scala:
+   *   import org.apache.spark.sql.functions._
+   *   df.groupBy("department").agg(max("age"), sum("expense"))
+   *
+   *   // Java:
+   *   import static org.apache.spark.sql.functions.*;
+   *   df.groupBy("department").agg(max("age"), sum("expense"));
+   * }}}
+   *
+   * Note that before Spark 1.4, the default behavior is to NOT retain grouping columns. To change
+   * to that behavior, set config variable `spark.sql.retainGroupColumns` to `false`.
+   * {{{
+   *   // Scala, 1.3.x:
+   *   df.groupBy("department").agg($"department", max("age"), sum("expense"))
+   *
+   *   // Java, 1.3.x:
+   *   df.groupBy("department").agg(col("department"), max("age"), 
sum("expense"));
+   * }}}
+   *
+   * @since 3.4.0
+   */
+  @scala.annotation.varargs
+  def agg(expr: Column, exprs: Column*): DataFrame = {
+    toDF((expr +: exprs).map { case c =>
+      c.expr
+    // TODO: deal with typed columns.
+    })
+  }
+
+  /**
+   * Count the number of rows for each group. The resulting `DataFrame` will also contain the
+   * grouping columns.
+   *
+   * @since 3.4.0
+   */
+  def count(): DataFrame = toDF(Seq(functions.count(functions.lit(1)).alias("count").expr))

Review Comment:
   Wait, I think we should do it the reverse way, right? So Dataset.count = groupBy().count().collect()?
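
   For illustration, here is a minimal sketch of the equivalence this comment points at, written against the plain Spark SQL API rather than the Connect client touched by this diff, with made-up example data: an ungrouped groupBy().count() collapses to a single row whose value matches Dataset.count(), so Dataset.count could be layered on top of the grouped count.

   ```scala
   import org.apache.spark.sql.SparkSession

   object GroupedCountSketch {
     def main(args: Array[String]): Unit = {
       val spark = SparkSession.builder().master("local[*]").appName("sketch").getOrCreate()
       import spark.implicits._

       // Hypothetical example data, only to show the shape of the result.
       val df = Seq(("sales", 30), ("sales", 45), ("hr", 50)).toDF("department", "age")

       // groupBy() with no grouping columns aggregates over the whole DataFrame,
       // so count() yields a single row carrying the total row count.
       val viaGroupBy = df.groupBy().count().collect().head.getLong(0)

       assert(viaGroupBy == df.count()) // both report 3

       spark.stop()
     }
   }
   ```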


