Github user felixcheung commented on a diff in the pull request:

    https://github.com/apache/spark/pull/19816#discussion_r153021622

    --- Diff: R/pkg/tests/fulltests/test_sparkSQL.R ---
    @@ -3078,101 +3091,117 @@ test_that("coalesce, repartition, numPartitions", {
     })

     test_that("gapply() and gapplyCollect() on a DataFrame", {
    -  df <- createDataFrame(
    -    list(list(1L, 1, "1", 0.1), list(1L, 2, "1", 0.2), list(3L, 3, "3", 0.3)),
    -    c("a", "b", "c", "d"))
    -  expected <- collect(df)
    -  df1 <- gapply(df, "a", function(key, x) { x }, schema(df))
    -  actual <- collect(df1)
    -  expect_identical(actual, expected)
    -
    -  df1Collect <- gapplyCollect(df, list("a"), function(key, x) { x })
    -  expect_identical(df1Collect, expected)
    -
    -  # gapply on empty grouping columns.
    -  df1 <- gapply(df, c(), function(key, x) { x }, schema(df))
    -  actual <- collect(df1)
    -  expect_identical(actual, expected)
    -
    -  # Computes the sum of second column by grouping on the first and third columns
    -  # and checks if the sum is larger than 2
    -  schemas <- list(structType(structField("a", "integer"), structField("e", "boolean")),
    -                  "a INT, e BOOLEAN")
    -  for (schema in schemas) {
    -    df2 <- gapply(
    +  # The tasks here launch R workers with shuffles. So, we decrease the number of shuffle
    --- End diff --

    yes, sounds like we should
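For context, the idea the comment refers to is lowering spark.sql.shuffle.partitions around the gapply() calls so the test launches far fewer R workers, then restoring the original value afterwards. A minimal sketch of that pattern follows; it assumes the test has the `sparkSession` object available (as test_sparkSQL.R does) and uses SparkR's internal callJMethod helper to toggle the runtime conf, which may differ from the exact change in this PR.

# Sketch: temporarily reduce shuffle partitions so gapply() spawns fewer tasks/R workers.
conf <- callJMethod(sparkSession, "conf")
shufflePartitionsValue <- callJMethod(conf, "get", "spark.sql.shuffle.partitions")
callJMethod(conf, "set", "spark.sql.shuffle.partitions", "1")

tryCatch({
  df <- createDataFrame(
    list(list(1L, 1, "1", 0.1), list(1L, 2, "1", 0.2), list(3L, 3, "3", 0.3)),
    c("a", "b", "c", "d"))
  expected <- collect(df)

  # gapply() groups by column "a" and applies the identity function per group.
  df1 <- gapply(df, "a", function(key, x) { x }, schema(df))
  expect_identical(collect(df1), expected)
},
finally = {
  # Restore the original value so later tests are unaffected.
  callJMethod(conf, "set", "spark.sql.shuffle.partitions", shufflePartitionsValue)
})

Wrapping the assertions in tryCatch/finally keeps the conf change local to this test even if an expectation fails partway through.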