Repository: spark Updated Branches: refs/heads/master d37ea3c09 -> c34b546d6
[SPARK-16519][SPARKR] Handle SparkR RDD generics that create warnings in R CMD check ## What changes were proposed in this pull request? Rename RDD functions for now to avoid CRAN check warnings. Some RDD functions are sharing generics with DataFrame functions (hence the problem) so after the renames we need to add new generics, for now. ## How was this patch tested? unit tests Author: Felix Cheung <felixcheun...@hotmail.com> Closes #14626 from felixcheung/rrddfunctions. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c34b546d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c34b546d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c34b546d Branch: refs/heads/master Commit: c34b546d674ce186f13d9999b97977bc281cfedf Parents: d37ea3c Author: Felix Cheung <felixcheun...@hotmail.com> Authored: Tue Aug 16 11:19:18 2016 -0700 Committer: Shivaram Venkataraman <shiva...@cs.berkeley.edu> Committed: Tue Aug 16 11:19:18 2016 -0700 ---------------------------------------------------------------------- R/pkg/R/RDD.R | 100 +++++------ R/pkg/R/SQLContext.R | 2 +- R/pkg/R/context.R | 2 +- R/pkg/R/generics.R | 91 ++++++---- R/pkg/R/pairRDD.R | 40 ++--- R/pkg/inst/tests/testthat/test_binaryFile.R | 8 +- .../inst/tests/testthat/test_binary_function.R | 18 +- R/pkg/inst/tests/testthat/test_broadcast.R | 4 +- R/pkg/inst/tests/testthat/test_context.R | 6 +- R/pkg/inst/tests/testthat/test_includePackage.R | 4 +- .../tests/testthat/test_parallelize_collect.R | 26 +-- R/pkg/inst/tests/testthat/test_rdd.R | 172 +++++++++---------- R/pkg/inst/tests/testthat/test_shuffle.R | 34 ++-- R/pkg/inst/tests/testthat/test_sparkSQL.R | 28 +-- R/pkg/inst/tests/testthat/test_take.R | 32 ++-- R/pkg/inst/tests/testthat/test_textFile.R | 26 +-- R/pkg/inst/tests/testthat/test_utils.R | 6 +- 17 files changed, 312 insertions(+), 287 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/c34b546d/R/pkg/R/RDD.R ---------------------------------------------------------------------- diff --git a/R/pkg/R/RDD.R b/R/pkg/R/RDD.R index 72a8052..6b254bb 100644 --- a/R/pkg/R/RDD.R +++ b/R/pkg/R/RDD.R @@ -67,7 +67,7 @@ setMethod("initialize", "RDD", function(.Object, jrdd, serializedMode, .Object }) -setMethod("show", "RDD", +setMethod("showRDD", "RDD", function(object) { cat(paste(callJMethod(getJRDD(object), "toString"), "\n", sep = "")) }) @@ -215,7 +215,7 @@ setValidity("RDD", #' @rdname cache-methods #' @aliases cache,RDD-method #' @noRd -setMethod("cache", +setMethod("cacheRDD", signature(x = "RDD"), function(x) { callJMethod(getJRDD(x), "cache") @@ -235,12 +235,12 @@ setMethod("cache", #'\dontrun{ #' sc <- sparkR.init() #' rdd <- parallelize(sc, 1:10, 2L) -#' persist(rdd, "MEMORY_AND_DISK") +#' persistRDD(rdd, "MEMORY_AND_DISK") #'} #' @rdname persist #' @aliases persist,RDD-method #' @noRd -setMethod("persist", +setMethod("persistRDD", signature(x = "RDD", newLevel = "character"), function(x, newLevel = "MEMORY_ONLY") { callJMethod(getJRDD(x), "persist", getStorageLevel(newLevel)) @@ -259,12 +259,12 @@ setMethod("persist", #' sc <- sparkR.init() #' rdd <- parallelize(sc, 1:10, 2L) #' cache(rdd) # rdd@@env$isCached == TRUE -#' unpersist(rdd) # rdd@@env$isCached == FALSE +#' unpersistRDD(rdd) # rdd@@env$isCached == FALSE #'} #' @rdname unpersist-methods #' @aliases unpersist,RDD-method #' @noRd -setMethod("unpersist", +setMethod("unpersistRDD", signature(x = "RDD"), function(x) { callJMethod(getJRDD(x), "unpersist") @@ -345,13 +345,13 @@ setMethod("numPartitions", #'\dontrun{ #' sc <- sparkR.init() #' rdd <- parallelize(sc, 1:10, 2L) -#' collect(rdd) # list from 1 to 10 +#' collectRDD(rdd) # list from 1 to 10 #' collectPartition(rdd, 0L) # list from 1 to 5 #'} #' @rdname collect-methods #' @aliases collect,RDD-method #' @noRd -setMethod("collect", +setMethod("collectRDD", signature(x = "RDD"), function(x, flatten = TRUE) { # Assumes a pairwise RDD is backed by a JavaPairRDD. @@ -397,7 +397,7 @@ setMethod("collectPartition", setMethod("collectAsMap", signature(x = "RDD"), function(x) { - pairList <- collect(x) + pairList <- collectRDD(x) map <- new.env() lapply(pairList, function(i) { assign(as.character(i[[1]]), i[[2]], envir = map) }) as.list(map) @@ -411,30 +411,30 @@ setMethod("collectAsMap", #'\dontrun{ #' sc <- sparkR.init() #' rdd <- parallelize(sc, 1:10) -#' count(rdd) # 10 +#' countRDD(rdd) # 10 #' length(rdd) # Same as count #'} #' @rdname count #' @aliases count,RDD-method #' @noRd -setMethod("count", +setMethod("countRDD", signature(x = "RDD"), function(x) { countPartition <- function(part) { as.integer(length(part)) } valsRDD <- lapplyPartition(x, countPartition) - vals <- collect(valsRDD) + vals <- collectRDD(valsRDD) sum(as.integer(vals)) }) #' Return the number of elements in the RDD #' @rdname count #' @noRd -setMethod("length", +setMethod("lengthRDD", signature(x = "RDD"), function(x) { - count(x) + countRDD(x) }) #' Return the count of each unique value in this RDD as a list of @@ -460,7 +460,7 @@ setMethod("countByValue", signature(x = "RDD"), function(x) { ones <- lapply(x, function(item) { list(item, 1L) }) - collect(reduceByKey(ones, `+`, getNumPartitions(x))) + collectRDD(reduceByKey(ones, `+`, getNumPartitions(x))) }) #' Apply a function to all elements @@ -479,7 +479,7 @@ setMethod("countByValue", #' sc <- sparkR.init() #' rdd <- parallelize(sc, 1:10) #' multiplyByTwo <- lapply(rdd, function(x) { x * 2 }) -#' collect(multiplyByTwo) # 2,4,6... +#' collectRDD(multiplyByTwo) # 2,4,6... #'} setMethod("lapply", signature(X = "RDD", FUN = "function"), @@ -512,7 +512,7 @@ setMethod("map", #' sc <- sparkR.init() #' rdd <- parallelize(sc, 1:10) #' multiplyByTwo <- flatMap(rdd, function(x) { list(x*2, x*10) }) -#' collect(multiplyByTwo) # 2,20,4,40,6,60... +#' collectRDD(multiplyByTwo) # 2,20,4,40,6,60... #'} #' @rdname flatMap #' @aliases flatMap,RDD,function-method @@ -541,7 +541,7 @@ setMethod("flatMap", #' sc <- sparkR.init() #' rdd <- parallelize(sc, 1:10) #' partitionSum <- lapplyPartition(rdd, function(part) { Reduce("+", part) }) -#' collect(partitionSum) # 15, 40 +#' collectRDD(partitionSum) # 15, 40 #'} #' @rdname lapplyPartition #' @aliases lapplyPartition,RDD,function-method @@ -576,7 +576,7 @@ setMethod("mapPartitions", #' rdd <- parallelize(sc, 1:10, 5L) #' prod <- lapplyPartitionsWithIndex(rdd, function(partIndex, part) { #' partIndex * Reduce("+", part) }) -#' collect(prod, flatten = FALSE) # 0, 7, 22, 45, 76 +#' collectRDD(prod, flatten = FALSE) # 0, 7, 22, 45, 76 #'} #' @rdname lapplyPartitionsWithIndex #' @aliases lapplyPartitionsWithIndex,RDD,function-method @@ -607,7 +607,7 @@ setMethod("mapPartitionsWithIndex", #'\dontrun{ #' sc <- sparkR.init() #' rdd <- parallelize(sc, 1:10) -#' unlist(collect(filterRDD(rdd, function (x) { x < 3 }))) # c(1, 2) +#' unlist(collectRDD(filterRDD(rdd, function (x) { x < 3 }))) # c(1, 2) #'} # nolint end #' @rdname filterRDD @@ -656,7 +656,7 @@ setMethod("reduce", Reduce(func, part) } - partitionList <- collect(lapplyPartition(x, reducePartition), + partitionList <- collectRDD(lapplyPartition(x, reducePartition), flatten = FALSE) Reduce(func, partitionList) }) @@ -736,7 +736,7 @@ setMethod("foreach", lapply(x, func) NULL } - invisible(collect(mapPartitions(x, partition.func))) + invisible(collectRDD(mapPartitions(x, partition.func))) }) #' Applies a function to each partition in an RDD, and forces evaluation. @@ -753,7 +753,7 @@ setMethod("foreach", setMethod("foreachPartition", signature(x = "RDD", func = "function"), function(x, func) { - invisible(collect(mapPartitions(x, func))) + invisible(collectRDD(mapPartitions(x, func))) }) #' Take elements from an RDD. @@ -768,13 +768,13 @@ setMethod("foreachPartition", #'\dontrun{ #' sc <- sparkR.init() #' rdd <- parallelize(sc, 1:10) -#' take(rdd, 2L) # list(1, 2) +#' takeRDD(rdd, 2L) # list(1, 2) #'} # nolint end #' @rdname take #' @aliases take,RDD,numeric-method #' @noRd -setMethod("take", +setMethod("takeRDD", signature(x = "RDD", num = "numeric"), function(x, num) { resList <- list() @@ -817,13 +817,13 @@ setMethod("take", #'\dontrun{ #' sc <- sparkR.init() #' rdd <- parallelize(sc, 1:10) -#' first(rdd) +#' firstRDD(rdd) #' } #' @noRd -setMethod("first", +setMethod("firstRDD", signature(x = "RDD"), function(x) { - take(x, 1)[[1]] + takeRDD(x, 1)[[1]] }) #' Removes the duplicates from RDD. @@ -838,13 +838,13 @@ setMethod("first", #'\dontrun{ #' sc <- sparkR.init() #' rdd <- parallelize(sc, c(1,2,2,3,3,3)) -#' sort(unlist(collect(distinct(rdd)))) # c(1, 2, 3) +#' sort(unlist(collectRDD(distinctRDD(rdd)))) # c(1, 2, 3) #'} # nolint end #' @rdname distinct #' @aliases distinct,RDD-method #' @noRd -setMethod("distinct", +setMethod("distinctRDD", signature(x = "RDD"), function(x, numPartitions = SparkR:::getNumPartitions(x)) { identical.mapped <- lapply(x, function(x) { list(x, NULL) }) @@ -868,8 +868,8 @@ setMethod("distinct", #'\dontrun{ #' sc <- sparkR.init() #' rdd <- parallelize(sc, 1:10) -#' collect(sampleRDD(rdd, FALSE, 0.5, 1618L)) # ~5 distinct elements -#' collect(sampleRDD(rdd, TRUE, 0.5, 9L)) # ~5 elements possibly with duplicates +#' collectRDD(sampleRDD(rdd, FALSE, 0.5, 1618L)) # ~5 distinct elements +#' collectRDD(sampleRDD(rdd, TRUE, 0.5, 9L)) # ~5 elements possibly with duplicates #'} #' @rdname sampleRDD #' @aliases sampleRDD,RDD @@ -942,7 +942,7 @@ setMethod("takeSample", signature(x = "RDD", withReplacement = "logical", fraction <- 0.0 total <- 0 multiplier <- 3.0 - initialCount <- count(x) + initialCount <- countRDD(x) maxSelected <- 0 MAXINT <- .Machine$integer.max @@ -964,7 +964,7 @@ setMethod("takeSample", signature(x = "RDD", withReplacement = "logical", } set.seed(seed) - samples <- collect(sampleRDD(x, withReplacement, fraction, + samples <- collectRDD(sampleRDD(x, withReplacement, fraction, as.integer(ceiling(runif(1, -MAXINT, MAXINT))))) @@ -972,7 +972,7 @@ setMethod("takeSample", signature(x = "RDD", withReplacement = "logical", # take samples; this shouldn't happen often because we use a big # multiplier for thei initial size while (length(samples) < total) - samples <- collect(sampleRDD(x, withReplacement, fraction, + samples <- collectRDD(sampleRDD(x, withReplacement, fraction, as.integer(ceiling(runif(1, -MAXINT, MAXINT))))) @@ -990,7 +990,7 @@ setMethod("takeSample", signature(x = "RDD", withReplacement = "logical", #'\dontrun{ #' sc <- sparkR.init() #' rdd <- parallelize(sc, list(1, 2, 3)) -#' collect(keyBy(rdd, function(x) { x*x })) # list(list(1, 1), list(4, 2), list(9, 3)) +#' collectRDD(keyBy(rdd, function(x) { x*x })) # list(list(1, 1), list(4, 2), list(9, 3)) #'} # nolint end #' @rdname keyBy @@ -1019,12 +1019,12 @@ setMethod("keyBy", #' sc <- sparkR.init() #' rdd <- parallelize(sc, list(1, 2, 3, 4, 5, 6, 7), 4L) #' getNumPartitions(rdd) # 4 -#' getNumPartitions(repartition(rdd, 2L)) # 2 +#' getNumPartitions(repartitionRDD(rdd, 2L)) # 2 #'} #' @rdname repartition #' @aliases repartition,RDD #' @noRd -setMethod("repartition", +setMethod("repartitionRDD", signature(x = "RDD"), function(x, numPartitions) { if (!is.null(numPartitions) && is.numeric(numPartitions)) { @@ -1064,7 +1064,7 @@ setMethod("coalesce", }) } shuffled <- lapplyPartitionsWithIndex(x, func) - repartitioned <- partitionBy(shuffled, numPartitions) + repartitioned <- partitionByRDD(shuffled, numPartitions) values(repartitioned) } else { jrdd <- callJMethod(getJRDD(x), "coalesce", numPartitions, shuffle) @@ -1135,7 +1135,7 @@ setMethod("saveAsTextFile", #'\dontrun{ #' sc <- sparkR.init() #' rdd <- parallelize(sc, list(3, 2, 1)) -#' collect(sortBy(rdd, function(x) { x })) # list (1, 2, 3) +#' collectRDD(sortBy(rdd, function(x) { x })) # list (1, 2, 3) #'} # nolint end #' @rdname sortBy @@ -1304,7 +1304,7 @@ setMethod("aggregateRDD", Reduce(seqOp, part, zeroValue) } - partitionList <- collect(lapplyPartition(x, partitionFunc), + partitionList <- collectRDD(lapplyPartition(x, partitionFunc), flatten = FALSE) Reduce(combOp, partitionList, zeroValue) }) @@ -1322,7 +1322,7 @@ setMethod("aggregateRDD", #'\dontrun{ #' sc <- sparkR.init() #' rdd <- parallelize(sc, 1:10) -#' collect(pipeRDD(rdd, "more") +#' pipeRDD(rdd, "more") #' Output: c("1", "2", ..., "10") #'} #' @aliases pipeRDD,RDD,character-method @@ -1397,7 +1397,7 @@ setMethod("setName", #'\dontrun{ #' sc <- sparkR.init() #' rdd <- parallelize(sc, list("a", "b", "c", "d", "e"), 3L) -#' collect(zipWithUniqueId(rdd)) +#' collectRDD(zipWithUniqueId(rdd)) #' # list(list("a", 0), list("b", 3), list("c", 1), list("d", 4), list("e", 2)) #'} # nolint end @@ -1440,7 +1440,7 @@ setMethod("zipWithUniqueId", #'\dontrun{ #' sc <- sparkR.init() #' rdd <- parallelize(sc, list("a", "b", "c", "d", "e"), 3L) -#' collect(zipWithIndex(rdd)) +#' collectRDD(zipWithIndex(rdd)) #' # list(list("a", 0), list("b", 1), list("c", 2), list("d", 3), list("e", 4)) #'} # nolint end @@ -1452,7 +1452,7 @@ setMethod("zipWithIndex", function(x) { n <- getNumPartitions(x) if (n > 1) { - nums <- collect(lapplyPartition(x, + nums <- collectRDD(lapplyPartition(x, function(part) { list(length(part)) })) @@ -1488,7 +1488,7 @@ setMethod("zipWithIndex", #'\dontrun{ #' sc <- sparkR.init() #' rdd <- parallelize(sc, as.list(1:4), 2L) -#' collect(glom(rdd)) +#' collectRDD(glom(rdd)) #' # list(list(1, 2), list(3, 4)) #'} # nolint end @@ -1556,7 +1556,7 @@ setMethod("unionRDD", #' sc <- sparkR.init() #' rdd1 <- parallelize(sc, 0:4) #' rdd2 <- parallelize(sc, 1000:1004) -#' collect(zipRDD(rdd1, rdd2)) +#' collectRDD(zipRDD(rdd1, rdd2)) #' # list(list(0, 1000), list(1, 1001), list(2, 1002), list(3, 1003), list(4, 1004)) #'} # nolint end @@ -1628,7 +1628,7 @@ setMethod("cartesian", #' sc <- sparkR.init() #' rdd1 <- parallelize(sc, list(1, 1, 2, 2, 3, 4)) #' rdd2 <- parallelize(sc, list(2, 4)) -#' collect(subtract(rdd1, rdd2)) +#' collectRDD(subtract(rdd1, rdd2)) #' # list(1, 1, 3) #'} # nolint end @@ -1662,7 +1662,7 @@ setMethod("subtract", #' sc <- sparkR.init() #' rdd1 <- parallelize(sc, list(1, 10, 2, 3, 4, 5)) #' rdd2 <- parallelize(sc, list(1, 6, 2, 3, 7, 8)) -#' collect(sortBy(intersection(rdd1, rdd2), function(x) { x })) +#' collectRDD(sortBy(intersection(rdd1, rdd2), function(x) { x })) #' # list(1, 2, 3) #'} # nolint end @@ -1699,7 +1699,7 @@ setMethod("intersection", #' rdd1 <- parallelize(sc, 1:2, 2L) # 1, 2 #' rdd2 <- parallelize(sc, 1:4, 2L) # 1:2, 3:4 #' rdd3 <- parallelize(sc, 1:6, 2L) # 1:3, 4:6 -#' collect(zipPartitions(rdd1, rdd2, rdd3, +#' collectRDD(zipPartitions(rdd1, rdd2, rdd3, #' func = function(x, y, z) { list(list(x, y, z))} )) #' # list(list(1, c(1,2), c(1,2,3)), list(2, c(3,4), c(4,5,6))) #'} http://git-wip-us.apache.org/repos/asf/spark/blob/c34b546d/R/pkg/R/SQLContext.R ---------------------------------------------------------------------- diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R index a14bcd9..0c06bba 100644 --- a/R/pkg/R/SQLContext.R +++ b/R/pkg/R/SQLContext.R @@ -218,7 +218,7 @@ createDataFrame.default <- function(data, schema = NULL, samplingRatio = 1.0) { } if (is.null(schema) || (!inherits(schema, "structType") && is.null(names(schema)))) { - row <- first(rdd) + row <- firstRDD(rdd) names <- if (is.null(schema)) { names(row) } else { http://git-wip-us.apache.org/repos/asf/spark/blob/c34b546d/R/pkg/R/context.R ---------------------------------------------------------------------- diff --git a/R/pkg/R/context.R b/R/pkg/R/context.R index 2538bb2..13ade49 100644 --- a/R/pkg/R/context.R +++ b/R/pkg/R/context.R @@ -267,7 +267,7 @@ spark.lapply <- function(list, func) { sc <- getSparkContext() rdd <- parallelize(sc, list, length(list)) results <- map(rdd, func) - local <- collect(results) + local <- collectRDD(results) local } http://git-wip-us.apache.org/repos/asf/spark/blob/c34b546d/R/pkg/R/generics.R ---------------------------------------------------------------------- diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 10a0912..52ab730 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -23,9 +23,7 @@ setGeneric("aggregateRDD", function(x, zeroValue, seqOp, combOp) { standardGeneric("aggregateRDD") }) -# @rdname cache-methods -# @export -setGeneric("cache", function(x) { standardGeneric("cache") }) +setGeneric("cacheRDD", function(x) { standardGeneric("cacheRDD") }) # @rdname coalesce # @seealso repartition @@ -36,9 +34,7 @@ setGeneric("coalesce", function(x, numPartitions, ...) { standardGeneric("coales # @export setGeneric("checkpoint", function(x) { standardGeneric("checkpoint") }) -# @rdname collect-methods -# @export -setGeneric("collect", function(x, ...) { standardGeneric("collect") }) +setGeneric("collectRDD", function(x, ...) { standardGeneric("collectRDD") }) # @rdname collect-methods # @export @@ -51,9 +47,9 @@ setGeneric("collectPartition", standardGeneric("collectPartition") }) -# @rdname nrow -# @export -setGeneric("count", function(x) { standardGeneric("count") }) +setGeneric("countRDD", function(x) { standardGeneric("countRDD") }) + +setGeneric("lengthRDD", function(x) { standardGeneric("lengthRDD") }) # @rdname countByValue # @export @@ -74,17 +70,13 @@ setGeneric("approxQuantile", standardGeneric("approxQuantile") }) -# @rdname distinct -# @export -setGeneric("distinct", function(x, numPartitions = 1) { standardGeneric("distinct") }) +setGeneric("distinctRDD", function(x, numPartitions = 1) { standardGeneric("distinctRDD") }) # @rdname filterRDD # @export setGeneric("filterRDD", function(x, f) { standardGeneric("filterRDD") }) -# @rdname first -# @export -setGeneric("first", function(x, ...) { standardGeneric("first") }) +setGeneric("firstRDD", function(x, ...) { standardGeneric("firstRDD") }) # @rdname flatMap # @export @@ -110,6 +102,8 @@ setGeneric("glom", function(x) { standardGeneric("glom") }) # @export setGeneric("histogram", function(df, col, nbins=10) { standardGeneric("histogram") }) +setGeneric("joinRDD", function(x, y, ...) { standardGeneric("joinRDD") }) + # @rdname keyBy # @export setGeneric("keyBy", function(x, func) { standardGeneric("keyBy") }) @@ -152,9 +146,7 @@ setGeneric("getNumPartitions", function(x) { standardGeneric("getNumPartitions") # @export setGeneric("numPartitions", function(x) { standardGeneric("numPartitions") }) -# @rdname persist -# @export -setGeneric("persist", function(x, newLevel) { standardGeneric("persist") }) +setGeneric("persistRDD", function(x, newLevel) { standardGeneric("persistRDD") }) # @rdname pipeRDD # @export @@ -168,10 +160,7 @@ setGeneric("pivot", function(x, colname, values = list()) { standardGeneric("piv # @export setGeneric("reduce", function(x, func) { standardGeneric("reduce") }) -# @rdname repartition -# @seealso coalesce -# @export -setGeneric("repartition", function(x, ...) { standardGeneric("repartition") }) +setGeneric("repartitionRDD", function(x, ...) { standardGeneric("repartitionRDD") }) # @rdname sampleRDD # @export @@ -193,6 +182,8 @@ setGeneric("saveAsTextFile", function(x, path) { standardGeneric("saveAsTextFile # @export setGeneric("setName", function(x, name) { standardGeneric("setName") }) +setGeneric("showRDD", function(object, ...) { standardGeneric("showRDD") }) + # @rdname sortBy # @export setGeneric("sortBy", @@ -200,9 +191,7 @@ setGeneric("sortBy", standardGeneric("sortBy") }) -# @rdname take -# @export -setGeneric("take", function(x, num) { standardGeneric("take") }) +setGeneric("takeRDD", function(x, num) { standardGeneric("takeRDD") }) # @rdname takeOrdered # @export @@ -223,9 +212,7 @@ setGeneric("top", function(x, num) { standardGeneric("top") }) # @export setGeneric("unionRDD", function(x, y) { standardGeneric("unionRDD") }) -# @rdname unpersist-methods -# @export -setGeneric("unpersist", function(x, ...) { standardGeneric("unpersist") }) +setGeneric("unpersistRDD", function(x, ...) { standardGeneric("unpersistRDD") }) # @rdname zipRDD # @export @@ -343,9 +330,7 @@ setGeneric("join", function(x, y, ...) { standardGeneric("join") }) # @export setGeneric("leftOuterJoin", function(x, y, numPartitions) { standardGeneric("leftOuterJoin") }) -#' @rdname partitionBy -#' @export -setGeneric("partitionBy", function(x, ...) { standardGeneric("partitionBy") }) +setGeneric("partitionByRDD", function(x, ...) { standardGeneric("partitionByRDD") }) # @rdname reduceByKey # @seealso groupByKey @@ -414,6 +399,14 @@ setGeneric("as.data.frame", #' @export setGeneric("attach") +#' @rdname cache +#' @export +setGeneric("cache", function(x) { standardGeneric("cache") }) + +#' @rdname collect +#' @export +setGeneric("collect", function(x, ...) { standardGeneric("collect") }) + #' @rdname columns #' @export setGeneric("colnames", function(x, do.NULL = TRUE, prefix = "col") { standardGeneric("colnames") }) @@ -434,6 +427,10 @@ setGeneric("coltypes<-", function(x, value) { standardGeneric("coltypes<-") }) #' @export setGeneric("columns", function(x) {standardGeneric("columns") }) +#' @rdname nrow +#' @export +setGeneric("count", function(x) { standardGeneric("count") }) + #' @rdname cov #' @export setGeneric("cov", function(x, ...) {standardGeneric("cov") }) @@ -477,6 +474,10 @@ setGeneric("gapplyCollect", function(x, ...) { standardGeneric("gapplyCollect") #' @export setGeneric("describe", function(x, col, ...) { standardGeneric("describe") }) +#' @rdname distinct +#' @export +setGeneric("distinct", function(x) { standardGeneric("distinct") }) + #' @rdname drop #' @export setGeneric("drop", function(x, ...) { standardGeneric("drop") }) @@ -519,6 +520,10 @@ setGeneric("fillna", function(x, value, cols = NULL) { standardGeneric("fillna") #' @export setGeneric("filter", function(x, condition) { standardGeneric("filter") }) +#' @rdname first +#' @export +setGeneric("first", function(x, ...) { standardGeneric("first") }) + #' @rdname groupBy #' @export setGeneric("group_by", function(x, ...) { standardGeneric("group_by") }) @@ -555,17 +560,25 @@ setGeneric("mutate", function(.data, ...) {standardGeneric("mutate") }) #' @export setGeneric("orderBy", function(x, col, ...) { standardGeneric("orderBy") }) +#' @rdname persist +#' @export +setGeneric("persist", function(x, newLevel) { standardGeneric("persist") }) + #' @rdname printSchema #' @export setGeneric("printSchema", function(x) { standardGeneric("printSchema") }) +#' @rdname registerTempTable-deprecated +#' @export +setGeneric("registerTempTable", function(x, tableName) { standardGeneric("registerTempTable") }) + #' @rdname rename #' @export setGeneric("rename", function(x, ...) { standardGeneric("rename") }) -#' @rdname registerTempTable-deprecated +#' @rdname repartition #' @export -setGeneric("registerTempTable", function(x, tableName) { standardGeneric("registerTempTable") }) +setGeneric("repartition", function(x, ...) { standardGeneric("repartition") }) #' @rdname sample #' @export @@ -592,6 +605,10 @@ setGeneric("saveAsTable", function(df, tableName, source = NULL, mode = "error", #' @export setGeneric("str") +#' @rdname take +#' @export +setGeneric("take", function(x, num) { standardGeneric("take") }) + #' @rdname mutate #' @export setGeneric("transform", function(`_data`, ...) {standardGeneric("transform") }) @@ -674,6 +691,10 @@ setGeneric("union", function(x, y) { standardGeneric("union") }) #' @export setGeneric("unionAll", function(x, y) { standardGeneric("unionAll") }) +#' @rdname unpersist-methods +#' @export +setGeneric("unpersist", function(x, ...) { standardGeneric("unpersist") }) + #' @rdname filter #' @export setGeneric("where", function(x, condition) { standardGeneric("where") }) @@ -771,6 +792,10 @@ setGeneric("over", function(x, window) { standardGeneric("over") }) ###################### WindowSpec Methods ########################## +#' @rdname partitionBy +#' @export +setGeneric("partitionBy", function(x, ...) { standardGeneric("partitionBy") }) + #' @rdname rowsBetween #' @export setGeneric("rowsBetween", function(x, start, end) { standardGeneric("rowsBetween") }) http://git-wip-us.apache.org/repos/asf/spark/blob/c34b546d/R/pkg/R/pairRDD.R ---------------------------------------------------------------------- diff --git a/R/pkg/R/pairRDD.R b/R/pkg/R/pairRDD.R index d39775c..f0605db 100644 --- a/R/pkg/R/pairRDD.R +++ b/R/pkg/R/pairRDD.R @@ -49,7 +49,7 @@ setMethod("lookup", lapply(filtered, function(i) { i[[2]] }) } valsRDD <- lapplyPartition(x, partitionFunc) - collect(valsRDD) + collectRDD(valsRDD) }) #' Count the number of elements for each key, and return the result to the @@ -85,7 +85,7 @@ setMethod("countByKey", #'\dontrun{ #' sc <- sparkR.init() #' rdd <- parallelize(sc, list(list(1, 2), list(3, 4))) -#' collect(keys(rdd)) # list(1, 3) +#' collectRDD(keys(rdd)) # list(1, 3) #'} # nolint end #' @rdname keys @@ -108,7 +108,7 @@ setMethod("keys", #'\dontrun{ #' sc <- sparkR.init() #' rdd <- parallelize(sc, list(list(1, 2), list(3, 4))) -#' collect(values(rdd)) # list(2, 4) +#' collectRDD(values(rdd)) # list(2, 4) #'} # nolint end #' @rdname values @@ -135,7 +135,7 @@ setMethod("values", #' sc <- sparkR.init() #' rdd <- parallelize(sc, 1:10) #' makePairs <- lapply(rdd, function(x) { list(x, x) }) -#' collect(mapValues(makePairs, function(x) { x * 2) }) +#' collectRDD(mapValues(makePairs, function(x) { x * 2) }) #' Output: list(list(1,2), list(2,4), list(3,6), ...) #'} #' @rdname mapValues @@ -162,7 +162,7 @@ setMethod("mapValues", #'\dontrun{ #' sc <- sparkR.init() #' rdd <- parallelize(sc, list(list(1, c(1,2)), list(2, c(3,4)))) -#' collect(flatMapValues(rdd, function(x) { x })) +#' collectRDD(flatMapValues(rdd, function(x) { x })) #' Output: list(list(1,1), list(1,2), list(2,3), list(2,4)) #'} #' @rdname flatMapValues @@ -198,13 +198,13 @@ setMethod("flatMapValues", #' sc <- sparkR.init() #' pairs <- list(list(1, 2), list(1.1, 3), list(1, 4)) #' rdd <- parallelize(sc, pairs) -#' parts <- partitionBy(rdd, 2L) +#' parts <- partitionByRDD(rdd, 2L) #' collectPartition(parts, 0L) # First partition should contain list(1, 2) and list(1, 4) #'} #' @rdname partitionBy #' @aliases partitionBy,RDD,integer-method #' @noRd -setMethod("partitionBy", +setMethod("partitionByRDD", signature(x = "RDD"), function(x, numPartitions, partitionFunc = hashCode) { stopifnot(is.numeric(numPartitions)) @@ -261,7 +261,7 @@ setMethod("partitionBy", #' pairs <- list(list(1, 2), list(1.1, 3), list(1, 4)) #' rdd <- parallelize(sc, pairs) #' parts <- groupByKey(rdd, 2L) -#' grouped <- collect(parts) +#' grouped <- collectRDD(parts) #' grouped[[1]] # Should be a list(1, list(2, 4)) #'} #' @rdname groupByKey @@ -270,7 +270,7 @@ setMethod("partitionBy", setMethod("groupByKey", signature(x = "RDD", numPartitions = "numeric"), function(x, numPartitions) { - shuffled <- partitionBy(x, numPartitions) + shuffled <- partitionByRDD(x, numPartitions) groupVals <- function(part) { vals <- new.env() keys <- new.env() @@ -321,7 +321,7 @@ setMethod("groupByKey", #' pairs <- list(list(1, 2), list(1.1, 3), list(1, 4)) #' rdd <- parallelize(sc, pairs) #' parts <- reduceByKey(rdd, "+", 2L) -#' reduced <- collect(parts) +#' reduced <- collectRDD(parts) #' reduced[[1]] # Should be a list(1, 6) #'} #' @rdname reduceByKey @@ -342,7 +342,7 @@ setMethod("reduceByKey", convertEnvsToList(keys, vals) } locallyReduced <- lapplyPartition(x, reduceVals) - shuffled <- partitionBy(locallyReduced, numToInt(numPartitions)) + shuffled <- partitionByRDD(locallyReduced, numToInt(numPartitions)) lapplyPartition(shuffled, reduceVals) }) @@ -430,7 +430,7 @@ setMethod("reduceByKeyLocally", #' pairs <- list(list(1, 2), list(1.1, 3), list(1, 4)) #' rdd <- parallelize(sc, pairs) #' parts <- combineByKey(rdd, function(x) { x }, "+", "+", 2L) -#' combined <- collect(parts) +#' combined <- collectRDD(parts) #' combined[[1]] # Should be a list(1, 6) #'} # nolint end @@ -453,7 +453,7 @@ setMethod("combineByKey", convertEnvsToList(keys, combiners) } locallyCombined <- lapplyPartition(x, combineLocally) - shuffled <- partitionBy(locallyCombined, numToInt(numPartitions)) + shuffled <- partitionByRDD(locallyCombined, numToInt(numPartitions)) mergeAfterShuffle <- function(part) { combiners <- new.env() keys <- new.env() @@ -563,13 +563,13 @@ setMethod("foldByKey", #' sc <- sparkR.init() #' rdd1 <- parallelize(sc, list(list(1, 1), list(2, 4))) #' rdd2 <- parallelize(sc, list(list(1, 2), list(1, 3))) -#' join(rdd1, rdd2, 2L) # list(list(1, list(1, 2)), list(1, list(1, 3)) +#' joinRDD(rdd1, rdd2, 2L) # list(list(1, list(1, 2)), list(1, list(1, 3)) #'} # nolint end #' @rdname join-methods #' @aliases join,RDD,RDD-method #' @noRd -setMethod("join", +setMethod("joinRDD", signature(x = "RDD", y = "RDD"), function(x, y, numPartitions) { xTagged <- lapply(x, function(i) { list(i[[1]], list(1L, i[[2]])) }) @@ -772,7 +772,7 @@ setMethod("cogroup", #'\dontrun{ #' sc <- sparkR.init() #' rdd <- parallelize(sc, list(list(3, 1), list(2, 2), list(1, 3))) -#' collect(sortByKey(rdd)) # list (list(1, 3), list(2, 2), list(3, 1)) +#' collectRDD(sortByKey(rdd)) # list (list(1, 3), list(2, 2), list(3, 1)) #'} # nolint end #' @rdname sortByKey @@ -784,12 +784,12 @@ setMethod("sortByKey", rangeBounds <- list() if (numPartitions > 1) { - rddSize <- count(x) + rddSize <- countRDD(x) # constant from Spark's RangePartitioner maxSampleSize <- numPartitions * 20 fraction <- min(maxSampleSize / max(rddSize, 1), 1.0) - samples <- collect(keys(sampleRDD(x, FALSE, fraction, 1L))) + samples <- collectRDD(keys(sampleRDD(x, FALSE, fraction, 1L))) # Note: the built-in R sort() function only works on atomic vectors samples <- sort(unlist(samples, recursive = FALSE), decreasing = !ascending) @@ -822,7 +822,7 @@ setMethod("sortByKey", sortKeyValueList(part, decreasing = !ascending) } - newRDD <- partitionBy(x, numPartitions, rangePartitionFunc) + newRDD <- partitionByRDD(x, numPartitions, rangePartitionFunc) lapplyPartition(newRDD, partitionFunc) }) @@ -841,7 +841,7 @@ setMethod("sortByKey", #' rdd1 <- parallelize(sc, list(list("a", 1), list("b", 4), #' list("b", 5), list("a", 2))) #' rdd2 <- parallelize(sc, list(list("a", 3), list("c", 1))) -#' collect(subtractByKey(rdd1, rdd2)) +#' collectRDD(subtractByKey(rdd1, rdd2)) #' # list(list("b", 4), list("b", 5)) #'} # nolint end http://git-wip-us.apache.org/repos/asf/spark/blob/c34b546d/R/pkg/inst/tests/testthat/test_binaryFile.R ---------------------------------------------------------------------- diff --git a/R/pkg/inst/tests/testthat/test_binaryFile.R b/R/pkg/inst/tests/testthat/test_binaryFile.R index 56ac8eb..b5c279e 100644 --- a/R/pkg/inst/tests/testthat/test_binaryFile.R +++ b/R/pkg/inst/tests/testthat/test_binaryFile.R @@ -31,7 +31,7 @@ test_that("saveAsObjectFile()/objectFile() following textFile() works", { rdd <- textFile(sc, fileName1, 1) saveAsObjectFile(rdd, fileName2) rdd <- objectFile(sc, fileName2) - expect_equal(collect(rdd), as.list(mockFile)) + expect_equal(collectRDD(rdd), as.list(mockFile)) unlink(fileName1) unlink(fileName2, recursive = TRUE) @@ -44,7 +44,7 @@ test_that("saveAsObjectFile()/objectFile() works on a parallelized list", { rdd <- parallelize(sc, l, 1) saveAsObjectFile(rdd, fileName) rdd <- objectFile(sc, fileName) - expect_equal(collect(rdd), l) + expect_equal(collectRDD(rdd), l) unlink(fileName, recursive = TRUE) }) @@ -64,7 +64,7 @@ test_that("saveAsObjectFile()/objectFile() following RDD transformations works", saveAsObjectFile(counts, fileName2) counts <- objectFile(sc, fileName2) - output <- collect(counts) + output <- collectRDD(counts) expected <- list(list("awesome.", 1), list("Spark", 2), list("pretty.", 1), list("is", 2)) expect_equal(sortKeyValueList(output), sortKeyValueList(expected)) @@ -83,7 +83,7 @@ test_that("saveAsObjectFile()/objectFile() works with multiple paths", { saveAsObjectFile(rdd2, fileName2) rdd <- objectFile(sc, c(fileName1, fileName2)) - expect_equal(count(rdd), 2) + expect_equal(countRDD(rdd), 2) unlink(fileName1, recursive = TRUE) unlink(fileName2, recursive = TRUE) http://git-wip-us.apache.org/repos/asf/spark/blob/c34b546d/R/pkg/inst/tests/testthat/test_binary_function.R ---------------------------------------------------------------------- diff --git a/R/pkg/inst/tests/testthat/test_binary_function.R b/R/pkg/inst/tests/testthat/test_binary_function.R index ae7abe2..59cb2e6 100644 --- a/R/pkg/inst/tests/testthat/test_binary_function.R +++ b/R/pkg/inst/tests/testthat/test_binary_function.R @@ -29,7 +29,7 @@ rdd <- parallelize(sc, nums, 2L) mockFile <- c("Spark is pretty.", "Spark is awesome.") test_that("union on two RDDs", { - actual <- collect(unionRDD(rdd, rdd)) + actual <- collectRDD(unionRDD(rdd, rdd)) expect_equal(actual, as.list(rep(nums, 2))) fileName <- tempfile(pattern = "spark-test", fileext = ".tmp") @@ -37,13 +37,13 @@ test_that("union on two RDDs", { text.rdd <- textFile(sc, fileName) union.rdd <- unionRDD(rdd, text.rdd) - actual <- collect(union.rdd) + actual <- collectRDD(union.rdd) expect_equal(actual, c(as.list(nums), mockFile)) expect_equal(getSerializedMode(union.rdd), "byte") rdd <- map(text.rdd, function(x) {x}) union.rdd <- unionRDD(rdd, text.rdd) - actual <- collect(union.rdd) + actual <- collectRDD(union.rdd) expect_equal(actual, as.list(c(mockFile, mockFile))) expect_equal(getSerializedMode(union.rdd), "byte") @@ -54,14 +54,14 @@ test_that("cogroup on two RDDs", { rdd1 <- parallelize(sc, list(list(1, 1), list(2, 4))) rdd2 <- parallelize(sc, list(list(1, 2), list(1, 3))) cogroup.rdd <- cogroup(rdd1, rdd2, numPartitions = 2L) - actual <- collect(cogroup.rdd) + actual <- collectRDD(cogroup.rdd) expect_equal(actual, list(list(1, list(list(1), list(2, 3))), list(2, list(list(4), list())))) rdd1 <- parallelize(sc, list(list("a", 1), list("a", 4))) rdd2 <- parallelize(sc, list(list("b", 2), list("a", 3))) cogroup.rdd <- cogroup(rdd1, rdd2, numPartitions = 2L) - actual <- collect(cogroup.rdd) + actual <- collectRDD(cogroup.rdd) expected <- list(list("b", list(list(), list(2))), list("a", list(list(1, 4), list(3)))) expect_equal(sortKeyValueList(actual), @@ -72,7 +72,7 @@ test_that("zipPartitions() on RDDs", { rdd1 <- parallelize(sc, 1:2, 2L) # 1, 2 rdd2 <- parallelize(sc, 1:4, 2L) # 1:2, 3:4 rdd3 <- parallelize(sc, 1:6, 2L) # 1:3, 4:6 - actual <- collect(zipPartitions(rdd1, rdd2, rdd3, + actual <- collectRDD(zipPartitions(rdd1, rdd2, rdd3, func = function(x, y, z) { list(list(x, y, z))} )) expect_equal(actual, list(list(1, c(1, 2), c(1, 2, 3)), list(2, c(3, 4), c(4, 5, 6)))) @@ -82,19 +82,19 @@ test_that("zipPartitions() on RDDs", { writeLines(mockFile, fileName) rdd <- textFile(sc, fileName, 1) - actual <- collect(zipPartitions(rdd, rdd, + actual <- collectRDD(zipPartitions(rdd, rdd, func = function(x, y) { list(paste(x, y, sep = "\n")) })) expected <- list(paste(mockFile, mockFile, sep = "\n")) expect_equal(actual, expected) rdd1 <- parallelize(sc, 0:1, 1) - actual <- collect(zipPartitions(rdd1, rdd, + actual <- collectRDD(zipPartitions(rdd1, rdd, func = function(x, y) { list(x + nchar(y)) })) expected <- list(0:1 + nchar(mockFile)) expect_equal(actual, expected) rdd <- map(rdd, function(x) { x }) - actual <- collect(zipPartitions(rdd, rdd1, + actual <- collectRDD(zipPartitions(rdd, rdd1, func = function(x, y) { list(y + nchar(x)) })) expect_equal(actual, expected) http://git-wip-us.apache.org/repos/asf/spark/blob/c34b546d/R/pkg/inst/tests/testthat/test_broadcast.R ---------------------------------------------------------------------- diff --git a/R/pkg/inst/tests/testthat/test_broadcast.R b/R/pkg/inst/tests/testthat/test_broadcast.R index c7fefb5..65f204d 100644 --- a/R/pkg/inst/tests/testthat/test_broadcast.R +++ b/R/pkg/inst/tests/testthat/test_broadcast.R @@ -32,7 +32,7 @@ test_that("using broadcast variable", { useBroadcast <- function(x) { sum(SparkR:::value(randomMatBr) * x) } - actual <- collect(lapply(rrdd, useBroadcast)) + actual <- collectRDD(lapply(rrdd, useBroadcast)) expected <- list(sum(randomMat) * 1, sum(randomMat) * 2) expect_equal(actual, expected) }) @@ -43,7 +43,7 @@ test_that("without using broadcast variable", { useBroadcast <- function(x) { sum(randomMat * x) } - actual <- collect(lapply(rrdd, useBroadcast)) + actual <- collectRDD(lapply(rrdd, useBroadcast)) expected <- list(sum(randomMat) * 1, sum(randomMat) * 2) expect_equal(actual, expected) }) http://git-wip-us.apache.org/repos/asf/spark/blob/c34b546d/R/pkg/inst/tests/testthat/test_context.R ---------------------------------------------------------------------- diff --git a/R/pkg/inst/tests/testthat/test_context.R b/R/pkg/inst/tests/testthat/test_context.R index 8bd134a..1ab7f31 100644 --- a/R/pkg/inst/tests/testthat/test_context.R +++ b/R/pkg/inst/tests/testthat/test_context.R @@ -58,7 +58,7 @@ test_that("repeatedly starting and stopping SparkR", { for (i in 1:4) { sc <- suppressWarnings(sparkR.init()) rdd <- parallelize(sc, 1:20, 2L) - expect_equal(count(rdd), 20) + expect_equal(countRDD(rdd), 20) suppressWarnings(sparkR.stop()) } }) @@ -90,8 +90,8 @@ test_that("rdd GC across sparkR.stop", { rm(rdd2) gc() - count(rdd3) - count(rdd4) + countRDD(rdd3) + countRDD(rdd4) sparkR.session.stop() }) http://git-wip-us.apache.org/repos/asf/spark/blob/c34b546d/R/pkg/inst/tests/testthat/test_includePackage.R ---------------------------------------------------------------------- diff --git a/R/pkg/inst/tests/testthat/test_includePackage.R b/R/pkg/inst/tests/testthat/test_includePackage.R index ca2b900..563ea29 100644 --- a/R/pkg/inst/tests/testthat/test_includePackage.R +++ b/R/pkg/inst/tests/testthat/test_includePackage.R @@ -37,7 +37,7 @@ test_that("include inside function", { } data <- lapplyPartition(rdd, generateData) - actual <- collect(data) + actual <- collectRDD(data) } }) @@ -53,7 +53,7 @@ test_that("use include package", { includePackage(sc, plyr) data <- lapplyPartition(rdd, generateData) - actual <- collect(data) + actual <- collectRDD(data) } }) http://git-wip-us.apache.org/repos/asf/spark/blob/c34b546d/R/pkg/inst/tests/testthat/test_parallelize_collect.R ---------------------------------------------------------------------- diff --git a/R/pkg/inst/tests/testthat/test_parallelize_collect.R b/R/pkg/inst/tests/testthat/test_parallelize_collect.R index 959d7ab..55972e1 100644 --- a/R/pkg/inst/tests/testthat/test_parallelize_collect.R +++ b/R/pkg/inst/tests/testthat/test_parallelize_collect.R @@ -67,22 +67,22 @@ test_that("parallelize() on simple vectors and lists returns an RDD", { test_that("collect(), following a parallelize(), gives back the original collections", { numVectorRDD <- parallelize(jsc, numVector, 10) - expect_equal(collect(numVectorRDD), as.list(numVector)) + expect_equal(collectRDD(numVectorRDD), as.list(numVector)) numListRDD <- parallelize(jsc, numList, 1) numListRDD2 <- parallelize(jsc, numList, 4) - expect_equal(collect(numListRDD), as.list(numList)) - expect_equal(collect(numListRDD2), as.list(numList)) + expect_equal(collectRDD(numListRDD), as.list(numList)) + expect_equal(collectRDD(numListRDD2), as.list(numList)) strVectorRDD <- parallelize(jsc, strVector, 2) strVectorRDD2 <- parallelize(jsc, strVector, 3) - expect_equal(collect(strVectorRDD), as.list(strVector)) - expect_equal(collect(strVectorRDD2), as.list(strVector)) + expect_equal(collectRDD(strVectorRDD), as.list(strVector)) + expect_equal(collectRDD(strVectorRDD2), as.list(strVector)) strListRDD <- parallelize(jsc, strList, 4) strListRDD2 <- parallelize(jsc, strList, 1) - expect_equal(collect(strListRDD), as.list(strList)) - expect_equal(collect(strListRDD2), as.list(strList)) + expect_equal(collectRDD(strListRDD), as.list(strList)) + expect_equal(collectRDD(strListRDD2), as.list(strList)) }) test_that("regression: collect() following a parallelize() does not drop elements", { @@ -90,7 +90,7 @@ test_that("regression: collect() following a parallelize() does not drop element collLen <- 10 numPart <- 6 expected <- runif(collLen) - actual <- collect(parallelize(jsc, expected, numPart)) + actual <- collectRDD(parallelize(jsc, expected, numPart)) expect_equal(actual, as.list(expected)) }) @@ -99,14 +99,14 @@ test_that("parallelize() and collect() work for lists of pairs (pairwise data)", numPairsRDDD1 <- parallelize(jsc, numPairs, 1) numPairsRDDD2 <- parallelize(jsc, numPairs, 2) numPairsRDDD3 <- parallelize(jsc, numPairs, 3) - expect_equal(collect(numPairsRDDD1), numPairs) - expect_equal(collect(numPairsRDDD2), numPairs) - expect_equal(collect(numPairsRDDD3), numPairs) + expect_equal(collectRDD(numPairsRDDD1), numPairs) + expect_equal(collectRDD(numPairsRDDD2), numPairs) + expect_equal(collectRDD(numPairsRDDD3), numPairs) # can also leave out the parameter name, if the params are supplied in order strPairsRDDD1 <- parallelize(jsc, strPairs, 1) strPairsRDDD2 <- parallelize(jsc, strPairs, 2) - expect_equal(collect(strPairsRDDD1), strPairs) - expect_equal(collect(strPairsRDDD2), strPairs) + expect_equal(collectRDD(strPairsRDDD1), strPairs) + expect_equal(collectRDD(strPairsRDDD2), strPairs) }) sparkR.session.stop() http://git-wip-us.apache.org/repos/asf/spark/blob/c34b546d/R/pkg/inst/tests/testthat/test_rdd.R ---------------------------------------------------------------------- diff --git a/R/pkg/inst/tests/testthat/test_rdd.R b/R/pkg/inst/tests/testthat/test_rdd.R index 508a3a7..a3d66c2 100644 --- a/R/pkg/inst/tests/testthat/test_rdd.R +++ b/R/pkg/inst/tests/testthat/test_rdd.R @@ -34,14 +34,14 @@ test_that("get number of partitions in RDD", { }) test_that("first on RDD", { - expect_equal(first(rdd), 1) + expect_equal(firstRDD(rdd), 1) newrdd <- lapply(rdd, function(x) x + 1) - expect_equal(first(newrdd), 2) + expect_equal(firstRDD(newrdd), 2) }) test_that("count and length on RDD", { - expect_equal(count(rdd), 10) - expect_equal(length(rdd), 10) + expect_equal(countRDD(rdd), 10) + expect_equal(lengthRDD(rdd), 10) }) test_that("count by values and keys", { @@ -57,40 +57,40 @@ test_that("count by values and keys", { test_that("lapply on RDD", { multiples <- lapply(rdd, function(x) { 2 * x }) - actual <- collect(multiples) + actual <- collectRDD(multiples) expect_equal(actual, as.list(nums * 2)) }) test_that("lapplyPartition on RDD", { sums <- lapplyPartition(rdd, function(part) { sum(unlist(part)) }) - actual <- collect(sums) + actual <- collectRDD(sums) expect_equal(actual, list(15, 40)) }) test_that("mapPartitions on RDD", { sums <- mapPartitions(rdd, function(part) { sum(unlist(part)) }) - actual <- collect(sums) + actual <- collectRDD(sums) expect_equal(actual, list(15, 40)) }) test_that("flatMap() on RDDs", { flat <- flatMap(intRdd, function(x) { list(x, x) }) - actual <- collect(flat) + actual <- collectRDD(flat) expect_equal(actual, rep(intPairs, each = 2)) }) test_that("filterRDD on RDD", { filtered.rdd <- filterRDD(rdd, function(x) { x %% 2 == 0 }) - actual <- collect(filtered.rdd) + actual <- collectRDD(filtered.rdd) expect_equal(actual, list(2, 4, 6, 8, 10)) filtered.rdd <- Filter(function(x) { x[[2]] < 0 }, intRdd) - actual <- collect(filtered.rdd) + actual <- collectRDD(filtered.rdd) expect_equal(actual, list(list(1L, -1))) # Filter out all elements. filtered.rdd <- filterRDD(rdd, function(x) { x > 10 }) - actual <- collect(filtered.rdd) + actual <- collectRDD(filtered.rdd) expect_equal(actual, list()) }) @@ -110,7 +110,7 @@ test_that("several transformations on RDD (a benchmark on PipelinedRDD)", { part <- as.list(unlist(part) * partIndex + i) }) rdd2 <- lapply(rdd2, function(x) x + x) - actual <- collect(rdd2) + actual <- collectRDD(rdd2) expected <- list(24, 24, 24, 24, 24, 168, 170, 172, 174, 176) expect_equal(actual, expected) @@ -126,20 +126,20 @@ test_that("PipelinedRDD support actions: cache(), persist(), unpersist(), checkp part <- as.list(unlist(part) * partIndex) }) - cache(rdd2) + cacheRDD(rdd2) expect_true(rdd2@env$isCached) rdd2 <- lapply(rdd2, function(x) x) expect_false(rdd2@env$isCached) - unpersist(rdd2) + unpersistRDD(rdd2) expect_false(rdd2@env$isCached) - persist(rdd2, "MEMORY_AND_DISK") + persistRDD(rdd2, "MEMORY_AND_DISK") expect_true(rdd2@env$isCached) rdd2 <- lapply(rdd2, function(x) x) expect_false(rdd2@env$isCached) - unpersist(rdd2) + unpersistRDD(rdd2) expect_false(rdd2@env$isCached) tempDir <- tempfile(pattern = "checkpoint") @@ -152,7 +152,7 @@ test_that("PipelinedRDD support actions: cache(), persist(), unpersist(), checkp expect_false(rdd2@env$isCheckpointed) # make sure the data is collectable - collect(rdd2) + collectRDD(rdd2) unlink(tempDir) }) @@ -169,21 +169,21 @@ test_that("reduce on RDD", { test_that("lapply with dependency", { fa <- 5 multiples <- lapply(rdd, function(x) { fa * x }) - actual <- collect(multiples) + actual <- collectRDD(multiples) expect_equal(actual, as.list(nums * 5)) }) test_that("lapplyPartitionsWithIndex on RDDs", { func <- function(partIndex, part) { list(partIndex, Reduce("+", part)) } - actual <- collect(lapplyPartitionsWithIndex(rdd, func), flatten = FALSE) + actual <- collectRDD(lapplyPartitionsWithIndex(rdd, func), flatten = FALSE) expect_equal(actual, list(list(0, 15), list(1, 40))) pairsRDD <- parallelize(sc, list(list(1, 2), list(3, 4), list(4, 8)), 1L) partitionByParity <- function(key) { if (key %% 2 == 1) 0 else 1 } mkTup <- function(partIndex, part) { list(partIndex, part) } - actual <- collect(lapplyPartitionsWithIndex( - partitionBy(pairsRDD, 2L, partitionByParity), + actual <- collectRDD(lapplyPartitionsWithIndex( + partitionByRDD(pairsRDD, 2L, partitionByParity), mkTup), FALSE) expect_equal(actual, list(list(0, list(list(1, 2), list(3, 4))), @@ -191,7 +191,7 @@ test_that("lapplyPartitionsWithIndex on RDDs", { }) test_that("sampleRDD() on RDDs", { - expect_equal(unlist(collect(sampleRDD(rdd, FALSE, 1.0, 2014L))), nums) + expect_equal(unlist(collectRDD(sampleRDD(rdd, FALSE, 1.0, 2014L))), nums) }) test_that("takeSample() on RDDs", { @@ -238,7 +238,7 @@ test_that("takeSample() on RDDs", { test_that("mapValues() on pairwise RDDs", { multiples <- mapValues(intRdd, function(x) { x * 2 }) - actual <- collect(multiples) + actual <- collectRDD(multiples) expected <- lapply(intPairs, function(x) { list(x[[1]], x[[2]] * 2) }) @@ -247,11 +247,11 @@ test_that("mapValues() on pairwise RDDs", { test_that("flatMapValues() on pairwise RDDs", { l <- parallelize(sc, list(list(1, c(1, 2)), list(2, c(3, 4)))) - actual <- collect(flatMapValues(l, function(x) { x })) + actual <- collectRDD(flatMapValues(l, function(x) { x })) expect_equal(actual, list(list(1, 1), list(1, 2), list(2, 3), list(2, 4))) # Generate x to x+1 for every value - actual <- collect(flatMapValues(intRdd, function(x) { x: (x + 1) })) + actual <- collectRDD(flatMapValues(intRdd, function(x) { x: (x + 1) })) expect_equal(actual, list(list(1L, -1), list(1L, 0), list(2L, 100), list(2L, 101), list(2L, 1), list(2L, 2), list(1L, 200), list(1L, 201))) @@ -273,8 +273,8 @@ test_that("reduceByKeyLocally() on PairwiseRDDs", { test_that("distinct() on RDDs", { nums.rep2 <- rep(1:10, 2) rdd.rep2 <- parallelize(sc, nums.rep2, 2L) - uniques <- distinct(rdd.rep2) - actual <- sort(unlist(collect(uniques))) + uniques <- distinctRDD(rdd.rep2) + actual <- sort(unlist(collectRDD(uniques))) expect_equal(actual, nums) }) @@ -296,7 +296,7 @@ test_that("sumRDD() on RDDs", { test_that("keyBy on RDDs", { func <- function(x) { x * x } keys <- keyBy(rdd, func) - actual <- collect(keys) + actual <- collectRDD(keys) expect_equal(actual, lapply(nums, function(x) { list(func(x), x) })) }) @@ -304,12 +304,12 @@ test_that("repartition/coalesce on RDDs", { rdd <- parallelize(sc, 1:20, 4L) # each partition contains 5 elements # repartition - r1 <- repartition(rdd, 2) + r1 <- repartitionRDD(rdd, 2) expect_equal(getNumPartitions(r1), 2L) count <- length(collectPartition(r1, 0L)) expect_true(count >= 8 && count <= 12) - r2 <- repartition(rdd, 6) + r2 <- repartitionRDD(rdd, 6) expect_equal(getNumPartitions(r2), 6L) count <- length(collectPartition(r2, 0L)) expect_true(count >= 0 && count <= 4) @@ -323,12 +323,12 @@ test_that("repartition/coalesce on RDDs", { test_that("sortBy() on RDDs", { sortedRdd <- sortBy(rdd, function(x) { x * x }, ascending = FALSE) - actual <- collect(sortedRdd) + actual <- collectRDD(sortedRdd) expect_equal(actual, as.list(sort(nums, decreasing = TRUE))) rdd2 <- parallelize(sc, sort(nums, decreasing = TRUE), 2L) sortedRdd2 <- sortBy(rdd2, function(x) { x * x }) - actual <- collect(sortedRdd2) + actual <- collectRDD(sortedRdd2) expect_equal(actual, as.list(nums)) }) @@ -380,13 +380,13 @@ test_that("aggregateRDD() on RDDs", { test_that("zipWithUniqueId() on RDDs", { rdd <- parallelize(sc, list("a", "b", "c", "d", "e"), 3L) - actual <- collect(zipWithUniqueId(rdd)) + actual <- collectRDD(zipWithUniqueId(rdd)) expected <- list(list("a", 0), list("b", 3), list("c", 1), list("d", 4), list("e", 2)) expect_equal(actual, expected) rdd <- parallelize(sc, list("a", "b", "c", "d", "e"), 1L) - actual <- collect(zipWithUniqueId(rdd)) + actual <- collectRDD(zipWithUniqueId(rdd)) expected <- list(list("a", 0), list("b", 1), list("c", 2), list("d", 3), list("e", 4)) expect_equal(actual, expected) @@ -394,13 +394,13 @@ test_that("zipWithUniqueId() on RDDs", { test_that("zipWithIndex() on RDDs", { rdd <- parallelize(sc, list("a", "b", "c", "d", "e"), 3L) - actual <- collect(zipWithIndex(rdd)) + actual <- collectRDD(zipWithIndex(rdd)) expected <- list(list("a", 0), list("b", 1), list("c", 2), list("d", 3), list("e", 4)) expect_equal(actual, expected) rdd <- parallelize(sc, list("a", "b", "c", "d", "e"), 1L) - actual <- collect(zipWithIndex(rdd)) + actual <- collectRDD(zipWithIndex(rdd)) expected <- list(list("a", 0), list("b", 1), list("c", 2), list("d", 3), list("e", 4)) expect_equal(actual, expected) @@ -408,35 +408,35 @@ test_that("zipWithIndex() on RDDs", { test_that("glom() on RDD", { rdd <- parallelize(sc, as.list(1:4), 2L) - actual <- collect(glom(rdd)) + actual <- collectRDD(glom(rdd)) expect_equal(actual, list(list(1, 2), list(3, 4))) }) test_that("keys() on RDDs", { keys <- keys(intRdd) - actual <- collect(keys) + actual <- collectRDD(keys) expect_equal(actual, lapply(intPairs, function(x) { x[[1]] })) }) test_that("values() on RDDs", { values <- values(intRdd) - actual <- collect(values) + actual <- collectRDD(values) expect_equal(actual, lapply(intPairs, function(x) { x[[2]] })) }) test_that("pipeRDD() on RDDs", { - actual <- collect(pipeRDD(rdd, "more")) + actual <- collectRDD(pipeRDD(rdd, "more")) expected <- as.list(as.character(1:10)) expect_equal(actual, expected) trailed.rdd <- parallelize(sc, c("1", "", "2\n", "3\n\r\n")) - actual <- collect(pipeRDD(trailed.rdd, "sort")) + actual <- collectRDD(pipeRDD(trailed.rdd, "sort")) expected <- list("", "1", "2", "3") expect_equal(actual, expected) rev.nums <- 9:0 rev.rdd <- parallelize(sc, rev.nums, 2L) - actual <- collect(pipeRDD(rev.rdd, "sort")) + actual <- collectRDD(pipeRDD(rev.rdd, "sort")) expected <- as.list(as.character(c(5:9, 0:4))) expect_equal(actual, expected) }) @@ -444,7 +444,7 @@ test_that("pipeRDD() on RDDs", { test_that("zipRDD() on RDDs", { rdd1 <- parallelize(sc, 0:4, 2) rdd2 <- parallelize(sc, 1000:1004, 2) - actual <- collect(zipRDD(rdd1, rdd2)) + actual <- collectRDD(zipRDD(rdd1, rdd2)) expect_equal(actual, list(list(0, 1000), list(1, 1001), list(2, 1002), list(3, 1003), list(4, 1004))) @@ -453,17 +453,17 @@ test_that("zipRDD() on RDDs", { writeLines(mockFile, fileName) rdd <- textFile(sc, fileName, 1) - actual <- collect(zipRDD(rdd, rdd)) + actual <- collectRDD(zipRDD(rdd, rdd)) expected <- lapply(mockFile, function(x) { list(x, x) }) expect_equal(actual, expected) rdd1 <- parallelize(sc, 0:1, 1) - actual <- collect(zipRDD(rdd1, rdd)) + actual <- collectRDD(zipRDD(rdd1, rdd)) expected <- lapply(0:1, function(x) { list(x, mockFile[x + 1]) }) expect_equal(actual, expected) rdd1 <- map(rdd, function(x) { x }) - actual <- collect(zipRDD(rdd, rdd1)) + actual <- collectRDD(zipRDD(rdd, rdd1)) expected <- lapply(mockFile, function(x) { list(x, x) }) expect_equal(actual, expected) @@ -472,7 +472,7 @@ test_that("zipRDD() on RDDs", { test_that("cartesian() on RDDs", { rdd <- parallelize(sc, 1:3) - actual <- collect(cartesian(rdd, rdd)) + actual <- collectRDD(cartesian(rdd, rdd)) expect_equal(sortKeyValueList(actual), list( list(1, 1), list(1, 2), list(1, 3), @@ -481,7 +481,7 @@ test_that("cartesian() on RDDs", { # test case where one RDD is empty emptyRdd <- parallelize(sc, list()) - actual <- collect(cartesian(rdd, emptyRdd)) + actual <- collectRDD(cartesian(rdd, emptyRdd)) expect_equal(actual, list()) mockFile <- c("Spark is pretty.", "Spark is awesome.") @@ -489,7 +489,7 @@ test_that("cartesian() on RDDs", { writeLines(mockFile, fileName) rdd <- textFile(sc, fileName) - actual <- collect(cartesian(rdd, rdd)) + actual <- collectRDD(cartesian(rdd, rdd)) expected <- list( list("Spark is awesome.", "Spark is pretty."), list("Spark is awesome.", "Spark is awesome."), @@ -498,7 +498,7 @@ test_that("cartesian() on RDDs", { expect_equal(sortKeyValueList(actual), expected) rdd1 <- parallelize(sc, 0:1) - actual <- collect(cartesian(rdd1, rdd)) + actual <- collectRDD(cartesian(rdd1, rdd)) expect_equal(sortKeyValueList(actual), list( list(0, "Spark is pretty."), @@ -507,7 +507,7 @@ test_that("cartesian() on RDDs", { list(1, "Spark is awesome."))) rdd1 <- map(rdd, function(x) { x }) - actual <- collect(cartesian(rdd, rdd1)) + actual <- collectRDD(cartesian(rdd, rdd1)) expect_equal(sortKeyValueList(actual), expected) unlink(fileName) @@ -518,24 +518,24 @@ test_that("subtract() on RDDs", { rdd1 <- parallelize(sc, l) # subtract by itself - actual <- collect(subtract(rdd1, rdd1)) + actual <- collectRDD(subtract(rdd1, rdd1)) expect_equal(actual, list()) # subtract by an empty RDD rdd2 <- parallelize(sc, list()) - actual <- collect(subtract(rdd1, rdd2)) + actual <- collectRDD(subtract(rdd1, rdd2)) expect_equal(as.list(sort(as.vector(actual, mode = "integer"))), l) rdd2 <- parallelize(sc, list(2, 4)) - actual <- collect(subtract(rdd1, rdd2)) + actual <- collectRDD(subtract(rdd1, rdd2)) expect_equal(as.list(sort(as.vector(actual, mode = "integer"))), list(1, 1, 3)) l <- list("a", "a", "b", "b", "c", "d") rdd1 <- parallelize(sc, l) rdd2 <- parallelize(sc, list("b", "d")) - actual <- collect(subtract(rdd1, rdd2)) + actual <- collectRDD(subtract(rdd1, rdd2)) expect_equal(as.list(sort(as.vector(actual, mode = "character"))), list("a", "a", "c")) }) @@ -546,17 +546,17 @@ test_that("subtractByKey() on pairwise RDDs", { rdd1 <- parallelize(sc, l) # subtractByKey by itself - actual <- collect(subtractByKey(rdd1, rdd1)) + actual <- collectRDD(subtractByKey(rdd1, rdd1)) expect_equal(actual, list()) # subtractByKey by an empty RDD rdd2 <- parallelize(sc, list()) - actual <- collect(subtractByKey(rdd1, rdd2)) + actual <- collectRDD(subtractByKey(rdd1, rdd2)) expect_equal(sortKeyValueList(actual), sortKeyValueList(l)) rdd2 <- parallelize(sc, list(list("a", 3), list("c", 1))) - actual <- collect(subtractByKey(rdd1, rdd2)) + actual <- collectRDD(subtractByKey(rdd1, rdd2)) expect_equal(actual, list(list("b", 4), list("b", 5))) @@ -564,76 +564,76 @@ test_that("subtractByKey() on pairwise RDDs", { list(2, 5), list(1, 2)) rdd1 <- parallelize(sc, l) rdd2 <- parallelize(sc, list(list(1, 3), list(3, 1))) - actual <- collect(subtractByKey(rdd1, rdd2)) + actual <- collectRDD(subtractByKey(rdd1, rdd2)) expect_equal(actual, list(list(2, 4), list(2, 5))) }) test_that("intersection() on RDDs", { # intersection with self - actual <- collect(intersection(rdd, rdd)) + actual <- collectRDD(intersection(rdd, rdd)) expect_equal(sort(as.integer(actual)), nums) # intersection with an empty RDD emptyRdd <- parallelize(sc, list()) - actual <- collect(intersection(rdd, emptyRdd)) + actual <- collectRDD(intersection(rdd, emptyRdd)) expect_equal(actual, list()) rdd1 <- parallelize(sc, list(1, 10, 2, 3, 4, 5)) rdd2 <- parallelize(sc, list(1, 6, 2, 3, 7, 8)) - actual <- collect(intersection(rdd1, rdd2)) + actual <- collectRDD(intersection(rdd1, rdd2)) expect_equal(sort(as.integer(actual)), 1:3) }) test_that("join() on pairwise RDDs", { rdd1 <- parallelize(sc, list(list(1, 1), list(2, 4))) rdd2 <- parallelize(sc, list(list(1, 2), list(1, 3))) - actual <- collect(join(rdd1, rdd2, 2L)) + actual <- collectRDD(joinRDD(rdd1, rdd2, 2L)) expect_equal(sortKeyValueList(actual), sortKeyValueList(list(list(1, list(1, 2)), list(1, list(1, 3))))) rdd1 <- parallelize(sc, list(list("a", 1), list("b", 4))) rdd2 <- parallelize(sc, list(list("a", 2), list("a", 3))) - actual <- collect(join(rdd1, rdd2, 2L)) + actual <- collectRDD(joinRDD(rdd1, rdd2, 2L)) expect_equal(sortKeyValueList(actual), sortKeyValueList(list(list("a", list(1, 2)), list("a", list(1, 3))))) rdd1 <- parallelize(sc, list(list(1, 1), list(2, 2))) rdd2 <- parallelize(sc, list(list(3, 3), list(4, 4))) - actual <- collect(join(rdd1, rdd2, 2L)) + actual <- collectRDD(joinRDD(rdd1, rdd2, 2L)) expect_equal(actual, list()) rdd1 <- parallelize(sc, list(list("a", 1), list("b", 2))) rdd2 <- parallelize(sc, list(list("c", 3), list("d", 4))) - actual <- collect(join(rdd1, rdd2, 2L)) + actual <- collectRDD(joinRDD(rdd1, rdd2, 2L)) expect_equal(actual, list()) }) test_that("leftOuterJoin() on pairwise RDDs", { rdd1 <- parallelize(sc, list(list(1, 1), list(2, 4))) rdd2 <- parallelize(sc, list(list(1, 2), list(1, 3))) - actual <- collect(leftOuterJoin(rdd1, rdd2, 2L)) + actual <- collectRDD(leftOuterJoin(rdd1, rdd2, 2L)) expected <- list(list(1, list(1, 2)), list(1, list(1, 3)), list(2, list(4, NULL))) expect_equal(sortKeyValueList(actual), sortKeyValueList(expected)) rdd1 <- parallelize(sc, list(list("a", 1), list("b", 4))) rdd2 <- parallelize(sc, list(list("a", 2), list("a", 3))) - actual <- collect(leftOuterJoin(rdd1, rdd2, 2L)) + actual <- collectRDD(leftOuterJoin(rdd1, rdd2, 2L)) expected <- list(list("b", list(4, NULL)), list("a", list(1, 2)), list("a", list(1, 3))) expect_equal(sortKeyValueList(actual), sortKeyValueList(expected)) rdd1 <- parallelize(sc, list(list(1, 1), list(2, 2))) rdd2 <- parallelize(sc, list(list(3, 3), list(4, 4))) - actual <- collect(leftOuterJoin(rdd1, rdd2, 2L)) + actual <- collectRDD(leftOuterJoin(rdd1, rdd2, 2L)) expected <- list(list(1, list(1, NULL)), list(2, list(2, NULL))) expect_equal(sortKeyValueList(actual), sortKeyValueList(expected)) rdd1 <- parallelize(sc, list(list("a", 1), list("b", 2))) rdd2 <- parallelize(sc, list(list("c", 3), list("d", 4))) - actual <- collect(leftOuterJoin(rdd1, rdd2, 2L)) + actual <- collectRDD(leftOuterJoin(rdd1, rdd2, 2L)) expected <- list(list("b", list(2, NULL)), list("a", list(1, NULL))) expect_equal(sortKeyValueList(actual), sortKeyValueList(expected)) @@ -642,26 +642,26 @@ test_that("leftOuterJoin() on pairwise RDDs", { test_that("rightOuterJoin() on pairwise RDDs", { rdd1 <- parallelize(sc, list(list(1, 2), list(1, 3))) rdd2 <- parallelize(sc, list(list(1, 1), list(2, 4))) - actual <- collect(rightOuterJoin(rdd1, rdd2, 2L)) + actual <- collectRDD(rightOuterJoin(rdd1, rdd2, 2L)) expected <- list(list(1, list(2, 1)), list(1, list(3, 1)), list(2, list(NULL, 4))) expect_equal(sortKeyValueList(actual), sortKeyValueList(expected)) rdd1 <- parallelize(sc, list(list("a", 2), list("a", 3))) rdd2 <- parallelize(sc, list(list("a", 1), list("b", 4))) - actual <- collect(rightOuterJoin(rdd1, rdd2, 2L)) + actual <- collectRDD(rightOuterJoin(rdd1, rdd2, 2L)) expected <- list(list("b", list(NULL, 4)), list("a", list(2, 1)), list("a", list(3, 1))) expect_equal(sortKeyValueList(actual), sortKeyValueList(expected)) rdd1 <- parallelize(sc, list(list(1, 1), list(2, 2))) rdd2 <- parallelize(sc, list(list(3, 3), list(4, 4))) - actual <- collect(rightOuterJoin(rdd1, rdd2, 2L)) + actual <- collectRDD(rightOuterJoin(rdd1, rdd2, 2L)) expect_equal(sortKeyValueList(actual), sortKeyValueList(list(list(3, list(NULL, 3)), list(4, list(NULL, 4))))) rdd1 <- parallelize(sc, list(list("a", 1), list("b", 2))) rdd2 <- parallelize(sc, list(list("c", 3), list("d", 4))) - actual <- collect(rightOuterJoin(rdd1, rdd2, 2L)) + actual <- collectRDD(rightOuterJoin(rdd1, rdd2, 2L)) expect_equal(sortKeyValueList(actual), sortKeyValueList(list(list("d", list(NULL, 4)), list("c", list(NULL, 3))))) }) @@ -669,14 +669,14 @@ test_that("rightOuterJoin() on pairwise RDDs", { test_that("fullOuterJoin() on pairwise RDDs", { rdd1 <- parallelize(sc, list(list(1, 2), list(1, 3), list(3, 3))) rdd2 <- parallelize(sc, list(list(1, 1), list(2, 4))) - actual <- collect(fullOuterJoin(rdd1, rdd2, 2L)) + actual <- collectRDD(fullOuterJoin(rdd1, rdd2, 2L)) expected <- list(list(1, list(2, 1)), list(1, list(3, 1)), list(2, list(NULL, 4)), list(3, list(3, NULL))) expect_equal(sortKeyValueList(actual), sortKeyValueList(expected)) rdd1 <- parallelize(sc, list(list("a", 2), list("a", 3), list("c", 1))) rdd2 <- parallelize(sc, list(list("a", 1), list("b", 4))) - actual <- collect(fullOuterJoin(rdd1, rdd2, 2L)) + actual <- collectRDD(fullOuterJoin(rdd1, rdd2, 2L)) expected <- list(list("b", list(NULL, 4)), list("a", list(2, 1)), list("a", list(3, 1)), list("c", list(1, NULL))) expect_equal(sortKeyValueList(actual), @@ -684,14 +684,14 @@ test_that("fullOuterJoin() on pairwise RDDs", { rdd1 <- parallelize(sc, list(list(1, 1), list(2, 2))) rdd2 <- parallelize(sc, list(list(3, 3), list(4, 4))) - actual <- collect(fullOuterJoin(rdd1, rdd2, 2L)) + actual <- collectRDD(fullOuterJoin(rdd1, rdd2, 2L)) expect_equal(sortKeyValueList(actual), sortKeyValueList(list(list(1, list(1, NULL)), list(2, list(2, NULL)), list(3, list(NULL, 3)), list(4, list(NULL, 4))))) rdd1 <- parallelize(sc, list(list("a", 1), list("b", 2))) rdd2 <- parallelize(sc, list(list("c", 3), list("d", 4))) - actual <- collect(fullOuterJoin(rdd1, rdd2, 2L)) + actual <- collectRDD(fullOuterJoin(rdd1, rdd2, 2L)) expect_equal(sortKeyValueList(actual), sortKeyValueList(list(list("a", list(1, NULL)), list("b", list(2, NULL)), list("d", list(NULL, 4)), list("c", list(NULL, 3))))) @@ -700,21 +700,21 @@ test_that("fullOuterJoin() on pairwise RDDs", { test_that("sortByKey() on pairwise RDDs", { numPairsRdd <- map(rdd, function(x) { list (x, x) }) sortedRdd <- sortByKey(numPairsRdd, ascending = FALSE) - actual <- collect(sortedRdd) + actual <- collectRDD(sortedRdd) numPairs <- lapply(nums, function(x) { list (x, x) }) expect_equal(actual, sortKeyValueList(numPairs, decreasing = TRUE)) rdd2 <- parallelize(sc, sort(nums, decreasing = TRUE), 2L) numPairsRdd2 <- map(rdd2, function(x) { list (x, x) }) sortedRdd2 <- sortByKey(numPairsRdd2) - actual <- collect(sortedRdd2) + actual <- collectRDD(sortedRdd2) expect_equal(actual, numPairs) # sort by string keys l <- list(list("a", 1), list("b", 2), list("1", 3), list("d", 4), list("2", 5)) rdd3 <- parallelize(sc, l, 2L) sortedRdd3 <- sortByKey(rdd3) - actual <- collect(sortedRdd3) + actual <- collectRDD(sortedRdd3) expect_equal(actual, list(list("1", 3), list("2", 5), list("a", 1), list("b", 2), list("d", 4))) # test on the boundary cases @@ -722,27 +722,27 @@ test_that("sortByKey() on pairwise RDDs", { # boundary case 1: the RDD to be sorted has only 1 partition rdd4 <- parallelize(sc, l, 1L) sortedRdd4 <- sortByKey(rdd4) - actual <- collect(sortedRdd4) + actual <- collectRDD(sortedRdd4) expect_equal(actual, list(list("1", 3), list("2", 5), list("a", 1), list("b", 2), list("d", 4))) # boundary case 2: the sorted RDD has only 1 partition rdd5 <- parallelize(sc, l, 2L) sortedRdd5 <- sortByKey(rdd5, numPartitions = 1L) - actual <- collect(sortedRdd5) + actual <- collectRDD(sortedRdd5) expect_equal(actual, list(list("1", 3), list("2", 5), list("a", 1), list("b", 2), list("d", 4))) # boundary case 3: the RDD to be sorted has only 1 element l2 <- list(list("a", 1)) rdd6 <- parallelize(sc, l2, 2L) sortedRdd6 <- sortByKey(rdd6) - actual <- collect(sortedRdd6) + actual <- collectRDD(sortedRdd6) expect_equal(actual, l2) # boundary case 4: the RDD to be sorted has 0 element l3 <- list() rdd7 <- parallelize(sc, l3, 2L) sortedRdd7 <- sortByKey(rdd7) - actual <- collect(sortedRdd7) + actual <- collectRDD(sortedRdd7) expect_equal(actual, l3) }) @@ -766,7 +766,7 @@ test_that("collectAsMap() on a pairwise RDD", { test_that("show()", { rdd <- parallelize(sc, list(1:10)) - expect_output(show(rdd), "ParallelCollectionRDD\\[\\d+\\] at parallelize at RRDD\\.scala:\\d+") + expect_output(showRDD(rdd), "ParallelCollectionRDD\\[\\d+\\] at parallelize at RRDD\\.scala:\\d+") }) test_that("sampleByKey() on pairwise RDDs", { http://git-wip-us.apache.org/repos/asf/spark/blob/c34b546d/R/pkg/inst/tests/testthat/test_shuffle.R ---------------------------------------------------------------------- diff --git a/R/pkg/inst/tests/testthat/test_shuffle.R b/R/pkg/inst/tests/testthat/test_shuffle.R index 2586056..d38efab 100644 --- a/R/pkg/inst/tests/testthat/test_shuffle.R +++ b/R/pkg/inst/tests/testthat/test_shuffle.R @@ -39,7 +39,7 @@ strListRDD <- parallelize(sc, strList, 4) test_that("groupByKey for integers", { grouped <- groupByKey(intRdd, 2L) - actual <- collect(grouped) + actual <- collectRDD(grouped) expected <- list(list(2L, list(100, 1)), list(1L, list(-1, 200))) expect_equal(sortKeyValueList(actual), sortKeyValueList(expected)) @@ -48,7 +48,7 @@ test_that("groupByKey for integers", { test_that("groupByKey for doubles", { grouped <- groupByKey(doubleRdd, 2L) - actual <- collect(grouped) + actual <- collectRDD(grouped) expected <- list(list(1.5, list(-1, 200)), list(2.5, list(100, 1))) expect_equal(sortKeyValueList(actual), sortKeyValueList(expected)) @@ -57,7 +57,7 @@ test_that("groupByKey for doubles", { test_that("reduceByKey for ints", { reduced <- reduceByKey(intRdd, "+", 2L) - actual <- collect(reduced) + actual <- collectRDD(reduced) expected <- list(list(2L, 101), list(1L, 199)) expect_equal(sortKeyValueList(actual), sortKeyValueList(expected)) @@ -65,7 +65,7 @@ test_that("reduceByKey for ints", { test_that("reduceByKey for doubles", { reduced <- reduceByKey(doubleRdd, "+", 2L) - actual <- collect(reduced) + actual <- collectRDD(reduced) expected <- list(list(1.5, 199), list(2.5, 101)) expect_equal(sortKeyValueList(actual), sortKeyValueList(expected)) @@ -74,7 +74,7 @@ test_that("reduceByKey for doubles", { test_that("combineByKey for ints", { reduced <- combineByKey(intRdd, function(x) { x }, "+", "+", 2L) - actual <- collect(reduced) + actual <- collectRDD(reduced) expected <- list(list(2L, 101), list(1L, 199)) expect_equal(sortKeyValueList(actual), sortKeyValueList(expected)) @@ -82,7 +82,7 @@ test_that("combineByKey for ints", { test_that("combineByKey for doubles", { reduced <- combineByKey(doubleRdd, function(x) { x }, "+", "+", 2L) - actual <- collect(reduced) + actual <- collectRDD(reduced) expected <- list(list(1.5, 199), list(2.5, 101)) expect_equal(sortKeyValueList(actual), sortKeyValueList(expected)) @@ -94,7 +94,7 @@ test_that("combineByKey for characters", { list("other", 3L), list("max", 4L)), 2L) reduced <- combineByKey(stringKeyRDD, function(x) { x }, "+", "+", 2L) - actual <- collect(reduced) + actual <- collectRDD(reduced) expected <- list(list("max", 5L), list("min", 2L), list("other", 3L)) expect_equal(sortKeyValueList(actual), sortKeyValueList(expected)) @@ -109,7 +109,7 @@ test_that("aggregateByKey", { combOp <- function(x, y) { list(x[[1]] + y[[1]], x[[2]] + y[[2]]) } aggregatedRDD <- aggregateByKey(rdd, zeroValue, seqOp, combOp, 2L) - actual <- collect(aggregatedRDD) + actual <- collectRDD(aggregatedRDD) expected <- list(list(1, list(3, 2)), list(2, list(7, 2))) expect_equal(sortKeyValueList(actual), sortKeyValueList(expected)) @@ -122,7 +122,7 @@ test_that("aggregateByKey", { combOp <- function(x, y) { list(x[[1]] + y[[1]], x[[2]] + y[[2]]) } aggregatedRDD <- aggregateByKey(rdd, zeroValue, seqOp, combOp, 2L) - actual <- collect(aggregatedRDD) + actual <- collectRDD(aggregatedRDD) expected <- list(list("a", list(3, 2)), list("b", list(7, 2))) expect_equal(sortKeyValueList(actual), sortKeyValueList(expected)) @@ -132,7 +132,7 @@ test_that("foldByKey", { # test foldByKey for int keys folded <- foldByKey(intRdd, 0, "+", 2L) - actual <- collect(folded) + actual <- collectRDD(folded) expected <- list(list(2L, 101), list(1L, 199)) expect_equal(sortKeyValueList(actual), sortKeyValueList(expected)) @@ -140,7 +140,7 @@ test_that("foldByKey", { # test foldByKey for double keys folded <- foldByKey(doubleRdd, 0, "+", 2L) - actual <- collect(folded) + actual <- collectRDD(folded) expected <- list(list(1.5, 199), list(2.5, 101)) expect_equal(sortKeyValueList(actual), sortKeyValueList(expected)) @@ -151,7 +151,7 @@ test_that("foldByKey", { stringKeyRDD <- parallelize(sc, stringKeyPairs) folded <- foldByKey(stringKeyRDD, 0, "+", 2L) - actual <- collect(folded) + actual <- collectRDD(folded) expected <- list(list("b", 101), list("a", 199)) expect_equal(sortKeyValueList(actual), sortKeyValueList(expected)) @@ -159,14 +159,14 @@ test_that("foldByKey", { # test foldByKey for empty pair RDD rdd <- parallelize(sc, list()) folded <- foldByKey(rdd, 0, "+", 2L) - actual <- collect(folded) + actual <- collectRDD(folded) expected <- list() expect_equal(actual, expected) # test foldByKey for RDD with only 1 pair rdd <- parallelize(sc, list(list(1, 1))) folded <- foldByKey(rdd, 0, "+", 2L) - actual <- collect(folded) + actual <- collectRDD(folded) expected <- list(list(1, 1)) expect_equal(actual, expected) }) @@ -175,7 +175,7 @@ test_that("partitionBy() partitions data correctly", { # Partition by magnitude partitionByMagnitude <- function(key) { if (key >= 3) 1 else 0 } - resultRDD <- partitionBy(numPairsRdd, 2L, partitionByMagnitude) + resultRDD <- partitionByRDD(numPairsRdd, 2L, partitionByMagnitude) expected_first <- list(list(1, 100), list(2, 200)) # key less than 3 expected_second <- list(list(4, -1), list(3, 1), list(3, 0)) # key greater than or equal 3 @@ -191,7 +191,7 @@ test_that("partitionBy works with dependencies", { partitionByParity <- function(key) { if (key %% 2 == kOne) 7 else 4 } # Partition by parity - resultRDD <- partitionBy(numPairsRdd, numPartitions = 2L, partitionByParity) + resultRDD <- partitionByRDD(numPairsRdd, numPartitions = 2L, partitionByParity) # keys even; 100 %% 2 == 0 expected_first <- list(list(2, 200), list(4, -1)) @@ -208,7 +208,7 @@ test_that("test partitionBy with string keys", { words <- flatMap(strListRDD, function(line) { strsplit(line, " ")[[1]] }) wordCount <- lapply(words, function(word) { list(word, 1L) }) - resultRDD <- partitionBy(wordCount, 2L) + resultRDD <- partitionByRDD(wordCount, 2L) expected_first <- list(list("Dexter", 1), list("Dexter", 1)) expected_second <- list(list("and", 1), list("and", 1)) http://git-wip-us.apache.org/repos/asf/spark/blob/c34b546d/R/pkg/inst/tests/testthat/test_sparkSQL.R ---------------------------------------------------------------------- diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 39ed4fe..3ccb8b6 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -490,7 +490,7 @@ test_that("read/write json files", { test_that("jsonRDD() on a RDD with json string", { sqlContext <- suppressWarnings(sparkRSQL.init(sc)) rdd <- parallelize(sc, mockLines) - expect_equal(count(rdd), 3) + expect_equal(countRDD(rdd), 3) df <- suppressWarnings(jsonRDD(sqlContext, rdd)) expect_is(df, "SparkDataFrame") expect_equal(count(df), 3) @@ -582,7 +582,7 @@ test_that("toRDD() returns an RRDD", { df <- read.json(jsonPath) testRDD <- toRDD(df) expect_is(testRDD, "RDD") - expect_equal(count(testRDD), 3) + expect_equal(countRDD(testRDD), 3) }) test_that("union on two RDDs created from DataFrames returns an RRDD", { @@ -592,7 +592,7 @@ test_that("union on two RDDs created from DataFrames returns an RRDD", { unioned <- unionRDD(RDD1, RDD2) expect_is(unioned, "RDD") expect_equal(getSerializedMode(unioned), "byte") - expect_equal(collect(unioned)[[2]]$name, "Andy") + expect_equal(collectRDD(unioned)[[2]]$name, "Andy") }) test_that("union on mixed serialization types correctly returns a byte RRDD", { @@ -614,14 +614,14 @@ test_that("union on mixed serialization types correctly returns a byte RRDD", { unionByte <- unionRDD(rdd, dfRDD) expect_is(unionByte, "RDD") expect_equal(getSerializedMode(unionByte), "byte") - expect_equal(collect(unionByte)[[1]], 1) - expect_equal(collect(unionByte)[[12]]$name, "Andy") + expect_equal(collectRDD(unionByte)[[1]], 1) + expect_equal(collectRDD(unionByte)[[12]]$name, "Andy") unionString <- unionRDD(textRDD, dfRDD) expect_is(unionString, "RDD") expect_equal(getSerializedMode(unionString), "byte") - expect_equal(collect(unionString)[[1]], "Michael") - expect_equal(collect(unionString)[[5]]$name, "Andy") + expect_equal(collectRDD(unionString)[[1]], "Michael") + expect_equal(collectRDD(unionString)[[5]]$name, "Andy") }) test_that("objectFile() works with row serialization", { @@ -633,7 +633,7 @@ test_that("objectFile() works with row serialization", { expect_is(objectIn, "RDD") expect_equal(getSerializedMode(objectIn), "byte") - expect_equal(collect(objectIn)[[2]]$age, 30) + expect_equal(collectRDD(objectIn)[[2]]$age, 30) }) test_that("lapply() on a DataFrame returns an RDD with the correct columns", { @@ -643,7 +643,7 @@ test_that("lapply() on a DataFrame returns an RDD with the correct columns", { row }) expect_is(testRDD, "RDD") - collected <- collect(testRDD) + collected <- collectRDD(testRDD) expect_equal(collected[[1]]$name, "Michael") expect_equal(collected[[2]]$newCol, 35) }) @@ -715,10 +715,10 @@ test_that("multiple pipeline transformations result in an RDD with the correct v row }) expect_is(second, "RDD") - expect_equal(count(second), 3) - expect_equal(collect(second)[[2]]$age, 35) - expect_true(collect(second)[[2]]$testCol) - expect_false(collect(second)[[3]]$testCol) + expect_equal(countRDD(second), 3) + expect_equal(collectRDD(second)[[2]]$age, 35) + expect_true(collectRDD(second)[[2]]$testCol) + expect_false(collectRDD(second)[[3]]$testCol) }) test_that("cache(), persist(), and unpersist() on a DataFrame", { @@ -1608,7 +1608,7 @@ test_that("toJSON() returns an RDD of the correct values", { testRDD <- toJSON(df) expect_is(testRDD, "RDD") expect_equal(getSerializedMode(testRDD), "string") - expect_equal(collect(testRDD)[[1]], mockLines[1]) + expect_equal(collectRDD(testRDD)[[1]], mockLines[1]) }) test_that("showDF()", { http://git-wip-us.apache.org/repos/asf/spark/blob/c34b546d/R/pkg/inst/tests/testthat/test_take.R ---------------------------------------------------------------------- diff --git a/R/pkg/inst/tests/testthat/test_take.R b/R/pkg/inst/tests/testthat/test_take.R index 07f00c9..aaa5328 100644 --- a/R/pkg/inst/tests/testthat/test_take.R +++ b/R/pkg/inst/tests/testthat/test_take.R @@ -36,34 +36,34 @@ sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", test_that("take() gives back the original elements in correct count and order", { numVectorRDD <- parallelize(sc, numVector, 10) # case: number of elements to take is less than the size of the first partition - expect_equal(take(numVectorRDD, 1), as.list(head(numVector, n = 1))) + expect_equal(takeRDD(numVectorRDD, 1), as.list(head(numVector, n = 1))) # case: number of elements to take is the same as the size of the first partition - expect_equal(take(numVectorRDD, 11), as.list(head(numVector, n = 11))) + expect_equal(takeRDD(numVectorRDD, 11), as.list(head(numVector, n = 11))) # case: number of elements to take is greater than all elements - expect_equal(take(numVectorRDD, length(numVector)), as.list(numVector)) - expect_equal(take(numVectorRDD, length(numVector) + 1), as.list(numVector)) + expect_equal(takeRDD(numVectorRDD, length(numVector)), as.list(numVector)) + expect_equal(takeRDD(numVectorRDD, length(numVector) + 1), as.list(numVector)) numListRDD <- parallelize(sc, numList, 1) numListRDD2 <- parallelize(sc, numList, 4) - expect_equal(take(numListRDD, 3), take(numListRDD2, 3)) - expect_equal(take(numListRDD, 5), take(numListRDD2, 5)) - expect_equal(take(numListRDD, 1), as.list(head(numList, n = 1))) - expect_equal(take(numListRDD2, 999), numList) + expect_equal(takeRDD(numListRDD, 3), takeRDD(numListRDD2, 3)) + expect_equal(takeRDD(numListRDD, 5), takeRDD(numListRDD2, 5)) + expect_equal(takeRDD(numListRDD, 1), as.list(head(numList, n = 1))) + expect_equal(takeRDD(numListRDD2, 999), numList) strVectorRDD <- parallelize(sc, strVector, 2) strVectorRDD2 <- parallelize(sc, strVector, 3) - expect_equal(take(strVectorRDD, 4), as.list(strVector)) - expect_equal(take(strVectorRDD2, 2), as.list(head(strVector, n = 2))) + expect_equal(takeRDD(strVectorRDD, 4), as.list(strVector)) + expect_equal(takeRDD(strVectorRDD2, 2), as.list(head(strVector, n = 2))) strListRDD <- parallelize(sc, strList, 4) strListRDD2 <- parallelize(sc, strList, 1) - expect_equal(take(strListRDD, 3), as.list(head(strList, n = 3))) - expect_equal(take(strListRDD2, 1), as.list(head(strList, n = 1))) + expect_equal(takeRDD(strListRDD, 3), as.list(head(strList, n = 3))) + expect_equal(takeRDD(strListRDD2, 1), as.list(head(strList, n = 1))) - expect_equal(length(take(strListRDD, 0)), 0) - expect_equal(length(take(strVectorRDD, 0)), 0) - expect_equal(length(take(numListRDD, 0)), 0) - expect_equal(length(take(numVectorRDD, 0)), 0) + expect_equal(length(takeRDD(strListRDD, 0)), 0) + expect_equal(length(takeRDD(strVectorRDD, 0)), 0) + expect_equal(length(takeRDD(numListRDD, 0)), 0) + expect_equal(length(takeRDD(numVectorRDD, 0)), 0) }) sparkR.session.stop() http://git-wip-us.apache.org/repos/asf/spark/blob/c34b546d/R/pkg/inst/tests/testthat/test_textFile.R ---------------------------------------------------------------------- diff --git a/R/pkg/inst/tests/testthat/test_textFile.R b/R/pkg/inst/tests/testthat/test_textFile.R index b7dcbe4..3b46606 100644 --- a/R/pkg/inst/tests/testthat/test_textFile.R +++ b/R/pkg/inst/tests/testthat/test_textFile.R @@ -29,8 +29,8 @@ test_that("textFile() on a local file returns an RDD", { rdd <- textFile(sc, fileName) expect_is(rdd, "RDD") - expect_true(count(rdd) > 0) - expect_equal(count(rdd), 2) + expect_true(countRDD(rdd) > 0) + expect_equal(countRDD(rdd), 2) unlink(fileName) }) @@ -40,7 +40,7 @@ test_that("textFile() followed by a collect() returns the same content", { writeLines(mockFile, fileName) rdd <- textFile(sc, fileName) - expect_equal(collect(rdd), as.list(mockFile)) + expect_equal(collectRDD(rdd), as.list(mockFile)) unlink(fileName) }) @@ -55,7 +55,7 @@ test_that("textFile() word count works as expected", { wordCount <- lapply(words, function(word) { list(word, 1L) }) counts <- reduceByKey(wordCount, "+", 2L) - output <- collect(counts) + output <- collectRDD(counts) expected <- list(list("pretty.", 1), list("is", 2), list("awesome.", 1), list("Spark", 2)) expect_equal(sortKeyValueList(output), sortKeyValueList(expected)) @@ -72,7 +72,7 @@ test_that("several transformations on RDD created by textFile()", { # PipelinedRDD initially created from RDD rdd <- lapply(rdd, function(x) paste(x, x)) } - collect(rdd) + collectRDD(rdd) unlink(fileName) }) @@ -85,7 +85,7 @@ test_that("textFile() followed by a saveAsTextFile() returns the same content", rdd <- textFile(sc, fileName1, 1L) saveAsTextFile(rdd, fileName2) rdd <- textFile(sc, fileName2) - expect_equal(collect(rdd), as.list(mockFile)) + expect_equal(collectRDD(rdd), as.list(mockFile)) unlink(fileName1) unlink(fileName2) @@ -97,7 +97,7 @@ test_that("saveAsTextFile() on a parallelized list works as expected", { rdd <- parallelize(sc, l, 1L) saveAsTextFile(rdd, fileName) rdd <- textFile(sc, fileName) - expect_equal(collect(rdd), lapply(l, function(x) {toString(x)})) + expect_equal(collectRDD(rdd), lapply(l, function(x) {toString(x)})) unlink(fileName) }) @@ -117,7 +117,7 @@ test_that("textFile() and saveAsTextFile() word count works as expected", { saveAsTextFile(counts, fileName2) rdd <- textFile(sc, fileName2) - output <- collect(rdd) + output <- collectRDD(rdd) expected <- list(list("awesome.", 1), list("Spark", 2), list("pretty.", 1), list("is", 2)) expectedStr <- lapply(expected, function(x) { toString(x) }) @@ -134,7 +134,7 @@ test_that("textFile() on multiple paths", { writeLines("Spark is awesome.", fileName2) rdd <- textFile(sc, c(fileName1, fileName2)) - expect_equal(count(rdd), 2) + expect_equal(countRDD(rdd), 2) unlink(fileName1) unlink(fileName2) @@ -147,16 +147,16 @@ test_that("Pipelined operations on RDDs created using textFile", { rdd <- textFile(sc, fileName) lengths <- lapply(rdd, function(x) { length(x) }) - expect_equal(collect(lengths), list(1, 1)) + expect_equal(collectRDD(lengths), list(1, 1)) lengthsPipelined <- lapply(lengths, function(x) { x + 10 }) - expect_equal(collect(lengthsPipelined), list(11, 11)) + expect_equal(collectRDD(lengthsPipelined), list(11, 11)) lengths30 <- lapply(lengthsPipelined, function(x) { x + 20 }) - expect_equal(collect(lengths30), list(31, 31)) + expect_equal(collectRDD(lengths30), list(31, 31)) lengths20 <- lapply(lengths, function(x) { x + 20 }) - expect_equal(collect(lengths20), list(21, 21)) + expect_equal(collectRDD(lengths20), list(21, 21)) unlink(fileName) }) http://git-wip-us.apache.org/repos/asf/spark/blob/c34b546d/R/pkg/inst/tests/testthat/test_utils.R ---------------------------------------------------------------------- diff --git a/R/pkg/inst/tests/testthat/test_utils.R b/R/pkg/inst/tests/testthat/test_utils.R index 58ff3de..83e94a1 100644 --- a/R/pkg/inst/tests/testthat/test_utils.R +++ b/R/pkg/inst/tests/testthat/test_utils.R @@ -24,7 +24,7 @@ sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", test_that("convertJListToRList() gives back (deserializes) the original JLists of strings and integers", { # It's hard to manually create a Java List using rJava, since it does not - # support generics well. Instead, we rely on collect() returning a + # support generics well. Instead, we rely on collectRDD() returning a # JList. nums <- as.list(1:10) rdd <- parallelize(sc, nums, 1L) @@ -48,7 +48,7 @@ test_that("serializeToBytes on RDD", { text.rdd <- textFile(sc, fileName) expect_equal(getSerializedMode(text.rdd), "string") ser.rdd <- serializeToBytes(text.rdd) - expect_equal(collect(ser.rdd), as.list(mockFile)) + expect_equal(collectRDD(ser.rdd), as.list(mockFile)) expect_equal(getSerializedMode(ser.rdd), "byte") unlink(fileName) @@ -128,7 +128,7 @@ test_that("cleanClosure on R functions", { env <- environment(newF) expect_equal(ls(env), "t") expect_equal(get("t", envir = env, inherits = FALSE), t) - actual <- collect(lapply(rdd, f)) + actual <- collectRDD(lapply(rdd, f)) expected <- as.list(c(rep(FALSE, 4), rep(TRUE, 6))) expect_equal(actual, expected) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org