Repository: spark
Updated Branches:
  refs/heads/branch-1.6 03d801587 -> 47461fea7


[SPARK-12158][SPARKR][SQL] Fix 'sample' functions that break R unit test cases

The existing `sample` functions are missing the `seed` parameter, even though 
the corresponding function interface in `generics` declares it. As a result, 
although a caller can pass `seed` to the function, the value is silently 
ignored.

This can cause SparkR unit tests to fail. For example, I hit it in another PR:
https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/47213/consoleFull

Author: gatorsmile <gatorsm...@gmail.com>

Closes #10160 from gatorsmile/sampleR.

(cherry picked from commit 1e3526c2d3de723225024fedd45753b556e18fc6)
Signed-off-by: Shivaram Venkataraman <shiva...@cs.berkeley.edu>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/47461fea
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/47461fea
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/47461fea

Branch: refs/heads/branch-1.6
Commit: 47461fea7c079819de6add308f823c7a8294f891
Parents: 03d8015
Author: gatorsmile <gatorsm...@gmail.com>
Authored: Fri Dec 11 20:55:16 2015 -0800
Committer: Shivaram Venkataraman <shiva...@cs.berkeley.edu>
Committed: Fri Dec 11 20:55:24 2015 -0800

----------------------------------------------------------------------
 R/pkg/R/DataFrame.R                       | 17 +++++++++++------
 R/pkg/inst/tests/testthat/test_sparkSQL.R |  4 ++++
 2 files changed, 15 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/47461fea/R/pkg/R/DataFrame.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 975b058..764597d 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -662,6 +662,7 @@ setMethod("unique",
 #' @param x A SparkSQL DataFrame
 #' @param withReplacement Sampling with replacement or not
 #' @param fraction The (rough) sample target fraction
+#' @param seed Randomness seed value
 #'
 #' @family DataFrame functions
 #' @rdname sample
@@ -677,13 +678,17 @@ setMethod("unique",
 #' collect(sample(df, TRUE, 0.5))
 #'}
 setMethod("sample",
-          # TODO : Figure out how to send integer as java.lang.Long to JVM so
-          # we can send seed as an argument through callJMethod
           signature(x = "DataFrame", withReplacement = "logical",
                     fraction = "numeric"),
-          function(x, withReplacement, fraction) {
+          function(x, withReplacement, fraction, seed) {
             if (fraction < 0.0) stop(cat("Negative fraction value:", fraction))
-            sdf <- callJMethod(x@sdf, "sample", withReplacement, fraction)
+            if (!missing(seed)) {
+              # TODO : Figure out how to send integer as java.lang.Long to JVM so
+              # we can send seed as an argument through callJMethod
+              sdf <- callJMethod(x@sdf, "sample", withReplacement, fraction, as.integer(seed))
+            } else {
+              sdf <- callJMethod(x@sdf, "sample", withReplacement, fraction)
+            }
             dataFrame(sdf)
           })
 
@@ -692,8 +697,8 @@ setMethod("sample",
 setMethod("sample_frac",
           signature(x = "DataFrame", withReplacement = "logical",
                     fraction = "numeric"),
-          function(x, withReplacement, fraction) {
-            sample(x, withReplacement, fraction)
+          function(x, withReplacement, fraction, seed) {
+            sample(x, withReplacement, fraction, seed)
           })
 
 #' nrow

http://git-wip-us.apache.org/repos/asf/spark/blob/47461fea/R/pkg/inst/tests/testthat/test_sparkSQL.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index ed9b2c9..071fd31 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -724,6 +724,10 @@ test_that("sample on a DataFrame", {
   sampled2 <- sample(df, FALSE, 0.1, 0) # set seed for predictable result
   expect_true(count(sampled2) < 3)
 
+  count1 <- count(sample(df, FALSE, 0.1, 0))
+  count2 <- count(sample(df, FALSE, 0.1, 0))
+  expect_equal(count1, count2)
+
   # Also test sample_frac
   sampled3 <- sample_frac(df, FALSE, 0.1, 0) # set seed for predictable result
   expect_true(count(sampled3) < 3)


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to