spark git commit: [SPARK-15177][.1][R] make SparkR model params and default values consistent with MLlib

meng Tue, 21 Jun 2016 08:31:41 -0700

Repository: spark
Updated Branches:
  refs/heads/master f3a768b7b -> 4f83ca105



[SPARK-15177][.1][R] make SparkR model params and default values consistent 
with MLlib

## What changes were proposed in this pull request?

This PR is a subset of #13023 by yanboliang to make SparkR model param names 
and default values consistent with MLlib. I tried to avoid other changes from 
#13023 to keep this PR minimal. I will send a follow-up PR to improve the 
documentation.

Main changes:
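* `spark.glm`: epsilon -> tol, maxit -> maxIter (the R-compliant `glm` 
wrapper keeps `epsilon`/`maxit` and forwards them as `tol`/`maxIter`)
* `spark.kmeans`: default k -> 2 (previously required), default maxIter -> 20 
(was 10), default initMode -> "k-means||" (was "random")
* `spark.naiveBayes`: laplace -> smoothing, default 1.0 (was 0)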

## How was this patch tested?

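Existing unit tests. The `spark.kmeans` and `spark.naiveBayes` tests now pass 
the previous defaults explicitly (`maxIter = 10`, `initMode = "random"`, 
`smoothing = 0.0`), so their expected values are unchanged.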

Author: Xiangrui Meng <m...@databricks.com>

Closes #13801 from mengxr/SPARK-15177.1.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4f83ca10
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4f83ca10
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4f83ca10

Branch: refs/heads/master
Commit: 4f83ca1059a3b580fca3f006974ff5ac4d5212a1
Parents: f3a768b
Author: Xiangrui Meng <m...@databricks.com>
Authored: Tue Jun 21 08:31:15 2016 -0700
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Tue Jun 21 08:31:15 2016 -0700

----------------------------------------------------------------------
 R/pkg/R/mllib.R                                 | 74 ++++++++++----------
 R/pkg/inst/tests/testthat/test_mllib.R          |  4 +-
 .../r/GeneralizedLinearRegressionWrapper.scala  |  8 +--
 .../apache/spark/ml/r/NaiveBayesWrapper.scala   |  4 +-
 4 files changed, 44 insertions(+), 46 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/4f83ca10/R/pkg/R/mllib.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R
index 74dba8f..b83b3b3 100644
--- a/R/pkg/R/mllib.R
+++ b/R/pkg/R/mllib.R
@@ -64,8 +64,8 @@ setClass("KMeansModel", representation(jobj = "jobj"))
 #'               This can be a character string naming a family function, a 
family function or
 #'               the result of a call to a family function. Refer R family at
 #'               
\url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}.
-#' @param epsilon Positive convergence tolerance of iterations.
-#' @param maxit Integer giving the maximal number of IRLS iterations.
+#' @param tol Positive convergence tolerance of iterations.
+#' @param maxIter Integer giving the maximal number of IRLS iterations.
 #' @return a fitted generalized linear model
 #' @rdname spark.glm
 #' @export
@@ -74,32 +74,30 @@ setClass("KMeansModel", representation(jobj = "jobj"))
 #' sparkR.session()
 #' data(iris)
 #' df <- createDataFrame(iris)
-#' model <- spark.glm(df, Sepal_Length ~ Sepal_Width, family="gaussian")
+#' model <- spark.glm(df, Sepal_Length ~ Sepal_Width, family = "gaussian")
 #' summary(model)
 #' }
 #' @note spark.glm since 2.0.0
-setMethod(
-    "spark.glm",
-    signature(data = "SparkDataFrame", formula = "formula"),
-    function(data, formula, family = gaussian, epsilon = 1e-06, maxit = 25) {
-        if (is.character(family)) {
-            family <- get(family, mode = "function", envir = parent.frame())
-        }
-        if (is.function(family)) {
-            family <- family()
-        }
-        if (is.null(family$family)) {
-            print(family)
-            stop("'family' not recognized")
-        }
+setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
+          function(data, formula, family = gaussian, tol = 1e-6, maxIter = 25) 
{
+            if (is.character(family)) {
+              family <- get(family, mode = "function", envir = parent.frame())
+            }
+            if (is.function(family)) {
+              family <- family()
+            }
+            if (is.null(family$family)) {
+              print(family)
+              stop("'family' not recognized")
+            }
 
-        formula <- paste(deparse(formula), collapse = "")
+            formula <- paste(deparse(formula), collapse = "")
 
-        jobj <- 
callJStatic("org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper",
-        "fit", formula, data@sdf, family$family, family$link,
-        epsilon, as.integer(maxit))
-        return(new("GeneralizedLinearRegressionModel", jobj = jobj))
-})
+            jobj <- 
callJStatic("org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper",
+                                "fit", formula, data@sdf, family$family, 
family$link,
+                                tol, as.integer(maxIter))
+            return(new("GeneralizedLinearRegressionModel", jobj = jobj))
+          })
 
 #' Fits a generalized linear model (R-compliant).
 #'
@@ -122,13 +120,13 @@ setMethod(
 #' sparkR.session()
 #' data(iris)
 #' df <- createDataFrame(iris)
-#' model <- glm(Sepal_Length ~ Sepal_Width, df, family="gaussian")
+#' model <- glm(Sepal_Length ~ Sepal_Width, df, family = "gaussian")
 #' summary(model)
 #' }
 #' @note glm since 1.5.0
 setMethod("glm", signature(formula = "formula", family = "ANY", data = 
"SparkDataFrame"),
-          function(formula, family = gaussian, data, epsilon = 1e-06, maxit = 
25) {
-            spark.glm(data, formula, family, epsilon, maxit)
+          function(formula, family = gaussian, data, epsilon = 1e-6, maxit = 
25) {
+            spark.glm(data, formula, family, tol = epsilon, maxIter = maxit)
           })
 
 #' Get the summary of a generalized linear model
@@ -296,17 +294,17 @@ setMethod("summary", signature(object = 
"NaiveBayesModel"),
 #' @export
 #' @examples
 #' \dontrun{
-#' model <- spark.kmeans(data, ~ ., k=2, initMode="random")
+#' model <- spark.kmeans(data, ~ ., k = 4, initMode = "random")
 #' }
 #' @note spark.kmeans since 2.0.0
 setMethod("spark.kmeans", signature(data = "SparkDataFrame", formula = 
"formula"),
-          function(data, formula, k, maxIter = 10, initMode = c("random", 
"k-means||")) {
+          function(data, formula, k = 2, maxIter = 20, initMode = 
c("k-means||", "random")) {
             formula <- paste(deparse(formula), collapse = "")
             initMode <- match.arg(initMode)
             jobj <- callJStatic("org.apache.spark.ml.r.KMeansWrapper", "fit", 
data@sdf, formula,
                                 as.integer(k), as.integer(maxIter), initMode)
             return(new("KMeansModel", jobj = jobj))
-         })
+          })
 
 #' Get fitted result from a k-means model
 #'
@@ -397,7 +395,7 @@ setMethod("predict", signature(object = "KMeansModel"),
 #' @param data SparkDataFrame for training
 #' @param formula A symbolic description of the model to be fitted. Currently 
only a few formula
 #'               operators are supported, including '~', '.', ':', '+', and 
'-'.
-#' @param laplace Smoothing parameter
+#' @param smoothing Smoothing parameter
 #' @return a fitted naive Bayes model
 #' @rdname spark.naiveBayes
 #' @seealso e1071: \url{https://cran.r-project.org/web/packages/e1071/}
@@ -405,16 +403,16 @@ setMethod("predict", signature(object = "KMeansModel"),
 #' @examples
 #' \dontrun{
 #' df <- createDataFrame(infert)
-#' model <- spark.naiveBayes(df, education ~ ., laplace = 0)
+#' model <- spark.naiveBayes(df, education ~ ., smoothing = 0)
 #'}
 #' @note spark.naiveBayes since 2.0.0
 setMethod("spark.naiveBayes", signature(data = "SparkDataFrame", formula = 
"formula"),
-    function(data, formula, laplace = 0, ...) {
-        formula <- paste(deparse(formula), collapse = "")
-        jobj <- callJStatic("org.apache.spark.ml.r.NaiveBayesWrapper", "fit",
-          formula, data@sdf, laplace)
-        return(new("NaiveBayesModel", jobj = jobj))
-    })
+          function(data, formula, smoothing = 1.0, ...) {
+            formula <- paste(deparse(formula), collapse = "")
+            jobj <- callJStatic("org.apache.spark.ml.r.NaiveBayesWrapper", 
"fit",
+            formula, data@sdf, smoothing)
+            return(new("NaiveBayesModel", jobj = jobj))
+          })
 
 #' Save fitted MLlib model to the input path
 #'
@@ -431,7 +429,7 @@ setMethod("spark.naiveBayes", signature(data = 
"SparkDataFrame", formula = "form
 #' @examples
 #' \dontrun{
 #' df <- createDataFrame(infert)
-#' model <- spark.naiveBayes(df, education ~ ., laplace = 0)
+#' model <- spark.naiveBayes(df, education ~ ., smoothing = 0)
 #' path <- "path/to/model"
 #' write.ml(model, path)
 #' }

http://git-wip-us.apache.org/repos/asf/spark/blob/4f83ca10/R/pkg/inst/tests/testthat/test_mllib.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_mllib.R 
b/R/pkg/inst/tests/testthat/test_mllib.R
index c8c5ef2..753da81 100644
--- a/R/pkg/inst/tests/testthat/test_mllib.R
+++ b/R/pkg/inst/tests/testthat/test_mllib.R
@@ -288,7 +288,7 @@ test_that("spark.kmeans", {
 
   take(training, 1)
 
-  model <- spark.kmeans(data = training, ~ ., k = 2)
+  model <- spark.kmeans(data = training, ~ ., k = 2, maxIter = 10, initMode = 
"random")
   sample <- take(select(predict(model, training), "prediction"), 1)
   expect_equal(typeof(sample$prediction), "integer")
   expect_equal(sample$prediction, 1)
@@ -363,7 +363,7 @@ test_that("spark.naiveBayes", {
   t <- as.data.frame(Titanic)
   t1 <- t[t$Freq > 0, -5]
   df <- suppressWarnings(createDataFrame(t1))
-  m <- spark.naiveBayes(df, Survived ~ .)
+  m <- spark.naiveBayes(df, Survived ~ ., smoothing = 0.0)
   s <- summary(m)
   expect_equal(as.double(s$apriori[1, "Yes"]), 0.5833333, tolerance = 1e-6)
   expect_equal(sum(s$apriori), 1)

http://git-wip-us.apache.org/repos/asf/spark/blob/4f83ca10/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala
----------------------------------------------------------------------
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala
 
b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala
index 9618a34..5642abc 100644
--- 
a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala
@@ -67,8 +67,8 @@ private[r] object GeneralizedLinearRegressionWrapper
       data: DataFrame,
       family: String,
       link: String,
-      epsilon: Double,
-      maxit: Int): GeneralizedLinearRegressionWrapper = {
+      tol: Double,
+      maxIter: Int): GeneralizedLinearRegressionWrapper = {
     val rFormula = new RFormula()
       .setFormula(formula)
     val rFormulaModel = rFormula.fit(data)
@@ -82,8 +82,8 @@ private[r] object GeneralizedLinearRegressionWrapper
       .setFamily(family)
       .setLink(link)
       .setFitIntercept(rFormula.hasIntercept)
-      .setTol(epsilon)
-      .setMaxIter(maxit)
+      .setTol(tol)
+      .setMaxIter(maxIter)
     val pipeline = new Pipeline()
       .setStages(Array(rFormulaModel, glr))
       .fit(data)

http://git-wip-us.apache.org/repos/asf/spark/blob/4f83ca10/mllib/src/main/scala/org/apache/spark/ml/r/NaiveBayesWrapper.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/NaiveBayesWrapper.scala 
b/mllib/src/main/scala/org/apache/spark/ml/r/NaiveBayesWrapper.scala
index 28925c7..1dac246 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/r/NaiveBayesWrapper.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/r/NaiveBayesWrapper.scala
@@ -56,7 +56,7 @@ private[r] object NaiveBayesWrapper extends 
MLReadable[NaiveBayesWrapper] {
   val PREDICTED_LABEL_INDEX_COL = "pred_label_idx"
   val PREDICTED_LABEL_COL = "prediction"
 
-  def fit(formula: String, data: DataFrame, laplace: Double): 
NaiveBayesWrapper = {
+  def fit(formula: String, data: DataFrame, smoothing: Double): 
NaiveBayesWrapper = {
     val rFormula = new RFormula()
       .setFormula(formula)
       .fit(data)
@@ -70,7 +70,7 @@ private[r] object NaiveBayesWrapper extends 
MLReadable[NaiveBayesWrapper] {
     val features = featureAttrs.map(_.name.get)
     // assemble and fit the pipeline
     val naiveBayes = new NaiveBayes()
-      .setSmoothing(laplace)
+      .setSmoothing(smoothing)
       .setModelType("bernoulli")
       .setPredictionCol(PREDICTED_LABEL_INDEX_COL)
     val idxToStr = new IndexToString()


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-15177][.1][R] make SparkR model params and default values consistent with MLlib

Reply via email to