Repository: spark
Updated Branches:
  refs/heads/master cad88f17e -> ad459cfb1


[SPARK-20917][ML][SPARKR] SparkR supports string encoding consistent with R

## What changes were proposed in this pull request?

Add `stringIndexerOrderType` to `spark.glm` and `spark.survreg` to support 
string encoding that is consistent with default R.

## How was this patch tested?
new tests

Author: actuaryzhang <actuaryzhan...@gmail.com>

Closes #18140 from actuaryzhang/sparkRFormula.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ad459cfb
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ad459cfb
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ad459cfb

Branch: refs/heads/master
Commit: ad459cfb1d169d8dd7b9e039ca135ba5cafcab83
Parents: cad88f1
Author: actuaryzhang <actuaryzhan...@gmail.com>
Authored: Wed Jun 21 10:35:16 2017 -0700
Committer: Felix Cheung <felixche...@apache.org>
Committed: Wed Jun 21 10:35:16 2017 -0700

----------------------------------------------------------------------
 R/pkg/R/mllib_regression.R                      | 52 +++++++++++++---
 R/pkg/tests/fulltests/test_mllib_regression.R   | 62 ++++++++++++++++++++
 .../ml/r/AFTSurvivalRegressionWrapper.scala     |  4 +-
 .../r/GeneralizedLinearRegressionWrapper.scala  |  6 +-
 4 files changed, 115 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/ad459cfb/R/pkg/R/mllib_regression.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R
index d59c890..9ecd887 100644
--- a/R/pkg/R/mllib_regression.R
+++ b/R/pkg/R/mllib_regression.R
@@ -70,6 +70,12 @@ setClass("IsotonicRegressionModel", representation(jobj = 
"jobj"))
 #'                      the relationship between the variance and mean of the 
distribution. Only
 #'                      applicable to the Tweedie family.
 #' @param link.power the index in the power link function. Only applicable to 
the Tweedie family.
+#' @param stringIndexerOrderType how to order categories of a string feature 
column. This is used to
+#'                               decide the base level of a string feature as 
the last category after
+#'                               ordering is dropped when encoding strings. 
Supported options are
+#'                               "frequencyDesc", "frequencyAsc", 
"alphabetDesc", and "alphabetAsc".
+#'                               The default value is "frequencyDesc". When 
the ordering is set to
+#'                               "alphabetDesc", this drops the same category 
as R when encoding strings.
 #' @param ... additional arguments passed to the method.
 #' @aliases spark.glm,SparkDataFrame,formula-method
 #' @return \code{spark.glm} returns a fitted generalized linear model.
@@ -79,7 +85,7 @@ setClass("IsotonicRegressionModel", representation(jobj = 
"jobj"))
 #' @examples
 #' \dontrun{
 #' sparkR.session()
-#' t <- as.data.frame(Titanic)
+#' t <- as.data.frame(Titanic, stringsAsFactors = FALSE)
 #' df <- createDataFrame(t)
 #' model <- spark.glm(df, Freq ~ Sex + Age, family = "gaussian")
 #' summary(model)
@@ -96,6 +102,15 @@ setClass("IsotonicRegressionModel", representation(jobj = 
"jobj"))
 #' savedModel <- read.ml(path)
 #' summary(savedModel)
 #'
+#' # note that the default string encoding is different from R's glm
+#' model2 <- glm(Freq ~ Sex + Age, family = "gaussian", data = t)
+#' summary(model2)
+#' # use stringIndexerOrderType = "alphabetDesc" to force string encoding
+#' # to be consistent with R
+#' model3 <- spark.glm(df, Freq ~ Sex + Age, family = "gaussian",
+#'                    stringIndexerOrderType = "alphabetDesc")
+#' summary(model3)
+#'
 #' # fit tweedie model
 #' model <- spark.glm(df, Freq ~ Sex + Age, family = "tweedie",
 #'                    var.power = 1.2, link.power = 0)
@@ -110,8 +125,11 @@ setClass("IsotonicRegressionModel", representation(jobj = 
"jobj"))
 #' @seealso \link{glm}, \link{read.ml}
 setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
           function(data, formula, family = gaussian, tol = 1e-6, maxIter = 25, 
weightCol = NULL,
-                   regParam = 0.0, var.power = 0.0, link.power = 1.0 - 
var.power) {
+                   regParam = 0.0, var.power = 0.0, link.power = 1.0 - 
var.power,
+                   stringIndexerOrderType = c("frequencyDesc", "frequencyAsc",
+                                              "alphabetDesc", "alphabetAsc")) {
 
+            stringIndexerOrderType <- match.arg(stringIndexerOrderType)
             if (is.character(family)) {
               # Handle when family = "tweedie"
               if (tolower(family) == "tweedie") {
@@ -145,7 +163,8 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", 
formula = "formula"),
             jobj <- 
callJStatic("org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper",
                                 "fit", formula, data@sdf, 
tolower(family$family), family$link,
                                 tol, as.integer(maxIter), weightCol, regParam,
-                                as.double(var.power), as.double(link.power))
+                                as.double(var.power), as.double(link.power),
+                                stringIndexerOrderType)
             new("GeneralizedLinearRegressionModel", jobj = jobj)
           })
 
@@ -167,6 +186,12 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", 
formula = "formula"),
 #' @param maxit integer giving the maximal number of IRLS iterations.
 #' @param var.power the index of the power variance function in the Tweedie 
family.
 #' @param link.power the index of the power link function in the Tweedie 
family.
+#' @param stringIndexerOrderType how to order categories of a string feature 
column. This is used to
+#'                               decide the base level of a string feature as 
the last category after
+#'                               ordering is dropped when encoding strings. 
Supported options are
+#'                               "frequencyDesc", "frequencyAsc", 
"alphabetDesc", and "alphabetAsc".
+#'                               The default value is "frequencyDesc". When 
the ordering is set to
+#'                               "alphabetDesc", this drops the same category 
as R when encoding strings.
 #' @return \code{glm} returns a fitted generalized linear model.
 #' @rdname glm
 #' @export
@@ -182,9 +207,12 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", 
formula = "formula"),
 #' @seealso \link{spark.glm}
 setMethod("glm", signature(formula = "formula", family = "ANY", data = 
"SparkDataFrame"),
           function(formula, family = gaussian, data, epsilon = 1e-6, maxit = 
25, weightCol = NULL,
-                   var.power = 0.0, link.power = 1.0 - var.power) {
+                   var.power = 0.0, link.power = 1.0 - var.power,
+                   stringIndexerOrderType = c("frequencyDesc", "frequencyAsc",
+                                              "alphabetDesc", "alphabetAsc")) {
             spark.glm(data, formula, family, tol = epsilon, maxIter = maxit, 
weightCol = weightCol,
-                      var.power = var.power, link.power = link.power)
+                      var.power = var.power, link.power = link.power,
+                      stringIndexerOrderType = stringIndexerOrderType)
           })
 
 #  Returns the summary of a model produced by glm() or spark.glm(), similarly 
to R's summary().
@@ -418,6 +446,12 @@ setMethod("write.ml", signature(object = 
"IsotonicRegressionModel", path = "char
 #' @param aggregationDepth The depth for treeAggregate (greater than or equal 
to 2). If the dimensions of features
 #'                         or the number of partitions are large, this param 
could be adjusted to a larger size.
 #'                         This is an expert parameter. Default value should 
be good for most cases.
+#' @param stringIndexerOrderType how to order categories of a string feature 
column. This is used to
+#'                               decide the base level of a string feature as 
the last category after
+#'                               ordering is dropped when encoding strings. 
Supported options are
+#'                               "frequencyDesc", "frequencyAsc", 
"alphabetDesc", and "alphabetAsc".
+#'                               The default value is "frequencyDesc". When 
the ordering is set to
+#'                               "alphabetDesc", this drops the same category 
as R when encoding strings.
 #' @param ... additional arguments passed to the method.
 #' @return \code{spark.survreg} returns a fitted AFT survival regression model.
 #' @rdname spark.survreg
@@ -443,10 +477,14 @@ setMethod("write.ml", signature(object = 
"IsotonicRegressionModel", path = "char
 #' }
 #' @note spark.survreg since 2.0.0
 setMethod("spark.survreg", signature(data = "SparkDataFrame", formula = 
"formula"),
-          function(data, formula, aggregationDepth = 2) {
+          function(data, formula, aggregationDepth = 2,
+                   stringIndexerOrderType = c("frequencyDesc", "frequencyAsc",
+                                              "alphabetDesc", "alphabetAsc")) {
+            stringIndexerOrderType <- match.arg(stringIndexerOrderType)
             formula <- paste(deparse(formula), collapse = "")
             jobj <- 
callJStatic("org.apache.spark.ml.r.AFTSurvivalRegressionWrapper",
-                                "fit", formula, data@sdf, 
as.integer(aggregationDepth))
+                                "fit", formula, data@sdf, 
as.integer(aggregationDepth),
+                                stringIndexerOrderType)
             new("AFTSurvivalRegressionModel", jobj = jobj)
           })
 

http://git-wip-us.apache.org/repos/asf/spark/blob/ad459cfb/R/pkg/tests/fulltests/test_mllib_regression.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_mllib_regression.R 
b/R/pkg/tests/fulltests/test_mllib_regression.R
index 82472c9..6b72a09 100644
--- a/R/pkg/tests/fulltests/test_mllib_regression.R
+++ b/R/pkg/tests/fulltests/test_mllib_regression.R
@@ -367,6 +367,49 @@ test_that("glm save/load", {
   unlink(modelPath)
 })
 
+test_that("spark.glm and glm with string encoding", {
+  t <- as.data.frame(Titanic, stringsAsFactors = FALSE)
+  df <- createDataFrame(t)
+
+  # base R
+  rm <- stats::glm(Freq ~ Sex + Age, family = "gaussian", data = t)
+  # spark.glm with default stringIndexerOrderType = "frequencyDesc"
+  sm0 <- spark.glm(df, Freq ~ Sex + Age, family = "gaussian")
+  # spark.glm with stringIndexerOrderType = "alphabetDesc"
+  sm1 <- spark.glm(df, Freq ~ Sex + Age, family = "gaussian",
+                   stringIndexerOrderType = "alphabetDesc")
+  # glm with stringIndexerOrderType = "alphabetDesc"
+  sm2 <- glm(Freq ~ Sex + Age, family = "gaussian", data = df,
+                stringIndexerOrderType = "alphabetDesc")
+
+  rStats <- summary(rm)
+  rCoefs <- rStats$coefficients
+  sStats <- lapply(list(sm0, sm1, sm2), summary)
+  # order by coefficient size since column rendering may be different
+  o <- order(rCoefs[, 1])
+
+  # default encoding does not produce same results as R
+  expect_false(all(abs(rCoefs[o, ] - sStats[[1]]$coefficients[o, ]) < 1e-4))
+
+  # all estimates should be the same as R with stringIndexerOrderType = 
"alphabetDesc"
+  test <- lapply(sStats[2:3], function(stats) {
+    expect_true(all(abs(rCoefs[o, ] - stats$coefficients[o, ]) < 1e-4))
+    expect_equal(stats$dispersion, rStats$dispersion)
+    expect_equal(stats$null.deviance, rStats$null.deviance)
+    expect_equal(stats$deviance, rStats$deviance)
+    expect_equal(stats$df.null, rStats$df.null)
+    expect_equal(stats$df.residual, rStats$df.residual)
+    expect_equal(stats$aic, rStats$aic)
+  })
+
+  # fitted values should be equal regardless of string encoding
+  rVals <- predict(rm, t)
+  test <- lapply(list(sm0, sm1, sm2), function(sm) {
+    vals <- collect(select(predict(sm, df), "prediction"))
+    expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
+  })
+})
+
 test_that("spark.isoreg", {
   label <- c(7.0, 5.0, 3.0, 5.0, 1.0)
   feature <- c(0.0, 1.0, 2.0, 3.0, 4.0)
@@ -462,6 +505,25 @@ test_that("spark.survreg", {
       model <- survival::survreg(formula = survival::Surv(time, status) ~ x + 
sex, data = rData),
                                  NA)
     expect_equal(predict(model, rData)[[1]], 3.724591, tolerance = 1e-4)
+
+    # Test stringIndexerOrderType
+    rData <- as.data.frame(rData)
+    rData$sex2 <- c("female", "male")[rData$sex + 1]
+    df <- createDataFrame(rData)
+    expect_error(
+      rModel <- survival::survreg(survival::Surv(time, status) ~ x + sex2, 
rData), NA)
+    rCoefs <- as.numeric(summary(rModel)$table[, 1])
+    model <- spark.survreg(df, Surv(time, status) ~ x + sex2)
+    coefs <- as.vector(summary(model)$coefficients[, 1])
+    o <- order(rCoefs)
+    # stringIndexerOrderType = "frequencyDesc" produces different estimates 
from R
+    expect_false(all(abs(rCoefs[o] - coefs[o]) < 1e-4))
+
+    # stringIndexerOrderType = "alphabetDesc" produces the same estimates as R
+    model <- spark.survreg(df, Surv(time, status) ~ x + sex2,
+                           stringIndexerOrderType = "alphabetDesc")
+    coefs <- as.vector(summary(model)$coefficients[, 1])
+    expect_true(all(abs(rCoefs[o] - coefs[o]) < 1e-4))
   }
 })
 

http://git-wip-us.apache.org/repos/asf/spark/blob/ad459cfb/mllib/src/main/scala/org/apache/spark/ml/r/AFTSurvivalRegressionWrapper.scala
----------------------------------------------------------------------
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/r/AFTSurvivalRegressionWrapper.scala 
b/mllib/src/main/scala/org/apache/spark/ml/r/AFTSurvivalRegressionWrapper.scala
index 0bf543d..80d03ab 100644
--- 
a/mllib/src/main/scala/org/apache/spark/ml/r/AFTSurvivalRegressionWrapper.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/ml/r/AFTSurvivalRegressionWrapper.scala
@@ -85,11 +85,13 @@ private[r] object AFTSurvivalRegressionWrapper extends 
MLReadable[AFTSurvivalReg
   def fit(
       formula: String,
       data: DataFrame,
-      aggregationDepth: Int): AFTSurvivalRegressionWrapper = {
+      aggregationDepth: Int,
+      stringIndexerOrderType: String): AFTSurvivalRegressionWrapper = {
 
     val (rewritedFormula, censorCol) = formulaRewrite(formula)
 
     val rFormula = new RFormula().setFormula(rewritedFormula)
+      .setStringIndexerOrderType(stringIndexerOrderType)
     RWrapperUtils.checkDataColumns(rFormula, data)
     val rFormulaModel = rFormula.fit(data)
 

http://git-wip-us.apache.org/repos/asf/spark/blob/ad459cfb/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala
----------------------------------------------------------------------
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala
 
b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala
index 4bd4aa7..ee1fc9b 100644
--- 
a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala
@@ -65,6 +65,7 @@ private[r] class GeneralizedLinearRegressionWrapper private (
 private[r] object GeneralizedLinearRegressionWrapper
   extends MLReadable[GeneralizedLinearRegressionWrapper] {
 
+  // scalastyle:off
   def fit(
       formula: String,
       data: DataFrame,
@@ -75,8 +76,11 @@ private[r] object GeneralizedLinearRegressionWrapper
       weightCol: String,
       regParam: Double,
       variancePower: Double,
-      linkPower: Double): GeneralizedLinearRegressionWrapper = {
+      linkPower: Double,
+      stringIndexerOrderType: String): GeneralizedLinearRegressionWrapper = {
+  // scalastyle:on
     val rFormula = new RFormula().setFormula(formula)
+      .setStringIndexerOrderType(stringIndexerOrderType)
     checkDataColumns(rFormula, data)
     val rFormulaModel = rFormula.fit(data)
     // get labels and feature names from output schema


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to