Repository: spark
Updated Branches:
  refs/heads/branch-1.6 f38509a76 -> 54685fa36


[ML][R] SparkR::glm summary result to compare with native R

Follow up #9561. Due to 
[SPARK-11587](https://issues.apache.org/jira/browse/SPARK-11587) has been 
fixed, we should compare SparkR::glm summary result with native R output rather 
than hard-code one. mengxr

Author: Yanbo Liang <yblia...@gmail.com>

Closes #9590 from yanboliang/glm-r-test.

(cherry picked from commit f14e95115c0939a77ebcb00209696a87fd651ff9)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/54685fa3
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/54685fa3
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/54685fa3

Branch: refs/heads/branch-1.6
Commit: 54685fa3637e7babf509adfeea0d6af652eeeb7e
Parents: f38509a
Author: Yanbo Liang <yblia...@gmail.com>
Authored: Tue Nov 10 11:34:36 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Tue Nov 10 11:34:44 2015 -0800

----------------------------------------------------------------------
 R/pkg/R/mllib.R               |  2 +-
 R/pkg/inst/tests/test_mllib.R | 31 ++++++++++---------------------
 2 files changed, 11 insertions(+), 22 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/54685fa3/R/pkg/R/mllib.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R
index 7126b7c..f23e1c7 100644
--- a/R/pkg/R/mllib.R
+++ b/R/pkg/R/mllib.R
@@ -106,7 +106,7 @@ setMethod("summary", signature(object = "PipelineModel"),
               coefficients <- matrix(coefficients, ncol = 4)
               colnames(coefficients) <- c("Estimate", "Std. Error", "t value", 
"Pr(>|t|)")
               rownames(coefficients) <- unlist(features)
-              return(list(DevianceResiduals = devianceResiduals, Coefficients 
= coefficients))
+              return(list(devianceResiduals = devianceResiduals, coefficients 
= coefficients))
             } else {
               coefficients <- as.matrix(unlist(coefficients))
               colnames(coefficients) <- c("Estimate")

http://git-wip-us.apache.org/repos/asf/spark/blob/54685fa3/R/pkg/inst/tests/test_mllib.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/test_mllib.R b/R/pkg/inst/tests/test_mllib.R
index 42287ea..d497ad8 100644
--- a/R/pkg/inst/tests/test_mllib.R
+++ b/R/pkg/inst/tests/test_mllib.R
@@ -72,22 +72,17 @@ test_that("feature interaction vs native glm", {
 test_that("summary coefficients match with native glm", {
   training <- createDataFrame(sqlContext, iris)
   stats <- summary(glm(Sepal_Width ~ Sepal_Length + Species, data = training, 
solver = "normal"))
-  coefs <- unlist(stats$Coefficients)
-  devianceResiduals <- unlist(stats$DevianceResiduals)
+  coefs <- unlist(stats$coefficients)
+  devianceResiduals <- unlist(stats$devianceResiduals)
 
-  rCoefs <- as.vector(coef(glm(Sepal.Width ~ Sepal.Length + Species, data = 
iris)))
-  rStdError <- c(0.23536, 0.04630, 0.07207, 0.09331)
-  rTValue <- c(7.123, 7.557, -13.644, -10.798)
-  rPValue <- c(0.0, 0.0, 0.0, 0.0)
+  rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species, data = iris))
+  rCoefs <- unlist(rStats$coefficients)
   rDevianceResiduals <- c(-0.95096, 0.72918)
 
-  expect_true(all(abs(rCoefs - coefs[1:4]) < 1e-6))
-  expect_true(all(abs(rStdError - coefs[5:8]) < 1e-5))
-  expect_true(all(abs(rTValue - coefs[9:12]) < 1e-3))
-  expect_true(all(abs(rPValue - coefs[13:16]) < 1e-6))
+  expect_true(all(abs(rCoefs - coefs) < 1e-5))
   expect_true(all(abs(rDevianceResiduals - devianceResiduals) < 1e-5))
   expect_true(all(
-    rownames(stats$Coefficients) ==
+    rownames(stats$coefficients) ==
     c("(Intercept)", "Sepal_Length", "Species_versicolor", 
"Species_virginica")))
 })
 
@@ -96,21 +91,15 @@ test_that("summary coefficients match with native glm of 
family 'binomial'", {
   training <- filter(df, df$Species != "setosa")
   stats <- summary(glm(Species ~ Sepal_Length + Sepal_Width, data = training,
     family = "binomial"))
-  coefs <- as.vector(stats$Coefficients)
+  coefs <- as.vector(stats$coefficients[,1])
 
   rTraining <- iris[iris$Species %in% c("versicolor","virginica"),]
   rCoefs <- as.vector(coef(glm(Species ~ Sepal.Length + Sepal.Width, data = 
rTraining,
     family = binomial(link = "logit"))))
-  rStdError <- c(3.0974, 0.5169, 0.8628)
-  rTValue <- c(-4.212, 3.680, 0.469)
-  rPValue <- c(0.000, 0.000, 0.639)
-
-  expect_true(all(abs(rCoefs - coefs[1:3]) < 1e-4))
-  expect_true(all(abs(rStdError - coefs[4:6]) < 1e-4))
-  expect_true(all(abs(rTValue - coefs[7:9]) < 1e-3))
-  expect_true(all(abs(rPValue - coefs[10:12]) < 1e-3))
+
+  expect_true(all(abs(rCoefs - coefs) < 1e-4))
   expect_true(all(
-    rownames(stats$Coefficients) ==
+    rownames(stats$coefficients) ==
     c("(Intercept)", "Sepal_Length", "Sepal_Width")))
 })
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to