Repository: spark Updated Branches: refs/heads/master 26a4cba3f -> ce112cec4
[SPARK-19395][SPARKR] Convert coefficients in summary to matrix ## What changes were proposed in this pull request? The `coefficients` component in the model summary should be a 'matrix', but its underlying structure is actually a list. This affects several models, with the exception of 'AFTSurvivalRegressionModel', which already has the correct implementation. The fix is to first `unlist` the coefficients returned from `callJMethod` before converting them to a matrix. The following example illustrates the issue: ``` data(iris) df <- createDataFrame(iris) model <- spark.glm(df, Sepal_Length ~ Sepal_Width, family = "gaussian") s <- summary(model) > str(s$coefficients) List of 8 $ : num 6.53 $ : num -0.223 $ : num 0.479 $ : num 0.155 $ : num 13.6 $ : num -1.44 $ : num 0 $ : num 0.152 - attr(*, "dim")= int [1:2] 2 4 - attr(*, "dimnames")=List of 2 ..$ : chr [1:2] "(Intercept)" "Sepal_Width" ..$ : chr [1:4] "Estimate" "Std. Error" "t value" "Pr(>|t|)" > s$coefficients[, 2] $`(Intercept)` [1] 0.4788963 $Sepal_Width [1] 0.1550809 ``` This shows that the underlying structure of coefficients is still `list`. felixcheung wangmiao1981 Author: actuaryzhang <actuaryzhan...@gmail.com> Closes #16730 from actuaryzhang/sparkRCoef. 
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ce112cec Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ce112cec Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ce112cec Branch: refs/heads/master Commit: ce112cec4f9bff222aa256893f94c316662a2a7e Parents: 26a4cba Author: actuaryzhang <actuaryzhan...@gmail.com> Authored: Tue Jan 31 12:20:43 2017 -0800 Committer: Felix Cheung <felixche...@apache.org> Committed: Tue Jan 31 12:20:43 2017 -0800 ---------------------------------------------------------------------- R/pkg/R/mllib_classification.R | 2 +- R/pkg/R/mllib_clustering.R | 2 +- R/pkg/R/mllib_regression.R | 4 +-- .../tests/testthat/test_mllib_classification.R | 17 +++++++----- .../inst/tests/testthat/test_mllib_clustering.R | 4 +++ .../inst/tests/testthat/test_mllib_regression.R | 27 +++++++++++--------- 6 files changed, 34 insertions(+), 22 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/ce112cec/R/pkg/R/mllib_classification.R ---------------------------------------------------------------------- diff --git a/R/pkg/R/mllib_classification.R b/R/pkg/R/mllib_classification.R index fee4a4c..552cbe4 100644 --- a/R/pkg/R/mllib_classification.R +++ b/R/pkg/R/mllib_classification.R @@ -145,7 +145,7 @@ setMethod("summary", signature(object = "LogisticRegressionModel"), labels <- callJMethod(jobj, "labels") coefficients <- callJMethod(jobj, "rCoefficients") nCol <- length(coefficients) / length(features) - coefficients <- matrix(coefficients, ncol = nCol) + coefficients <- matrix(unlist(coefficients), ncol = nCol) # If nCol == 1, means this is a binomial logistic regression model with pivoting. # Otherwise, it's a multinomial logistic regression model without pivoting. 
if (nCol == 1) { http://git-wip-us.apache.org/repos/asf/spark/blob/ce112cec/R/pkg/R/mllib_clustering.R ---------------------------------------------------------------------- diff --git a/R/pkg/R/mllib_clustering.R b/R/pkg/R/mllib_clustering.R index e384c73..3b782ce 100644 --- a/R/pkg/R/mllib_clustering.R +++ b/R/pkg/R/mllib_clustering.R @@ -390,7 +390,7 @@ setMethod("summary", signature(object = "KMeansModel"), coefficients <- callJMethod(jobj, "coefficients") k <- callJMethod(jobj, "k") size <- callJMethod(jobj, "size") - coefficients <- t(matrix(coefficients, ncol = k)) + coefficients <- t(matrix(unlist(coefficients), ncol = k)) colnames(coefficients) <- unlist(features) rownames(coefficients) <- 1:k cluster <- if (is.loaded) { http://git-wip-us.apache.org/repos/asf/spark/blob/ce112cec/R/pkg/R/mllib_regression.R ---------------------------------------------------------------------- diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R index 7908600..96ee220 100644 --- a/R/pkg/R/mllib_regression.R +++ b/R/pkg/R/mllib_regression.R @@ -182,11 +182,11 @@ setMethod("summary", signature(object = "GeneralizedLinearRegressionModel"), # coefficients, standard error of coefficients, t value and p value. Otherwise, # it will be fitted by local "l-bfgs", we can only provide coefficients. if (length(features) == length(coefficients)) { - coefficients <- matrix(coefficients, ncol = 1) + coefficients <- matrix(unlist(coefficients), ncol = 1) colnames(coefficients) <- c("Estimate") rownames(coefficients) <- unlist(features) } else { - coefficients <- matrix(coefficients, ncol = 4) + coefficients <- matrix(unlist(coefficients), ncol = 4) colnames(coefficients) <- c("Estimate", "Std. 
Error", "t value", "Pr(>|t|)") rownames(coefficients) <- unlist(features) } http://git-wip-us.apache.org/repos/asf/spark/blob/ce112cec/R/pkg/inst/tests/testthat/test_mllib_classification.R ---------------------------------------------------------------------- diff --git a/R/pkg/inst/tests/testthat/test_mllib_classification.R b/R/pkg/inst/tests/testthat/test_mllib_classification.R index 2e0dea3..5f84a62 100644 --- a/R/pkg/inst/tests/testthat/test_mllib_classification.R +++ b/R/pkg/inst/tests/testthat/test_mllib_classification.R @@ -68,12 +68,17 @@ test_that("spark.logit", { df <- suppressWarnings(createDataFrame(iris)) model <- spark.logit(df, Species ~ ., regParam = 0.5) summary <- summary(model) + + # test summary coefficients return matrix type + expect_true(class(summary$coefficients) == "matrix") + expect_true(class(summary$coefficients[, 1]) == "numeric") + versicolorCoefsR <- c(1.52, 0.03, -0.53, 0.04, 0.00) virginicaCoefsR <- c(-2.62, 0.27, -0.02, 0.16, 0.42) setosaCoefsR <- c(1.10, -0.29, 0.55, -0.19, -0.42) - versicolorCoefs <- unlist(summary$coefficients[, "versicolor"]) - virginicaCoefs <- unlist(summary$coefficients[, "virginica"]) - setosaCoefs <- unlist(summary$coefficients[, "setosa"]) + versicolorCoefs <- summary$coefficients[, "versicolor"] + virginicaCoefs <- summary$coefficients[, "virginica"] + setosaCoefs <- summary$coefficients[, "setosa"] expect_true(all(abs(versicolorCoefsR - versicolorCoefs) < 0.1)) expect_true(all(abs(virginicaCoefsR - virginicaCoefs) < 0.1)) expect_true(all(abs(setosaCoefs - setosaCoefs) < 0.1)) @@ -136,8 +141,8 @@ test_that("spark.logit", { summary <- summary(model) versicolorCoefsR <- c(3.94, -0.16, -0.02, -0.35, -0.78) virginicaCoefsR <- c(-3.94, 0.16, -0.02, 0.35, 0.78) - versicolorCoefs <- unlist(summary$coefficients[, "versicolor"]) - virginicaCoefs <- unlist(summary$coefficients[, "virginica"]) + versicolorCoefs <- summary$coefficients[, "versicolor"] + virginicaCoefs <- summary$coefficients[, "virginica"] 
expect_true(all(abs(versicolorCoefsR - versicolorCoefs) < 0.1)) expect_true(all(abs(virginicaCoefsR - virginicaCoefs) < 0.1)) @@ -145,7 +150,7 @@ test_that("spark.logit", { model <- spark.logit(training, Species ~ ., regParam = 0.5) summary <- summary(model) coefsR <- c(-6.08, 0.25, 0.16, 0.48, 1.04) - coefs <- unlist(summary$coefficients[, "Estimate"]) + coefs <- summary$coefficients[, "Estimate"] expect_true(all(abs(coefsR - coefs) < 0.1)) # Test prediction with string label http://git-wip-us.apache.org/repos/asf/spark/blob/ce112cec/R/pkg/inst/tests/testthat/test_mllib_clustering.R ---------------------------------------------------------------------- diff --git a/R/pkg/inst/tests/testthat/test_mllib_clustering.R b/R/pkg/inst/tests/testthat/test_mllib_clustering.R index aad834b..28a6eeb 100644 --- a/R/pkg/inst/tests/testthat/test_mllib_clustering.R +++ b/R/pkg/inst/tests/testthat/test_mllib_clustering.R @@ -166,6 +166,10 @@ test_that("spark.kmeans", { expect_equal(k, 2) expect_equal(sort(collect(distinct(select(cluster, "prediction")))$prediction), c(0, 1)) + # test summary coefficients return matrix type + expect_true(class(summary.model$coefficients) == "matrix") + expect_true(class(summary.model$coefficients[1, ]) == "numeric") + # Test model save/load modelPath <- tempfile(pattern = "spark-kmeans", fileext = ".tmp") write.ml(model, modelPath) http://git-wip-us.apache.org/repos/asf/spark/blob/ce112cec/R/pkg/inst/tests/testthat/test_mllib_regression.R ---------------------------------------------------------------------- diff --git a/R/pkg/inst/tests/testthat/test_mllib_regression.R b/R/pkg/inst/tests/testthat/test_mllib_regression.R index c450a15..81a5bdc 100644 --- a/R/pkg/inst/tests/testthat/test_mllib_regression.R +++ b/R/pkg/inst/tests/testthat/test_mllib_regression.R @@ -87,11 +87,14 @@ test_that("spark.glm summary", { # gaussian family training <- suppressWarnings(createDataFrame(iris)) stats <- summary(spark.glm(training, Sepal_Width ~ Sepal_Length + 
Species)) - rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species, data = iris)) - coefs <- unlist(stats$coefficients) - rCoefs <- unlist(rStats$coefficients) + # test summary coefficients return matrix type + expect_true(class(stats$coefficients) == "matrix") + expect_true(class(stats$coefficients[, 1]) == "numeric") + + coefs <- stats$coefficients + rCoefs <- rStats$coefficients expect_true(all(abs(rCoefs - coefs) < 1e-4)) expect_true(all( rownames(stats$coefficients) == @@ -117,8 +120,8 @@ test_that("spark.glm summary", { rStats <- summary(glm(Species ~ Sepal.Length + Sepal.Width, data = rTraining, family = binomial(link = "logit"))) - coefs <- unlist(stats$coefficients) - rCoefs <- unlist(rStats$coefficients) + coefs <- stats$coefficients + rCoefs <- rStats$coefficients expect_true(all(abs(rCoefs - coefs) < 1e-4)) expect_true(all( rownames(stats$coefficients) == @@ -141,8 +144,8 @@ test_that("spark.glm summary", { stats <- summary(spark.glm(df, b ~ a1 + a2, family = "binomial", weightCol = "w")) rStats <- summary(glm(b ~ a1 + a2, family = "binomial", data = data, weights = w)) - coefs <- unlist(stats$coefficients) - rCoefs <- unlist(rStats$coefficients) + coefs <- stats$coefficients + rCoefs <- rStats$coefficients expect_true(all(abs(rCoefs - coefs) < 1e-3)) expect_true(all(rownames(stats$coefficients) == c("(Intercept)", "a1", "a2"))) expect_equal(stats$dispersion, rStats$dispersion) @@ -169,7 +172,7 @@ test_that("spark.glm summary", { data <- as.data.frame(cbind(A, b)) df <- createDataFrame(data) stats <- summary(spark.glm(df, b ~ . 
- 1)) - coefs <- unlist(stats$coefficients) + coefs <- stats$coefficients expect_true(all(abs(c(0.5, 0.25) - coefs) < 1e-4)) }) @@ -259,8 +262,8 @@ test_that("glm summary", { rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species, data = iris)) - coefs <- unlist(stats$coefficients) - rCoefs <- unlist(rStats$coefficients) + coefs <- stats$coefficients + rCoefs <- rStats$coefficients expect_true(all(abs(rCoefs - coefs) < 1e-4)) expect_true(all( rownames(stats$coefficients) == @@ -282,8 +285,8 @@ test_that("glm summary", { rStats <- summary(glm(Species ~ Sepal.Length + Sepal.Width, data = rTraining, family = binomial(link = "logit"))) - coefs <- unlist(stats$coefficients) - rCoefs <- unlist(rStats$coefficients) + coefs <- stats$coefficients + rCoefs <- rStats$coefficients expect_true(all(abs(rCoefs - coefs) < 1e-4)) expect_true(all( rownames(stats$coefficients) == --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org