http://git-wip-us.apache.org/repos/asf/spark/blob/6b6b555a/R/pkg/R/mllib.R ---------------------------------------------------------------------- diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R deleted file mode 100644 index d736bbb..0000000 --- a/R/pkg/R/mllib.R +++ /dev/null @@ -1,2114 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements.  See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License.  You may obtain a copy of the License at -# -#    http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# mllib.R: Provides methods for MLlib integration - -# Integration with R's standard functions. -# Most of MLlib's algorithms are provided in two flavours: -# - a specialization of the default R methods (glm). These methods try to match -# the inputs and the outputs of R's methods as closely as possible, but some small differences -# may exist. -# - a set of methods that reflect the arguments of the other languages supported by Spark. These -# methods are prefixed with `spark.`: spark.glm, spark.kmeans, etc. - -#' S4 class that represents a generalized linear model -#' -#' @param jobj a Java object reference to the backing Scala GeneralizedLinearRegressionWrapper -#' @export -#' @note GeneralizedLinearRegressionModel since 2.0.0 -setClass("GeneralizedLinearRegressionModel", representation(jobj = "jobj")) - -#' S4 class that represents a NaiveBayesModel -#' -#' @param jobj a Java object reference to the backing Scala NaiveBayesWrapper -#' @export -#' @note NaiveBayesModel since 2.0.0 -setClass("NaiveBayesModel", representation(jobj = "jobj")) - -#' S4 class that represents an LDAModel -#' -#' @param jobj a Java object reference to the backing Scala LDAWrapper -#' @export -#' @note LDAModel since 2.1.0 -setClass("LDAModel", representation(jobj = "jobj")) - -#' S4 class that represents an AFTSurvivalRegressionModel -#' -#' @param jobj a Java object reference to the backing Scala AFTSurvivalRegressionWrapper -#' @export -#' @note AFTSurvivalRegressionModel since 2.0.0 -setClass("AFTSurvivalRegressionModel", representation(jobj = "jobj")) - -#' S4 class that represents a KMeansModel -#' -#' @param jobj a Java object reference to the backing Scala KMeansModel -#' @export -#' @note KMeansModel since 2.0.0 -setClass("KMeansModel", representation(jobj = "jobj")) - -#' S4 class that represents a MultilayerPerceptronClassificationModel -#' -#' @param jobj a Java object reference to the backing Scala MultilayerPerceptronClassifierWrapper -#' @export -#' @note MultilayerPerceptronClassificationModel since 2.1.0 -setClass("MultilayerPerceptronClassificationModel", representation(jobj = "jobj")) - -#' S4 class that represents an IsotonicRegressionModel -#' -#' @param jobj a Java object reference to the backing Scala IsotonicRegressionModel -#' @export -#' @note IsotonicRegressionModel since 2.1.0 -setClass("IsotonicRegressionModel", representation(jobj = "jobj")) - -#' S4 class that represents a GaussianMixtureModel -#'
-#' @param jobj a Java object reference to the backing Scala GaussianMixtureModel -#' @export -#' @note GaussianMixtureModel since 2.1.0 -setClass("GaussianMixtureModel", representation(jobj = "jobj")) - -#' S4 class that represents an ALSModel -#' -#' @param jobj a Java object reference to the backing Scala ALSWrapper -#' @export -#' @note ALSModel since 2.1.0 -setClass("ALSModel", representation(jobj = "jobj")) - -#' S4 class that represents a KSTest -#' -#' @param jobj a Java object reference to the backing Scala KSTestWrapper -#' @export -#' @note KSTest since 2.1.0 -setClass("KSTest", representation(jobj = "jobj")) - -#' S4 class that represents a LogisticRegressionModel -#' -#' @param jobj a Java object reference to the backing Scala LogisticRegressionModel -#' @export -#' @note LogisticRegressionModel since 2.1.0 -setClass("LogisticRegressionModel", representation(jobj = "jobj")) - -#' S4 class that represents a RandomForestRegressionModel -#' -#' @param jobj a Java object reference to the backing Scala RandomForestRegressionModel -#' @export -#' @note RandomForestRegressionModel since 2.1.0 -setClass("RandomForestRegressionModel", representation(jobj = "jobj")) - -#' S4 class that represents a RandomForestClassificationModel -#' -#' @param jobj a Java object reference to the backing Scala RandomForestClassificationModel -#' @export -#' @note RandomForestClassificationModel since 2.1.0 -setClass("RandomForestClassificationModel", representation(jobj = "jobj")) - -#' S4 class that represents a GBTRegressionModel -#' -#' @param jobj a Java object reference to the backing Scala GBTRegressionModel -#' @export -#' @note GBTRegressionModel since 2.1.0 -setClass("GBTRegressionModel", representation(jobj = "jobj")) - -#' S4 class that represents a GBTClassificationModel -#' -#' @param jobj a Java object reference to the backing Scala GBTClassificationModel -#' @export -#' @note GBTClassificationModel since 2.1.0 -setClass("GBTClassificationModel", representation(jobj = "jobj")) - -#' Saves the MLlib model to the input path -#' -#' Saves the MLlib model to the input path. For more information, see the specific -#' MLlib model below. -#' @rdname write.ml -#' @name write.ml -#' @export -#' @seealso \link{spark.glm}, \link{glm}, -#' @seealso \link{spark.als}, \link{spark.gaussianMixture}, \link{spark.gbt}, \link{spark.isoreg}, -#' @seealso \link{spark.kmeans}, -#' @seealso \link{spark.lda}, \link{spark.logit}, \link{spark.mlp}, \link{spark.naiveBayes}, -#' @seealso \link{spark.randomForest}, \link{spark.survreg}, -#' @seealso \link{read.ml} -NULL - -#' Makes predictions from an MLlib model -#' -#' Makes predictions from an MLlib model. For more information, see the specific -#' MLlib model below. -#' @rdname predict -#' @name predict -#' @export -#' @seealso \link{spark.glm}, \link{glm}, -#' @seealso \link{spark.als}, \link{spark.gaussianMixture}, \link{spark.gbt}, \link{spark.isoreg}, -#' @seealso \link{spark.kmeans}, -#' @seealso \link{spark.logit}, \link{spark.mlp}, \link{spark.naiveBayes}, -#' @seealso \link{spark.randomForest}, \link{spark.survreg} -NULL - -write_internal <- function(object, path, overwrite = FALSE) { - writer <- callJMethod(object@jobj, "write") - if (overwrite) { - writer <- callJMethod(writer, "overwrite") - } - invisible(callJMethod(writer, "save", path)) -} - -predict_internal <- function(object, newData) { - dataFrame(callJMethod(object@jobj, "transform", newData@sdf)) -} - -#' Generalized Linear Models -#' -#' Fits a generalized linear model against a Spark DataFrame.
-#' Users can call \code{summary} to print a summary of the fitted model, \code{predict} to make -#' predictions on new data, and \code{write.ml}/\code{read.ml} to save/load fitted models. -#' -#' @param data a SparkDataFrame for training. -#' @param formula a symbolic description of the model to be fitted. Currently only a few formula -#' operators are supported, including '~', '.', ':', '+', and '-'. -#' @param family a description of the error distribution and link function to be used in the model. -#' This can be a character string naming a family function, a family function or -#' the result of a call to a family function. Refer to R's family documentation at -#' \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}. -#' @param tol positive convergence tolerance of iterations. -#' @param maxIter integer giving the maximal number of IRLS iterations. -#' @param weightCol the weight column name. If this is not set or \code{NULL}, we treat all instance -#' weights as 1.0. -#' @param regParam regularization parameter for L2 regularization. -#' @param ... additional arguments passed to the method. -#' @aliases spark.glm,SparkDataFrame,formula-method -#' @return \code{spark.glm} returns a fitted generalized linear model. -#' @rdname spark.glm -#' @name spark.glm -#' @export -#' @examples -#' \dontrun{ -#' sparkR.session() -#' data(iris) -#' df <- createDataFrame(iris) -#' model <- spark.glm(df, Sepal_Length ~ Sepal_Width, family = "gaussian") -#' summary(model) -#' -#' # fitted values on training data -#' fitted <- predict(model, df) -#' head(select(fitted, "Sepal_Length", "prediction")) -#' -#' # save fitted model to input path -#' path <- "path/to/model" -#' write.ml(model, path) -#' -#' # can also read back the saved model and print -#' savedModel <- read.ml(path) -#' summary(savedModel) -#' } -#' @note spark.glm since 2.0.0 -#' @seealso \link{glm}, \link{read.ml} -setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), - function(data, formula, family = gaussian, tol = 1e-6, maxIter = 25, weightCol = NULL, - regParam = 0.0) { - if (is.character(family)) { - family <- get(family, mode = "function", envir = parent.frame()) - } - if (is.function(family)) { - family <- family() - } - if (is.null(family$family)) { - print(family) - stop("'family' not recognized") - } - - formula <- paste(deparse(formula), collapse = "") - if (is.null(weightCol)) { - weightCol <- "" - } - - jobj <- callJStatic("org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper", - "fit", formula, data@sdf, family$family, family$link, - tol, as.integer(maxIter), as.character(weightCol), regParam) - new("GeneralizedLinearRegressionModel", jobj = jobj) - }) - -#' Generalized Linear Models (R-compliant) -#' -#' Fits a generalized linear model, similarly to R's glm(). -#' @param formula a symbolic description of the model to be fitted. Currently only a few formula -#' operators are supported, including '~', '.', ':', '+', and '-'. -#' @param data a SparkDataFrame or R's glm data for training. -#' @param family a description of the error distribution and link function to be used in the model. -#' This can be a character string naming a family function, a family function or -#' the result of a call to a family function. Refer to R's family documentation at -#' \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}. -#' @param weightCol the weight column name. If this is not set or \code{NULL}, we treat all instance -#' weights as 1.0. -#' @param epsilon positive convergence tolerance of iterations.
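[Editor's note, not part of the original file: a quick sketch of the family-normalization logic in the spark.glm() method above. Because a character string is resolved to a function and a bare function is then called, all three forms below fit the same model; df is assumed to be a SparkDataFrame such as createDataFrame(iris).]

    m1 <- spark.glm(df, Sepal_Length ~ Sepal_Width, family = "gaussian")
    m2 <- spark.glm(df, Sepal_Length ~ Sepal_Width, family = gaussian)
    m3 <- spark.glm(df, Sepal_Length ~ Sepal_Width, family = gaussian(link = "identity"))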
-#' @param maxit integer giving the maximal number of IRLS iterations. -#' @return \code{glm} returns a fitted generalized linear model. -#' @rdname glm -#' @export -#' @examples -#' \dontrun{ -#' sparkR.session() -#' data(iris) -#' df <- createDataFrame(iris) -#' model <- glm(Sepal_Length ~ Sepal_Width, df, family = "gaussian") -#' summary(model) -#' } -#' @note glm since 1.5.0 -#' @seealso \link{spark.glm} -setMethod("glm", signature(formula = "formula", family = "ANY", data = "SparkDataFrame"), - function(formula, family = gaussian, data, epsilon = 1e-6, maxit = 25, weightCol = NULL) { - spark.glm(data, formula, family, tol = epsilon, maxIter = maxit, weightCol = weightCol) - }) - -# Returns the summary of a model produced by glm() or spark.glm(), similarly to R's summary(). - -#' @param object a fitted generalized linear model. -#' @return \code{summary} returns summary information of the fitted model, which is a list. -#' The list of components includes at least the \code{coefficients} (coefficients matrix, which includes -#' coefficients, standard error of coefficients, t value and p value), -#' \code{null.deviance}/\code{deviance} (null and residual deviance), -#' \code{df.null}/\code{df.residual} (null and residual degrees of freedom), \code{aic} (AIC) -#' and \code{iter} (number of iterations IRLS takes). If there are collinear columns in the data, -#' the coefficients matrix only provides coefficients. -#' @rdname spark.glm -#' @export -#' @note summary(GeneralizedLinearRegressionModel) since 2.0.0 -setMethod("summary", signature(object = "GeneralizedLinearRegressionModel"), - function(object) { - jobj <- object@jobj - is.loaded <- callJMethod(jobj, "isLoaded") - features <- callJMethod(jobj, "rFeatures") - coefficients <- callJMethod(jobj, "rCoefficients") - dispersion <- callJMethod(jobj, "rDispersion") - null.deviance <- callJMethod(jobj, "rNullDeviance") - deviance <- callJMethod(jobj, "rDeviance") - df.null <- callJMethod(jobj, "rResidualDegreeOfFreedomNull") - df.residual <- callJMethod(jobj, "rResidualDegreeOfFreedom") - aic <- callJMethod(jobj, "rAic") - iter <- callJMethod(jobj, "rNumIterations") - family <- callJMethod(jobj, "rFamily") - deviance.resid <- if (is.loaded) { - NULL - } else { - dataFrame(callJMethod(jobj, "rDevianceResiduals")) - } - # If the underlying WeightedLeastSquares uses the "normal" solver, we can provide - # coefficients, standard error of coefficients, t value and p value. Otherwise, - # it is fitted by the local "l-bfgs" solver and we can only provide coefficients. - if (length(features) == length(coefficients)) { - coefficients <- matrix(coefficients, ncol = 1) - colnames(coefficients) <- c("Estimate") - rownames(coefficients) <- unlist(features) - } else { - coefficients <- matrix(coefficients, ncol = 4) - colnames(coefficients) <- c("Estimate", "Std. Error", "t value", "Pr(>|t|)") - rownames(coefficients) <- unlist(features) - } - ans <- list(deviance.resid = deviance.resid, coefficients = coefficients, - dispersion = dispersion, null.deviance = null.deviance, - deviance = deviance, df.null = df.null, df.residual = df.residual, - aic = aic, iter = iter, family = family, is.loaded = is.loaded) - class(ans) <- "summary.GeneralizedLinearRegressionModel" - ans - }) - -# Prints the summary of GeneralizedLinearRegressionModel - -#' @rdname spark.glm -#' @param x summary object of fitted generalized linear model returned by \code{summary} function. -#' @export -#' @note print.summary.GeneralizedLinearRegressionModel since 2.0.0 -print.summary.GeneralizedLinearRegressionModel <- function(x, ...)
{ - if (x$is.loaded) { - cat("\nSaved-loaded model does not support output 'Deviance Residuals'.\n") - } else { - x$deviance.resid <- setNames(unlist(approxQuantile(x$deviance.resid, "devianceResiduals", - c(0.0, 0.25, 0.5, 0.75, 1.0), 0.01)), c("Min", "1Q", "Median", "3Q", "Max")) - x$deviance.resid <- zapsmall(x$deviance.resid, 5L) - cat("\nDeviance Residuals: \n") - cat("(Note: These are approximate quantiles with relative error <= 0.01)\n") - print.default(x$deviance.resid, digits = 5L, na.print = "", print.gap = 2L) - } - - cat("\nCoefficients:\n") - print.default(x$coefficients, digits = 5L, na.print = "", print.gap = 2L) - - cat("\n(Dispersion parameter for ", x$family, " family taken to be ", format(x$dispersion), - ")\n\n", apply(cbind(paste(format(c("Null", "Residual"), justify = "right"), "deviance:"), - format(unlist(x[c("null.deviance", "deviance")]), digits = 5L), - " on", format(unlist(x[c("df.null", "df.residual")])), " degrees of freedom\n"), - 1L, paste, collapse = " "), sep = "") - cat("AIC: ", format(x$aic, digits = 4L), "\n\n", - "Number of Fisher Scoring iterations: ", x$iter, "\n\n", sep = "") - invisible(x) - } - -# Makes predictions from a generalized linear model produced by glm() or spark.glm(), -# similarly to R's predict(). - -#' @param newData a SparkDataFrame for testing. -#' @return \code{predict} returns a SparkDataFrame containing predicted labels in a column named -#' "prediction". -#' @rdname spark.glm -#' @export -#' @note predict(GeneralizedLinearRegressionModel) since 1.5.0 -setMethod("predict", signature(object = "GeneralizedLinearRegressionModel"), - function(object, newData) { - predict_internal(object, newData) - }) - -# Makes predictions from a naive Bayes model or a model produced by spark.naiveBayes(), -# similarly to R package e1071's predict. - -#' @param newData a SparkDataFrame for testing. -#' @return \code{predict} returns a SparkDataFrame containing predicted labels in a column named -#' "prediction". -#' @rdname spark.naiveBayes -#' @export -#' @note predict(NaiveBayesModel) since 2.0.0 -setMethod("predict", signature(object = "NaiveBayesModel"), - function(object, newData) { - predict_internal(object, newData) - }) - -# Returns the summary of a naive Bayes model produced by \code{spark.naiveBayes} - -#' @param object a naive Bayes model fitted by \code{spark.naiveBayes}. -#' @return \code{summary} returns summary information of the fitted model, which is a list. -#' The list includes \code{apriori} (the label distribution) and -#' \code{tables} (conditional probabilities given the target label). -#' @rdname spark.naiveBayes -#' @export -#' @note summary(NaiveBayesModel) since 2.0.0 -setMethod("summary", signature(object = "NaiveBayesModel"), - function(object) { - jobj <- object@jobj - features <- callJMethod(jobj, "features") - labels <- callJMethod(jobj, "labels") - apriori <- callJMethod(jobj, "apriori") - apriori <- t(as.matrix(unlist(apriori))) - colnames(apriori) <- unlist(labels) - tables <- callJMethod(jobj, "tables") - tables <- matrix(tables, nrow = length(labels)) - rownames(tables) <- unlist(labels) - colnames(tables) <- unlist(features) - list(apriori = apriori, tables = tables) - }) - -# Returns posterior probabilities from a Latent Dirichlet Allocation model produced by spark.lda() - -#' @param newData A SparkDataFrame for testing. -#' @return \code{spark.posterior} returns a SparkDataFrame containing posterior probability -#' vectors named "topicDistribution".
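[Editor's note, a hedged sketch of reading the naive Bayes summary built a few lines above; nbModel is an assumed model returned by spark.naiveBayes().]

    s <- summary(nbModel)
    s$apriori  # 1 x numLabels matrix: the label distribution
    s$tables   # numLabels x numFeatures matrix: conditional probabilities per label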
-#' @rdname spark.lda -#' @aliases spark.posterior,LDAModel,SparkDataFrame-method -#' @export -#' @note spark.posterior(LDAModel) since 2.1.0 -setMethod("spark.posterior", signature(object = "LDAModel", newData = "SparkDataFrame"), - function(object, newData) { - predict_internal(object, newData) - }) - -# Returns the summary of a Latent Dirichlet Allocation model produced by \code{spark.lda} - -#' @param object A Latent Dirichlet Allocation model fitted by \code{spark.lda}. -#' @param maxTermsPerTopic Maximum number of terms to collect for each topic. Default value of 10. -#' @return \code{summary} returns summary information of the fitted model, which is a list. -#' The list includes -#' \item{\code{docConcentration}}{concentration parameter commonly named \code{alpha} for -#' the prior placed on documents distributions over topics \code{theta}} -#' \item{\code{topicConcentration}}{concentration parameter commonly named \code{beta} or -#' \code{eta} for the prior placed on topic distributions over terms} -#' \item{\code{logLikelihood}}{log likelihood of the entire corpus} -#' \item{\code{logPerplexity}}{log perplexity} -#' \item{\code{isDistributed}}{TRUE for distributed model while FALSE for local model} -#' \item{\code{vocabSize}}{number of terms in the corpus} -#' \item{\code{topics}}{top 10 terms and their weights of all topics} -#' \item{\code{vocabulary}}{whole terms of the training corpus, NULL if libsvm format file -#' used as training set} -#' @rdname spark.lda -#' @aliases summary,LDAModel-method -#' @export -#' @note summary(LDAModel) since 2.1.0 -setMethod("summary", signature(object = "LDAModel"), - function(object, maxTermsPerTopic) { - maxTermsPerTopic <- as.integer(ifelse(missing(maxTermsPerTopic), 10, maxTermsPerTopic)) - jobj <- object@jobj - docConcentration <- callJMethod(jobj, "docConcentration") - topicConcentration <- callJMethod(jobj, "topicConcentration") - logLikelihood <- callJMethod(jobj, "logLikelihood") - logPerplexity <- callJMethod(jobj, "logPerplexity") - isDistributed <- callJMethod(jobj, "isDistributed") - vocabSize <- callJMethod(jobj, "vocabSize") - topics <- dataFrame(callJMethod(jobj, "topics", maxTermsPerTopic)) - vocabulary <- callJMethod(jobj, "vocabulary") - list(docConcentration = unlist(docConcentration), - topicConcentration = topicConcentration, - logLikelihood = logLikelihood, logPerplexity = logPerplexity, - isDistributed = isDistributed, vocabSize = vocabSize, - topics = topics, vocabulary = unlist(vocabulary)) - }) - -# Returns the log perplexity of a Latent Dirichlet Allocation model produced by \code{spark.lda} - -#' @return \code{spark.perplexity} returns the log perplexity of given SparkDataFrame, or the log -#' perplexity of the training data if missing argument "data". -#' @rdname spark.lda -#' @aliases spark.perplexity,LDAModel-method -#' @export -#' @note spark.perplexity(LDAModel) since 2.1.0 -setMethod("spark.perplexity", signature(object = "LDAModel", data = "SparkDataFrame"), - function(object, data) { - ifelse(missing(data), callJMethod(object@jobj, "logPerplexity"), - callJMethod(object@jobj, "computeLogPerplexity", data@sdf)) - }) - -# Saves the Latent Dirichlet Allocation model to the input path. - -#' @param path The directory where the model is saved. -#' @param overwrite Overwrites or not if the output path already exists. Default is FALSE -#' which means throw exception if the output path exists. 
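[Editor's note, a small usage sketch for the spark.perplexity method above; ldaModel and text are assumed to come from the spark.lda() example later in this file. The docs state that omitting the data argument returns the training perplexity; only the explicit-data form is shown here.]

    logPerp <- spark.perplexity(ldaModel, text)  # log perplexity on the given SparkDataFrame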
-#' -#' @rdname spark.lda -#' @aliases write.ml,LDAModel,character-method -#' @export -#' @seealso \link{read.ml} -#' @note write.ml(LDAModel, character) since 2.1.0 -setMethod("write.ml", signature(object = "LDAModel", path = "character"), - function(object, path, overwrite = FALSE) { - write_internal(object, path, overwrite) - }) - -#' Isotonic Regression Model -#' -#' Fits an Isotonic Regression model against a Spark DataFrame, similarly to R's isoreg(). -#' Users can print, make predictions on the produced model and save the model to the input path. -#' -#' @param data SparkDataFrame for training. -#' @param formula A symbolic description of the model to be fitted. Currently only a few formula -#' operators are supported, including '~', '.', ':', '+', and '-'. -#' @param isotonic Whether the output sequence should be isotonic/increasing (TRUE) or -#' antitonic/decreasing (FALSE). -#' @param featureIndex The index of the feature if \code{featuresCol} is a vector column -#' (default: 0), no effect otherwise. -#' @param weightCol The weight column name. -#' @param ... additional arguments passed to the method. -#' @return \code{spark.isoreg} returns a fitted Isotonic Regression model. -#' @rdname spark.isoreg -#' @aliases spark.isoreg,SparkDataFrame,formula-method -#' @name spark.isoreg -#' @export -#' @examples -#' \dontrun{ -#' sparkR.session() -#' data <- list(list(7.0, 0.0), list(5.0, 1.0), list(3.0, 2.0), -#' list(5.0, 3.0), list(1.0, 4.0)) -#' df <- createDataFrame(data, c("label", "feature")) -#' model <- spark.isoreg(df, label ~ feature, isotonic = FALSE) -#' # return model boundaries and prediction as lists -#' result <- summary(model, df) -#' # prediction based on fitted model -#' predict_data <- list(list(-2.0), list(-1.0), list(0.5), -#' list(0.75), list(1.0), list(2.0), list(9.0)) -#' predict_df <- createDataFrame(predict_data, c("feature")) -#' # get prediction column -#' predict_result <- collect(select(predict(model, predict_df), "prediction")) -#' -#' # save fitted model to input path -#' path <- "path/to/model" -#' write.ml(model, path) -#' -#' # can also read back the saved model and print -#' savedModel <- read.ml(path) -#' summary(savedModel) -#' } -#' @note spark.isoreg since 2.1.0 -setMethod("spark.isoreg", signature(data = "SparkDataFrame", formula = "formula"), - function(data, formula, isotonic = TRUE, featureIndex = 0, weightCol = NULL) { - formula <- paste(deparse(formula), collapse = "") - - if (is.null(weightCol)) { - weightCol <- "" - } - - jobj <- callJStatic("org.apache.spark.ml.r.IsotonicRegressionWrapper", "fit", - data@sdf, formula, as.logical(isotonic), as.integer(featureIndex), - as.character(weightCol)) - new("IsotonicRegressionModel", jobj = jobj) - }) - -# Predicted values based on an isotonicRegression model - -#' @param object a fitted IsotonicRegressionModel. -#' @param newData SparkDataFrame for testing. -#' @return \code{predict} returns a SparkDataFrame containing predicted values. -#' @rdname spark.isoreg -#' @aliases predict,IsotonicRegressionModel,SparkDataFrame-method -#' @export -#' @note predict(IsotonicRegressionModel) since 2.1.0 -setMethod("predict", signature(object = "IsotonicRegressionModel"), - function(object, newData) { - predict_internal(object, newData) - }) - -# Get the summary of an IsotonicRegressionModel model - -#' @return \code{summary} returns summary information of the fitted model, which is a list. 
-#' The list includes the model's \code{boundaries} (boundaries in increasing order) -#' and \code{predictions} (predictions associated with the boundaries at the same index). -#' @rdname spark.isoreg -#' @aliases summary,IsotonicRegressionModel-method -#' @export -#' @note summary(IsotonicRegressionModel) since 2.1.0 -setMethod("summary", signature(object = "IsotonicRegressionModel"), - function(object) { - jobj <- object@jobj - boundaries <- callJMethod(jobj, "boundaries") - predictions <- callJMethod(jobj, "predictions") - list(boundaries = boundaries, predictions = predictions) - }) - -#' K-Means Clustering Model -#' -#' Fits a k-means clustering model against a Spark DataFrame, similarly to R's kmeans(). -#' Users can call \code{summary} to print a summary of the fitted model, \code{predict} to make -#' predictions on new data, and \code{write.ml}/\code{read.ml} to save/load fitted models. -#' -#' @param data a SparkDataFrame for training. -#' @param formula a symbolic description of the model to be fitted. Currently only a few formula -#' operators are supported, including '~', '.', ':', '+', and '-'. -#' Note that the response variable of formula is empty in spark.kmeans. -#' @param k number of centers. -#' @param maxIter maximum iteration number. -#' @param initMode the initialization algorithm chosen to fit the model. -#' @param ... additional argument(s) passed to the method. -#' @return \code{spark.kmeans} returns a fitted k-means model. -#' @rdname spark.kmeans -#' @aliases spark.kmeans,SparkDataFrame,formula-method -#' @name spark.kmeans -#' @export -#' @examples -#' \dontrun{ -#' sparkR.session() -#' data(iris) -#' df <- createDataFrame(iris) -#' model <- spark.kmeans(df, Sepal_Length ~ Sepal_Width, k = 4, initMode = "random") -#' summary(model) -#' -#' # fitted values on training data -#' fitted <- predict(model, df) -#' head(select(fitted, "Sepal_Length", "prediction")) -#' -#' # save fitted model to input path -#' path <- "path/to/model" -#' write.ml(model, path) -#' -#' # can also read back the saved model and print -#' savedModel <- read.ml(path) -#' summary(savedModel) -#' } -#' @note spark.kmeans since 2.0.0 -#' @seealso \link{predict}, \link{read.ml}, \link{write.ml} -setMethod("spark.kmeans", signature(data = "SparkDataFrame", formula = "formula"), - function(data, formula, k = 2, maxIter = 20, initMode = c("k-means||", "random")) { - formula <- paste(deparse(formula), collapse = "") - initMode <- match.arg(initMode) - jobj <- callJStatic("org.apache.spark.ml.r.KMeansWrapper", "fit", data@sdf, formula, - as.integer(k), as.integer(maxIter), initMode) - new("KMeansModel", jobj = jobj) - }) - -#' Get fitted result from a k-means model -#' -#' Get fitted result from a k-means model, similarly to R's fitted(). -#' Note: A saved-loaded model does not support this method. -#' -#' @param object a fitted k-means model. -#' @param method type of fitted results, \code{"centers"} for cluster centers -#' or \code{"classes"} for assigned classes. -#' @param ... additional argument(s) passed to the method. -#' @return \code{fitted} returns a SparkDataFrame containing fitted values.
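[Editor's note, a sketch of the initMode argument handled by match.arg() in the spark.kmeans method above, so "k-means||" is the default; df is assumed to be createDataFrame(iris). Note the empty response in the formula, as the docs require.]

    km1 <- spark.kmeans(df, ~ Sepal_Length + Sepal_Width, k = 3)                      # "k-means||" init
    km2 <- spark.kmeans(df, ~ Sepal_Length + Sepal_Width, k = 3, initMode = "random")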
-#' @rdname fitted -#' @export -#' @examples -#' \dontrun{ -#' model <- spark.kmeans(trainingData, ~ ., 2) -#' fitted.model <- fitted(model) -#' showDF(fitted.model) -#'} -#' @note fitted since 2.0.0 -setMethod("fitted", signature(object = "KMeansModel"), - function(object, method = c("centers", "classes")) { - method <- match.arg(method) - jobj <- object@jobj - is.loaded <- callJMethod(jobj, "isLoaded") - if (is.loaded) { - stop("Saved-loaded k-means model does not support 'fitted' method") - } else { - dataFrame(callJMethod(jobj, "fitted", method)) - } - }) - -# Get the summary of a k-means model - -#' @param object a fitted k-means model. -#' @return \code{summary} returns summary information of the fitted model, which is a list. -#' The list includes the model's \code{k} (number of cluster centers), -#' \code{coefficients} (model cluster centers), -#' \code{size} (number of data points in each cluster), and \code{cluster} -#' (cluster centers of the transformed data). -#' @rdname spark.kmeans -#' @export -#' @note summary(KMeansModel) since 2.0.0 -setMethod("summary", signature(object = "KMeansModel"), - function(object) { - jobj <- object@jobj - is.loaded <- callJMethod(jobj, "isLoaded") - features <- callJMethod(jobj, "features") - coefficients <- callJMethod(jobj, "coefficients") - k <- callJMethod(jobj, "k") - size <- callJMethod(jobj, "size") - coefficients <- t(matrix(coefficients, ncol = k)) - colnames(coefficients) <- unlist(features) - rownames(coefficients) <- 1:k - cluster <- if (is.loaded) { - NULL - } else { - dataFrame(callJMethod(jobj, "cluster")) - } - list(k = k, coefficients = coefficients, size = size, - cluster = cluster, is.loaded = is.loaded) - }) - -# Predicted values based on a k-means model - -#' @param newData a SparkDataFrame for testing. -#' @return \code{predict} returns the predicted values based on a k-means model. -#' @rdname spark.kmeans -#' @export -#' @note predict(KMeansModel) since 2.0.0 -setMethod("predict", signature(object = "KMeansModel"), - function(object, newData) { - predict_internal(object, newData) - }) - -#' Logistic Regression Model -#' -#' Fits a logistic regression model against a Spark DataFrame. It supports "binomial": Binary logistic regression -#' with pivoting; "multinomial": Multinomial logistic (softmax) regression without pivoting, similar to glmnet. -#' Users can print, make predictions on the produced model and save the model to the input path. -#' -#' @param data SparkDataFrame for training. -#' @param formula A symbolic description of the model to be fitted. Currently only a few formula -#' operators are supported, including '~', '.', ':', '+', and '-'. -#' @param regParam the regularization parameter. -#' @param elasticNetParam the ElasticNet mixing parameter. For alpha = 0.0, the penalty is an L2 penalty. -#' For alpha = 1.0, it is an L1 penalty. For 0.0 < alpha < 1.0, the penalty is a combination -#' of L1 and L2. Default is 0.0 which is an L2 penalty. -#' @param maxIter maximum iteration number. -#' @param tol convergence tolerance of iterations. -#' @param family the name of family which is a description of the label distribution to be used in the model. -#' Supported options: -#' \itemize{ -#' \item{"auto": Automatically select the family based on the number of classes: -#' If number of classes == 1 || number of classes == 2, set to "binomial".
-#' Else, set to "multinomial".} -#' \item{"binomial": Binary logistic regression with pivoting.} -#' \item{"multinomial": Multinomial logistic (softmax) regression without pivoting.} -#' } -#' @param standardization whether to standardize the training features before fitting the model. The coefficients -#' of models will always be returned on the original scale, so it will be transparent for -#' users. Note that with/without standardization, the models should always converge -#' to the same solution when no regularization is applied. Default is TRUE, same as glmnet. -#' @param thresholds in binary classification, in range [0, 1]. If the estimated probability of class label 1 -#' is > threshold, then predict 1, else 0. A high threshold encourages the model to predict 0 -#' more often; a low threshold encourages the model to predict 1 more often. Note: Setting this with -#' threshold p is equivalent to setting thresholds c(1-p, p). In multiclass (or binary) classification, an array -#' of thresholds to adjust the probability of predicting each class. The array must have length equal to the -#' number of classes, with values > 0, excepting that at most one value may be 0. The class with the largest -#' value p/t is predicted, where p is the original probability of that class and t is the class's threshold. -#' @param weightCol The weight column name. -#' @param ... additional arguments passed to the method. -#' @return \code{spark.logit} returns a fitted logistic regression model. -#' @rdname spark.logit -#' @aliases spark.logit,SparkDataFrame,formula-method -#' @name spark.logit -#' @export -#' @examples -#' \dontrun{ -#' sparkR.session() -#' # binary logistic regression -#' df <- createDataFrame(iris) -#' training <- df[df$Species %in% c("versicolor", "virginica"), ] -#' model <- spark.logit(training, Species ~ ., regParam = 0.5) -#' summary <- summary(model) -#' -#' # fitted values on training data -#' fitted <- predict(model, training) -#' -#' # save fitted model to input path -#' path <- "path/to/model" -#' write.ml(model, path) -#' -#' # can also read back the saved model and predict -#' # Note that summary does not work on a loaded model -#' savedModel <- read.ml(path) -#' summary(savedModel) -#' -#' # multinomial logistic regression -#' -#' df <- createDataFrame(iris) -#' model <- spark.logit(df, Species ~ ., regParam = 0.5) -#' summary <- summary(model) -#' -#' } -#' @note spark.logit since 2.1.0 -setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula"), - function(data, formula, regParam = 0.0, elasticNetParam = 0.0, maxIter = 100, - tol = 1E-6, family = "auto", standardization = TRUE, - thresholds = 0.5, weightCol = NULL) { - formula <- paste(deparse(formula), collapse = "") - - if (is.null(weightCol)) { - weightCol <- "" - } - - jobj <- callJStatic("org.apache.spark.ml.r.LogisticRegressionWrapper", "fit", - data@sdf, formula, as.numeric(regParam), - as.numeric(elasticNetParam), as.integer(maxIter), - as.numeric(tol), as.character(family), - as.logical(standardization), as.array(thresholds), - as.character(weightCol)) - new("LogisticRegressionModel", jobj = jobj) - }) - -# Predicted values based on a LogisticRegressionModel - -#' @param newData a SparkDataFrame for testing. -#' @return \code{predict} returns the predicted values based on a LogisticRegressionModel.
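[Editor's note, an illustrative sketch of the thresholds behavior described above, assuming training is the two-class SparkDataFrame from the example and df the full three-class iris frame.]

    # binary: a single threshold p behaves like thresholds c(1 - p, p)
    mBin <- spark.logit(training, Species ~ ., thresholds = 0.7)
    # multiclass: one threshold per class; the class with the largest p/t is predicted
    mMulti <- spark.logit(df, Species ~ ., family = "multinomial", thresholds = c(0.2, 0.4, 0.4))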
-#' @rdname spark.logit -#' @aliases predict,LogisticRegressionModel,SparkDataFrame-method -#' @export -#' @note predict(LogisticRegressionModel) since 2.1.0 -setMethod("predict", signature(object = "LogisticRegressionModel"), - function(object, newData) { - predict_internal(object, newData) - }) - -# Get the summary of a LogisticRegressionModel - -#' @param object a LogisticRegressionModel fitted by \code{spark.logit}. -#' @return \code{summary} returns summary information of the fitted model, which is a list. -#' The list includes \code{coefficients} (coefficients matrix of the fitted model). -#' @rdname spark.logit -#' @aliases summary,LogisticRegressionModel-method -#' @export -#' @note summary(LogisticRegressionModel) since 2.1.0 -setMethod("summary", signature(object = "LogisticRegressionModel"), - function(object) { - jobj <- object@jobj - features <- callJMethod(jobj, "rFeatures") - labels <- callJMethod(jobj, "labels") - coefficients <- callJMethod(jobj, "rCoefficients") - nCol <- length(coefficients) / length(features) - coefficients <- matrix(coefficients, ncol = nCol) - # If nCol == 1, this is a binomial logistic regression model with pivoting. - # Otherwise, it's a multinomial logistic regression model without pivoting. - if (nCol == 1) { - colnames(coefficients) <- c("Estimate") - } else { - colnames(coefficients) <- unlist(labels) - } - rownames(coefficients) <- unlist(features) - - list(coefficients = coefficients) - }) - -#' Multilayer Perceptron Classification Model -#' -#' \code{spark.mlp} fits a multi-layer perceptron neural network model against a SparkDataFrame. -#' Users can call \code{summary} to print a summary of the fitted model, \code{predict} to make -#' predictions on new data, and \code{write.ml}/\code{read.ml} to save/load fitted models. -#' Only categorical data is supported. -#' For more details, see -#' \href{http://spark.apache.org/docs/latest/ml-classification-regression.html}{ -#' Multilayer Perceptron} -#' -#' @param data a \code{SparkDataFrame} of observations and labels for model fitting. -#' @param formula a symbolic description of the model to be fitted. Currently only a few formula -#' operators are supported, including '~', '.', ':', '+', and '-'. -#' @param blockSize blockSize parameter. -#' @param layers integer vector containing the number of nodes for each layer. -#' @param solver solver parameter, supported options: "gd" (minibatch gradient descent) or "l-bfgs". -#' @param maxIter maximum iteration number. -#' @param tol convergence tolerance of iterations. -#' @param stepSize stepSize parameter. -#' @param seed seed parameter for weights initialization. -#' @param initialWeights initialWeights parameter for weights initialization; it should be a -#' numeric vector. -#' @param ... additional arguments passed to the method. -#' @return \code{spark.mlp} returns a fitted Multilayer Perceptron Classification Model.
-#' @rdname spark.mlp -#' @aliases spark.mlp,SparkDataFrame,formula-method -#' @name spark.mlp -#' @seealso \link{read.ml} -#' @export -#' @examples -#' \dontrun{ -#' df <- read.df("data/mllib/sample_multiclass_classification_data.txt", source = "libsvm") -#' -#' # fit a Multilayer Perceptron Classification Model -#' model <- spark.mlp(df, label ~ features, blockSize = 128, layers = c(4, 3), solver = "l-bfgs", -#' maxIter = 100, tol = 0.5, stepSize = 1, seed = 1, -#' initialWeights = c(0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 9, 9, 9, 9, 9)) -#' -#' # get the summary of the model -#' summary(model) -#' -#' # make predictions -#' predictions <- predict(model, df) -#' -#' # save and load the model -#' path <- "path/to/model" -#' write.ml(model, path) -#' savedModel <- read.ml(path) -#' summary(savedModel) -#' } -#' @note spark.mlp since 2.1.0 -setMethod("spark.mlp", signature(data = "SparkDataFrame", formula = "formula"), - function(data, formula, layers, blockSize = 128, solver = "l-bfgs", maxIter = 100, - tol = 1E-6, stepSize = 0.03, seed = NULL, initialWeights = NULL) { - formula <- paste(deparse(formula), collapse = "") - if (is.null(layers)) { - stop("layers must be an integer vector with length > 1.") - } - layers <- as.integer(na.omit(layers)) - if (length(layers) <= 1) { - stop("layers must be an integer vector with length > 1.") - } - if (!is.null(seed)) { - seed <- as.character(as.integer(seed)) - } - if (!is.null(initialWeights)) { - initialWeights <- as.array(as.numeric(na.omit(initialWeights))) - } - jobj <- callJStatic("org.apache.spark.ml.r.MultilayerPerceptronClassifierWrapper", - "fit", data@sdf, formula, as.integer(blockSize), as.array(layers), - as.character(solver), as.integer(maxIter), as.numeric(tol), - as.numeric(stepSize), seed, initialWeights) - new("MultilayerPerceptronClassificationModel", jobj = jobj) - }) - -# Makes predictions from a model produced by spark.mlp(). - -#' @param newData a SparkDataFrame for testing. -#' @return \code{predict} returns a SparkDataFrame containing predicted labels in a column named -#' "prediction". -#' @rdname spark.mlp -#' @aliases predict,MultilayerPerceptronClassificationModel-method -#' @export -#' @note predict(MultilayerPerceptronClassificationModel) since 2.1.0 -setMethod("predict", signature(object = "MultilayerPerceptronClassificationModel"), - function(object, newData) { - predict_internal(object, newData) - }) - -# Returns the summary of a Multilayer Perceptron Classification Model produced by \code{spark.mlp} - -#' @param object a Multilayer Perceptron Classification Model fitted by \code{spark.mlp} -#' @return \code{summary} returns summary information of the fitted model, which is a list. -#' The list includes \code{numOfInputs} (number of inputs), \code{numOfOutputs} -#' (number of outputs), \code{layers} (array of layer sizes including input -#' and output layers), and \code{weights} (the weights of layers). -#' For \code{weights}, it is a numeric vector with length equal to the number expected -#' given the architecture (i.e., for an 8-10-2 network, 112 connection weights).
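[Editor's note: the expected weights length mentioned above counts one bias unit per non-input layer. A small helper, not in the original file, makes the 8-10-2 arithmetic explicit.]

    expectedWeights <- function(layers) {
      # (inputs + 1 bias) * outputs, summed over consecutive layer pairs
      sum((head(layers, -1) + 1) * tail(layers, -1))
    }
    expectedWeights(c(8, 10, 2))  # (8+1)*10 + (10+1)*2 = 112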
-#' @rdname spark.mlp -#' @export -#' @aliases summary,MultilayerPerceptronClassificationModel-method -#' @note summary(MultilayerPerceptronClassificationModel) since 2.1.0 -setMethod("summary", signature(object = "MultilayerPerceptronClassificationModel"), - function(object) { - jobj <- object@jobj - layers <- unlist(callJMethod(jobj, "layers")) - numOfInputs <- head(layers, n = 1) - numOfOutputs <- tail(layers, n = 1) - weights <- callJMethod(jobj, "weights") - list(numOfInputs = numOfInputs, numOfOutputs = numOfOutputs, - layers = layers, weights = weights) - }) - -#' Naive Bayes Models -#' -#' \code{spark.naiveBayes} fits a Bernoulli naive Bayes model against a SparkDataFrame. -#' Users can call \code{summary} to print a summary of the fitted model, \code{predict} to make -#' predictions on new data, and \code{write.ml}/\code{read.ml} to save/load fitted models. -#' Only categorical data is supported. -#' -#' @param data a \code{SparkDataFrame} of observations and labels for model fitting. -#' @param formula a symbolic description of the model to be fitted. Currently only a few formula -#' operators are supported, including '~', '.', ':', '+', and '-'. -#' @param smoothing smoothing parameter. -#' @param ... additional argument(s) passed to the method. Currently only \code{smoothing}. -#' @return \code{spark.naiveBayes} returns a fitted naive Bayes model. -#' @rdname spark.naiveBayes -#' @aliases spark.naiveBayes,SparkDataFrame,formula-method -#' @name spark.naiveBayes -#' @seealso e1071: \url{https://cran.r-project.org/package=e1071} -#' @export -#' @examples -#' \dontrun{ -#' data <- as.data.frame(UCBAdmissions) -#' df <- createDataFrame(data) -#' -#' # fit a Bernoulli naive Bayes model -#' model <- spark.naiveBayes(df, Admit ~ Gender + Dept, smoothing = 0) -#' -#' # get the summary of the model -#' summary(model) -#' -#' # make predictions -#' predictions <- predict(model, df) -#' -#' # save and load the model -#' path <- "path/to/model" -#' write.ml(model, path) -#' savedModel <- read.ml(path) -#' summary(savedModel) -#' } -#' @note spark.naiveBayes since 2.0.0 -setMethod("spark.naiveBayes", signature(data = "SparkDataFrame", formula = "formula"), - function(data, formula, smoothing = 1.0) { - formula <- paste(deparse(formula), collapse = "") - jobj <- callJStatic("org.apache.spark.ml.r.NaiveBayesWrapper", "fit", - formula, data@sdf, smoothing) - new("NaiveBayesModel", jobj = jobj) - }) - -# Saves the Bernoulli naive Bayes model to the input path. - -#' @param path the directory where the model is saved. -#' @param overwrite overwrites or not if the output path already exists. Default is FALSE -#' which means throw exception if the output path exists. -#' -#' @rdname spark.naiveBayes -#' @export -#' @seealso \link{write.ml} -#' @note write.ml(NaiveBayesModel, character) since 2.0.0 -setMethod("write.ml", signature(object = "NaiveBayesModel", path = "character"), - function(object, path, overwrite = FALSE) { - write_internal(object, path, overwrite) - }) - -# Saves the AFT survival regression model to the input path. - -#' @param path the directory where the model is saved. -#' @param overwrite overwrites or not if the output path already exists. Default is FALSE -#' which means throw exception if the output path exists. 
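[Editor's note, a sketch of the overwrite flag documented here; it maps onto the writer.overwrite() call in write_internal() near the top of the file. model and path are assumed from the earlier examples.]

    write.ml(model, path)                    # errors if path already exists
    write.ml(model, path, overwrite = TRUE)  # replaces any existing saved model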
-#' @rdname spark.survreg -#' @export -#' @note write.ml(AFTSurvivalRegressionModel, character) since 2.0.0 -#' @seealso \link{write.ml} -setMethod("write.ml", signature(object = "AFTSurvivalRegressionModel", path = "character"), - function(object, path, overwrite = FALSE) { - write_internal(object, path, overwrite) - }) - -# Saves the generalized linear model to the input path. - -#' @param path the directory where the model is saved. -#' @param overwrite overwrites or not if the output path already exists. Default is FALSE -#' which means throw exception if the output path exists. -#' -#' @rdname spark.glm -#' @export -#' @note write.ml(GeneralizedLinearRegressionModel, character) since 2.0.0 -setMethod("write.ml", signature(object = "GeneralizedLinearRegressionModel", path = "character"), - function(object, path, overwrite = FALSE) { - write_internal(object, path, overwrite) - }) - -# Save fitted MLlib model to the input path - -#' @param path the directory where the model is saved. -#' @param overwrite overwrites or not if the output path already exists. Default is FALSE -#' which means throw exception if the output path exists. -#' -#' @rdname spark.kmeans -#' @export -#' @note write.ml(KMeansModel, character) since 2.0.0 -setMethod("write.ml", signature(object = "KMeansModel", path = "character"), - function(object, path, overwrite = FALSE) { - write_internal(object, path, overwrite) - }) - -# Saves the Multilayer Perceptron Classification Model to the input path. - -#' @param path the directory where the model is saved. -#' @param overwrite overwrites or not if the output path already exists. Default is FALSE -#' which means throw exception if the output path exists. -#' -#' @rdname spark.mlp -#' @aliases write.ml,MultilayerPerceptronClassificationModel,character-method -#' @export -#' @seealso \link{write.ml} -#' @note write.ml(MultilayerPerceptronClassificationModel, character) since 2.1.0 -setMethod("write.ml", signature(object = "MultilayerPerceptronClassificationModel", - path = "character"), - function(object, path, overwrite = FALSE) { - write_internal(object, path, overwrite) - }) - -# Save fitted IsotonicRegressionModel to the input path - -#' @param path The directory where the model is saved. -#' @param overwrite Overwrites or not if the output path already exists. Default is FALSE -#' which means throw exception if the output path exists. -#' -#' @rdname spark.isoreg -#' @aliases write.ml,IsotonicRegressionModel,character-method -#' @export -#' @note write.ml(IsotonicRegression, character) since 2.1.0 -setMethod("write.ml", signature(object = "IsotonicRegressionModel", path = "character"), - function(object, path, overwrite = FALSE) { - write_internal(object, path, overwrite) - }) - -# Save fitted LogisticRegressionModel to the input path - -#' @param path The directory where the model is saved. -#' @param overwrite Overwrites or not if the output path already exists. Default is FALSE -#' which means throw exception if the output path exists. -#' -#' @rdname spark.logit -#' @aliases write.ml,LogisticRegressionModel,character-method -#' @export -#' @note write.ml(LogisticRegression, character) since 2.1.0 -setMethod("write.ml", signature(object = "LogisticRegressionModel", path = "character"), - function(object, path, overwrite = FALSE) { - write_internal(object, path, overwrite) - }) - -# Save fitted MLlib model to the input path - -#' @param path the directory where the model is saved. -#' @param overwrite overwrites or not if the output path already exists. 
Default is FALSE -#' which means throw exception if the output path exists. -#' -#' @aliases write.ml,GaussianMixtureModel,character-method -#' @rdname spark.gaussianMixture -#' @export -#' @note write.ml(GaussianMixtureModel, character) since 2.1.0 -setMethod("write.ml", signature(object = "GaussianMixtureModel", path = "character"), - function(object, path, overwrite = FALSE) { - write_internal(object, path, overwrite) - }) - -#' Load a fitted MLlib model from the input path. -#' -#' @param path path of the model to read. -#' @return A fitted MLlib model. -#' @rdname read.ml -#' @name read.ml -#' @export -#' @seealso \link{write.ml} -#' @examples -#' \dontrun{ -#' path <- "path/to/model" -#' model <- read.ml(path) -#' } -#' @note read.ml since 2.0.0 -read.ml <- function(path) { - path <- suppressWarnings(normalizePath(path)) - jobj <- callJStatic("org.apache.spark.ml.r.RWrappers", "load", path) - if (isInstanceOf(jobj, "org.apache.spark.ml.r.NaiveBayesWrapper")) { - new("NaiveBayesModel", jobj = jobj) - } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.AFTSurvivalRegressionWrapper")) { - new("AFTSurvivalRegressionModel", jobj = jobj) - } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper")) { - new("GeneralizedLinearRegressionModel", jobj = jobj) - } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.KMeansWrapper")) { - new("KMeansModel", jobj = jobj) - } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.LDAWrapper")) { - new("LDAModel", jobj = jobj) - } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.MultilayerPerceptronClassifierWrapper")) { - new("MultilayerPerceptronClassificationModel", jobj = jobj) - } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.IsotonicRegressionWrapper")) { - new("IsotonicRegressionModel", jobj = jobj) - } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.GaussianMixtureWrapper")) { - new("GaussianMixtureModel", jobj = jobj) - } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.ALSWrapper")) { - new("ALSModel", jobj = jobj) - } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.LogisticRegressionWrapper")) { - new("LogisticRegressionModel", jobj = jobj) - } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.RandomForestRegressorWrapper")) { - new("RandomForestRegressionModel", jobj = jobj) - } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.RandomForestClassifierWrapper")) { - new("RandomForestClassificationModel", jobj = jobj) - } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.GBTRegressorWrapper")) { - new("GBTRegressionModel", jobj = jobj) - } else if (isInstanceOf(jobj, "org.apache.spark.ml.r.GBTClassifierWrapper")) { - new("GBTClassificationModel", jobj = jobj) - } else { - stop("Unsupported model: ", jobj) - } -} - -#' Accelerated Failure Time (AFT) Survival Regression Model -#' -#' \code{spark.survreg} fits an accelerated failure time (AFT) survival regression model on -#' a SparkDataFrame. Users can call \code{summary} to get a summary of the fitted AFT model, -#' \code{predict} to make predictions on new data, and \code{write.ml}/\code{read.ml} to -#' save/load fitted models. -#' -#' @param data a SparkDataFrame for training. -#' @param formula a symbolic description of the model to be fitted. Currently only a few formula -#' operators are supported, including '~', ':', '+', and '-'. -#' Note that operator '.' is not supported currently. -#' @return \code{spark.survreg} returns a fitted AFT survival regression model. 
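[Editor's note, a short sketch of the class-based dispatch in the read.ml() loader defined above: the returned S4 type depends on which wrapper class was saved, so callers can branch on class(); the path is assumed to hold a saved k-means model.]

    model <- read.ml(path)
    class(model)  # "KMeansModel" for a model saved by spark.kmeans()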
-#' @rdname spark.survreg -#' @seealso survival: \url{https://cran.r-project.org/package=survival} -#' @export -#' @examples -#' \dontrun{ -#' df <- createDataFrame(ovarian) -#' model <- spark.survreg(df, Surv(futime, fustat) ~ ecog_ps + rx) -#' -#' # get a summary of the model -#' summary(model) -#' -#' # make predictions -#' predicted <- predict(model, df) -#' showDF(predicted) -#' -#' # save and load the model -#' path <- "path/to/model" -#' write.ml(model, path) -#' savedModel <- read.ml(path) -#' summary(savedModel) -#' } -#' @note spark.survreg since 2.0.0 -setMethod("spark.survreg", signature(data = "SparkDataFrame", formula = "formula"), - function(data, formula) { - formula <- paste(deparse(formula), collapse = "") - jobj <- callJStatic("org.apache.spark.ml.r.AFTSurvivalRegressionWrapper", - "fit", formula, data@sdf) - new("AFTSurvivalRegressionModel", jobj = jobj) - }) - -#' Latent Dirichlet Allocation -#' -#' \code{spark.lda} fits a Latent Dirichlet Allocation model on a SparkDataFrame. Users can call -#' \code{summary} to get a summary of the fitted LDA model, \code{spark.posterior} to compute -#' posterior probabilities on new data, \code{spark.perplexity} to compute log perplexity on new -#' data and \code{write.ml}/\code{read.ml} to save/load fitted models. -#' -#' @param data A SparkDataFrame for training. -#' @param features Features column name. Either libSVM-format column or character-format column is -#' valid. -#' @param k Number of topics. -#' @param maxIter Maximum iterations. -#' @param optimizer Optimizer to train an LDA model, "online" or "em", default is "online". -#' @param subsamplingRate (For online optimizer) Fraction of the corpus to be sampled and used in -#' each iteration of mini-batch gradient descent, in range (0, 1]. -#' @param topicConcentration concentration parameter (commonly named \code{beta} or \code{eta}) for -#' the prior placed on topic distributions over terms, default -1 to set automatically on the -#' Spark side. Use \code{summary} to retrieve the effective topicConcentration. Only 1-size -#' numeric is accepted. -#' @param docConcentration concentration parameter (commonly named \code{alpha}) for the -#' prior placed on documents distributions over topics (\code{theta}), default -1 to set -#' automatically on the Spark side. Use \code{summary} to retrieve the effective -#' docConcentration. Only 1-size or \code{k}-size numeric is accepted. -#' @param customizedStopWords stopwords that need to be removed from the given corpus. Ignore the -#' parameter if libSVM-format column is used as the features column. -#' @param maxVocabSize maximum vocabulary size, default 1 << 18 -#' @param ... additional argument(s) passed to the method. -#' @return \code{spark.lda} returns a fitted Latent Dirichlet Allocation model. 
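[Editor's note, a sketch of the LDA concentration parameters documented above; text is assumed to be the libsvm-sourced SparkDataFrame from the example. docConcentration may be length 1 or length k, and the effective values are recoverable from summary().]

    ldaModel <- spark.lda(text, k = 5, docConcentration = rep(0.1, 5), topicConcentration = 0.2)
    summary(ldaModel)$docConcentration  # effective alpha actually used by Spark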
-#' @rdname spark.lda -#' @aliases spark.lda,SparkDataFrame-method -#' @seealso topicmodels: \url{https://cran.r-project.org/package=topicmodels} -#' @export -#' @examples -#' \dontrun{ -#' # nolint start -#' # An example "path/to/file" can be -#' # paste0(Sys.getenv("SPARK_HOME"), "/data/mllib/sample_lda_libsvm_data.txt") -#' # nolint end -#' text <- read.df("path/to/file", source = "libsvm") -#' model <- spark.lda(data = text, optimizer = "em") -#' -#' # get a summary of the model -#' summary(model) -#' -#' # compute posterior probabilities -#' posterior <- spark.posterior(model, text) -#' showDF(posterior) -#' -#' # compute perplexity -#' perplexity <- spark.perplexity(model, text) -#' -#' # save and load the model -#' path <- "path/to/model" -#' write.ml(model, path) -#' savedModel <- read.ml(path) -#' summary(savedModel) -#' } -#' @note spark.lda since 2.1.0 -setMethod("spark.lda", signature(data = "SparkDataFrame"), - function(data, features = "features", k = 10, maxIter = 20, optimizer = c("online", "em"), - subsamplingRate = 0.05, topicConcentration = -1, docConcentration = -1, - customizedStopWords = "", maxVocabSize = bitwShiftL(1, 18)) { - optimizer <- match.arg(optimizer) - jobj <- callJStatic("org.apache.spark.ml.r.LDAWrapper", "fit", data@sdf, features, - as.integer(k), as.integer(maxIter), optimizer, - as.numeric(subsamplingRate), topicConcentration, - as.array(docConcentration), as.array(customizedStopWords), - maxVocabSize) - new("LDAModel", jobj = jobj) - }) - -# Returns a summary of the AFT survival regression model produced by spark.survreg, -# similarly to R's summary(). - -#' @param object a fitted AFT survival regression model. -#' @return \code{summary} returns summary information of the fitted model, which is a list. -#' The list includes the model's \code{coefficients} (features, coefficients, -#' intercept and log(scale)). -#' @rdname spark.survreg -#' @export -#' @note summary(AFTSurvivalRegressionModel) since 2.0.0 -setMethod("summary", signature(object = "AFTSurvivalRegressionModel"), - function(object) { - jobj <- object@jobj - features <- callJMethod(jobj, "rFeatures") - coefficients <- callJMethod(jobj, "rCoefficients") - coefficients <- as.matrix(unlist(coefficients)) - colnames(coefficients) <- c("Value") - rownames(coefficients) <- unlist(features) - list(coefficients = coefficients) - }) - -# Makes predictions from an AFT survival regression model or a model produced by -# spark.survreg, similarly to R package survival's predict. - -#' @param newData a SparkDataFrame for testing. -#' @return \code{predict} returns a SparkDataFrame containing predicted values -#' on the original scale of the data (mean predicted value at scale = 1.0). -#' @rdname spark.survreg -#' @export -#' @note predict(AFTSurvivalRegressionModel) since 2.0.0 -setMethod("predict", signature(object = "AFTSurvivalRegressionModel"), - function(object, newData) { - predict_internal(object, newData) - }) - -#' Multivariate Gaussian Mixture Model (GMM) -#' -#' Fits multivariate gaussian mixture model against a Spark DataFrame, similarly to R's -#' mvnormalmixEM(). Users can call \code{summary} to print a summary of the fitted model, -#' \code{predict} to make predictions on new data, and \code{write.ml}/\code{read.ml} -#' to save/load fitted models. -#' -#' @param data a SparkDataFrame for training. -#' @param formula a symbolic description of the model to be fitted. Currently only a few formula -#' operators are supported, including '~', '.', ':', '+', and '-'. 
-#' Note that the response variable of formula is empty in spark.gaussianMixture.
-#' @param k number of independent Gaussians in the mixture model.
-#' @param maxIter maximum iteration number.
-#' @param tol the convergence tolerance.
-#' @param ... additional arguments passed to the method.
-#' @aliases spark.gaussianMixture,SparkDataFrame,formula-method
-#' @return \code{spark.gaussianMixture} returns a fitted multivariate Gaussian mixture model.
-#' @rdname spark.gaussianMixture
-#' @name spark.gaussianMixture
-#' @seealso mixtools: \url{https://cran.r-project.org/package=mixtools}
-#' @export
-#' @examples
-#' \dontrun{
-#' sparkR.session()
-#' library(mvtnorm)
-#' set.seed(100)
-#' a <- rmvnorm(4, c(0, 0))
-#' b <- rmvnorm(6, c(3, 4))
-#' data <- rbind(a, b)
-#' df <- createDataFrame(as.data.frame(data))
-#' model <- spark.gaussianMixture(df, ~ V1 + V2, k = 2)
-#' summary(model)
-#'
-#' # fitted values on training data
-#' fitted <- predict(model, df)
-#' head(select(fitted, "V1", "prediction"))
-#'
-#' # save fitted model to input path
-#' path <- "path/to/model"
-#' write.ml(model, path)
-#'
-#' # can also read back the saved model and print
-#' savedModel <- read.ml(path)
-#' summary(savedModel)
-#' }
-#' @note spark.gaussianMixture since 2.1.0
-#' @seealso \link{predict}, \link{read.ml}, \link{write.ml}
-setMethod("spark.gaussianMixture", signature(data = "SparkDataFrame", formula = "formula"),
-          function(data, formula, k = 2, maxIter = 100, tol = 0.01) {
-            formula <- paste(deparse(formula), collapse = "")
-            jobj <- callJStatic("org.apache.spark.ml.r.GaussianMixtureWrapper", "fit", data@sdf,
-                                formula, as.integer(k), as.integer(maxIter), as.numeric(tol))
-            new("GaussianMixtureModel", jobj = jobj)
-          })
-
-# Get the summary of a multivariate Gaussian mixture model
-
-#' @param object a fitted Gaussian mixture model.
-#' @return \code{summary} returns a summary of the fitted model, which is a list.
-#' The list includes the model's \code{lambda} (mixing weights), \code{mu} (component means),
-#' \code{sigma} (component covariance matrices), and \code{posterior} (posterior probabilities
-#' on the training data).
-#' @aliases summary,GaussianMixtureModel-method
-#' @rdname spark.gaussianMixture
-#' @export
-#' @note summary(GaussianMixtureModel) since 2.1.0
-setMethod("summary", signature(object = "GaussianMixtureModel"),
-          function(object) {
-            jobj <- object@jobj
-            is.loaded <- callJMethod(jobj, "isLoaded")
-            lambda <- unlist(callJMethod(jobj, "lambda"))
-            muList <- callJMethod(jobj, "mu")
-            sigmaList <- callJMethod(jobj, "sigma")
-            k <- callJMethod(jobj, "k")
-            dim <- callJMethod(jobj, "dim")
-            # the JVM side returns the k mean vectors flattened into one list;
-            # slice it back into k vectors of length dim
-            mu <- c()
-            for (i in 1 : k) {
-              start <- (i - 1) * dim + 1
-              end <- i * dim
-              mu[[i]] <- unlist(muList[start : end])
-            }
-            # likewise, rebuild each flattened dim x dim covariance matrix
-            sigma <- c()
-            for (i in 1 : k) {
-              start <- (i - 1) * dim * dim + 1
-              end <- i * dim * dim
-              sigma[[i]] <- t(matrix(sigmaList[start : end], ncol = dim))
-            }
-            # the posterior is only materialized for freshly fitted models
-            posterior <- if (is.loaded) {
-              NULL
-            } else {
-              dataFrame(callJMethod(jobj, "posterior"))
-            }
-            list(lambda = lambda, mu = mu, sigma = sigma,
-                 posterior = posterior, is.loaded = is.loaded)
-          })
-
-# Predicted values based on a Gaussian mixture model
-
-#' @param newData a SparkDataFrame for testing.
-#' @return \code{predict} returns a SparkDataFrame containing predicted labels in a column named
-#' "prediction".
-#' @aliases predict,GaussianMixtureModel,SparkDataFrame-method
-#' @rdname spark.gaussianMixture
-#' @export
-#' @note predict(GaussianMixtureModel) since 2.1.0
-setMethod("predict", signature(object = "GaussianMixtureModel"),
-          function(object, newData) {
-            predict_internal(object, newData)
-          })
-
-#' Alternating Least Squares (ALS) for Collaborative Filtering
-#'
-#' \code{spark.als} learns latent factors in collaborative filtering via alternating least
-#' squares. Users can call \code{summary} to obtain fitted latent factors, \code{predict}
-#' to make predictions on new data, and \code{write.ml}/\code{read.ml} to save/load fitted models.
-#'
-#' For more details, see
-#' \href{http://spark.apache.org/docs/latest/ml-collaborative-filtering.html}{MLlib:
-#' Collaborative Filtering}.
-#'
-#' @param data a SparkDataFrame for training.
-#' @param ratingCol column name for ratings.
-#' @param userCol column name for user ids. Ids must be (or can be coerced into) integers.
-#' @param itemCol column name for item ids. Ids must be (or can be coerced into) integers.
-#' @param rank rank of the matrix factorization (> 0).
-#' @param regParam regularization parameter (>= 0).
-#' @param maxIter maximum number of iterations (>= 0).
-#' @param nonnegative logical value indicating whether to apply nonnegativity constraints.
-#' @param implicitPrefs logical value indicating whether to use implicit preference.
-#' @param alpha alpha parameter in the implicit preference formulation (>= 0).
-#' @param seed integer seed for random number generation.
-#' @param numUserBlocks number of user blocks used to parallelize computation (> 0).
-#' @param numItemBlocks number of item blocks used to parallelize computation (> 0).
-#' @param checkpointInterval checkpoint interval (>= 1), or -1 to disable checkpointing.
-#' @param ... additional argument(s) passed to the method.
-#' @return \code{spark.als} returns a fitted ALS model.
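
To make the "latent factors" above concrete: with the item factors held fixed, one ALS half-step solves a small ridge regression per user (and vice versa for items). A toy, in-memory sketch in base R, purely illustrative and not the distributed algorithm spark.als invokes; all names and sizes are made up:

    set.seed(1)
    nUsers <- 4; nItems <- 5; rank <- 2; regParam <- 0.1
    ratings <- matrix(rnorm(nUsers * nItems), nUsers, nItems)  # dense toy ratings
    itemFactors <- matrix(rnorm(nItems * rank), nItems, rank)
    userFactors <- t(apply(ratings, 1, function(r) {
      # normal equations for one user: (Y'Y + lambda * I) x = Y'r
      solve(t(itemFactors) %*% itemFactors + regParam * diag(rank),
            t(itemFactors) %*% r)
    }))
    dim(userFactors)  # 4 x 2: one latent vector of length `rank` per user
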
-#' @rdname spark.als
-#' @aliases spark.als,SparkDataFrame-method
-#' @name spark.als
-#' @export
-#' @examples
-#' \dontrun{
-#' ratings <- list(list(0, 0, 4.0), list(0, 1, 2.0), list(1, 1, 3.0), list(1, 2, 4.0),
-#'                 list(2, 1, 1.0), list(2, 2, 5.0))
-#' df <- createDataFrame(ratings, c("user", "item", "rating"))
-#' model <- spark.als(df, "rating", "user", "item")
-#'
-#' # extract latent factors
-#' stats <- summary(model)
-#' userFactors <- stats$userFactors
-#' itemFactors <- stats$itemFactors
-#'
-#' # make predictions
-#' predicted <- predict(model, df)
-#' showDF(predicted)
-#'
-#' # save and load the model
-#' path <- "path/to/model"
-#' write.ml(model, path)
-#' savedModel <- read.ml(path)
-#' summary(savedModel)
-#'
-#' # set other arguments
-#' modelS <- spark.als(df, "rating", "user", "item", rank = 20,
-#'                     regParam = 0.1, nonnegative = TRUE)
-#' statsS <- summary(modelS)
-#' }
-#' @note spark.als since 2.1.0
-setMethod("spark.als", signature(data = "SparkDataFrame"),
-          function(data, ratingCol = "rating", userCol = "user", itemCol = "item",
-                   rank = 10, regParam = 0.1, maxIter = 10, nonnegative = FALSE,
-                   implicitPrefs = FALSE, alpha = 1.0, numUserBlocks = 10, numItemBlocks = 10,
-                   checkpointInterval = 10, seed = 0) {
-
-            # validate the numeric arguments before crossing into the JVM
-            if (!is.numeric(rank) || rank <= 0) {
-              stop("rank should be a positive number.")
-            }
-            if (!is.numeric(regParam) || regParam < 0) {
-              stop("regParam should be a nonnegative number.")
-            }
-            if (!is.numeric(maxIter) || maxIter <= 0) {
-              stop("maxIter should be a positive number.")
-            }
-
-            jobj <- callJStatic("org.apache.spark.ml.r.ALSWrapper",
-                                "fit", data@sdf, ratingCol, userCol, itemCol, as.integer(rank),
-                                regParam, as.integer(maxIter), implicitPrefs, alpha, nonnegative,
-                                as.integer(numUserBlocks), as.integer(numItemBlocks),
-                                as.integer(checkpointInterval), as.integer(seed))
-            new("ALSModel", jobj = jobj)
-          })
-
-# Returns a summary of the ALS model produced by spark.als.
-
-#' @param object a fitted ALS model.
-#' @return \code{summary} returns summary information of the fitted model, which is a list.
-#' The list includes \code{user} (the name of the user column),
-#' \code{item} (the name of the item column), \code{rating} (the name of the rating column),
-#' \code{userFactors} (the estimated user factors), \code{itemFactors}
-#' (the estimated item factors), and \code{rank} (rank of the matrix factorization model).
-#' @rdname spark.als
-#' @aliases summary,ALSModel-method
-#' @export
-#' @note summary(ALSModel) since 2.1.0
-setMethod("summary", signature(object = "ALSModel"),
-          function(object) {
-            jobj <- object@jobj
-            user <- callJMethod(jobj, "userCol")
-            item <- callJMethod(jobj, "itemCol")
-            rating <- callJMethod(jobj, "ratingCol")
-            userFactors <- dataFrame(callJMethod(jobj, "userFactors"))
-            itemFactors <- dataFrame(callJMethod(jobj, "itemFactors"))
-            rank <- callJMethod(jobj, "rank")
-            list(user = user, item = item, rating = rating, userFactors = userFactors,
-                 itemFactors = itemFactors, rank = rank)
-          })
-
-
-# Makes predictions from an ALS model or a model produced by spark.als.
-
-#' @param newData a SparkDataFrame for testing.
-#' @return \code{predict} returns a SparkDataFrame containing predicted values.
-#' @rdname spark.als
-#' @aliases predict,ALSModel-method
-#' @export
-#' @note predict(ALSModel) since 2.1.0
-setMethod("predict", signature(object = "ALSModel"),
-          function(object, newData) {
-            predict_internal(object, newData)
-          })
-
-
-# Saves the ALS model to the input path.
-
-#' @param path the directory where the model is saved.
-#' @param overwrite logical value indicating whether to overwrite if the output path
-#' already exists. Default is FALSE, which means an exception is thrown
-#' if the output path exists.
-#'
-#' @rdname spark.als
-#' @aliases write.ml,ALSModel,character-method
-#' @export
-#' @seealso \link{read.ml}
-#' @note write.ml(ALSModel, character) since 2.1.0
-setMethod("write.ml", signature(object = "ALSModel", path = "character"),
-          function(object, path, overwrite = FALSE) {
-            write_internal(object, path, overwrite)
-          })
-
-#' (One-Sample) Kolmogorov-Smirnov Test
-#'
-#' @description
-#' \code{spark.kstest} conducts the two-sided Kolmogorov-Smirnov (KS) test for data sampled from
-#' a continuous distribution.
-#'
-#' By measuring the largest difference between the empirical cumulative distribution of the
-#' sample data and the theoretical distribution, we can test the null hypothesis that the
-#' sample data comes from that theoretical distribution.
-#'
-#' Users can call \code{summary} to obtain a summary of the test, and \code{print.summary.KSTest}
-#' to print out a summary result.
-#'
-#' @param data a SparkDataFrame of user data.
-#' @param testCol column name where the test data is from. It should be a column of double type.
-#' @param nullHypothesis name of the theoretical distribution tested against. Currently only
-#' \code{"norm"} for the normal distribution is supported.
-#' @param distParams parameter(s) of the distribution. For \code{nullHypothesis = "norm"},
-#' we can provide as a vector the mean and standard deviation of
-#' the distribution. If none is provided, then the standard normal will be used.
-#' If only one is provided, then the standard deviation will be set to one.
-#' @param ... additional argument(s) passed to the method.
-#' @return \code{spark.kstest} returns a test result object.
-#' @rdname spark.kstest
-#' @aliases spark.kstest,SparkDataFrame-method
-#' @name spark.kstest
-#' @seealso \href{http://spark.apache.org/docs/latest/mllib-statistics.html#hypothesis-testing}{
-#' MLlib: Hypothesis Testing}
-#' @export
-#' @examples
-#' \dontrun{
-#' data <- data.frame(test = c(0.1, 0.15, 0.2, 0.3, 0.25))
-#' df <- createDataFrame(data)
-#' test <- spark.kstest(df, "test", "norm", c(0, 1))
-#'
-#' # get a summary of the test result
-#' testSummary <- summary(test)
-#' testSummary
-#'
-#' # print out the summary in an organized way
-#' print.summary.KSTest(testSummary)
-#' }
-#' @note spark.kstest since 2.1.0
-setMethod("spark.kstest", signature(data = "SparkDataFrame"),
-          function(data, testCol = "test", nullHypothesis = c("norm"), distParams = c(0, 1)) {
-            tryCatch(match.arg(nullHypothesis),
-                     error = function(e) {
-                       msg <- paste("Distribution", nullHypothesis, "is not supported.")
-                       stop(msg)
-                     })
-            if (nullHypothesis == "norm") {
-              distParams <- as.numeric(distParams)
-              # default to the standard normal when parameters are missing
-              mu <- ifelse(length(distParams) < 1, 0, distParams[1])
-              sigma <- ifelse(length(distParams) < 2, 1, distParams[2])
-              jobj <- callJStatic("org.apache.spark.ml.r.KSTestWrapper",
-                                  "test", data@sdf, testCol, nullHypothesis,
-                                  as.array(c(mu, sigma)))
-              new("KSTest", jobj = jobj)
-            }
-          })
-
-# Get the summary of Kolmogorov-Smirnov (KS) Test.
-
-#' @param object test result object of KSTest returned by \code{spark.kstest}.
-#' @return \code{summary} returns summary information of the KSTest object, which is a list.
-#' The list includes the \code{p.value} (p-value), \code{statistic} (test statistic
-#' computed for the test), \code{nullHypothesis} (the null hypothesis with its
-#' parameters tested against), and \code{degreesOfFreedom} (degrees of freedom of the test).
-#' @rdname spark.kstest
-#' @aliases summary,KSTest-method
-#' @export
-#' @note summary(KSTest) since 2.1.0
-setMethod("summary", signature(object = "KSTest"),
-          function(object) {
-            jobj <- object@jobj
-            pValue <- callJMethod(jobj, "pValue")
-            statistic <- callJMethod(jobj, "statistic")
-            nullHypothesis <- callJMethod(jobj, "nullHypothesis")
-            distName <- callJMethod(jobj, "distName")
-            distParams <- unlist(callJMethod(jobj, "distParams"))
-            degreesOfFreedom <- callJMethod(jobj, "degreesOfFreedom")
-
-            ans <- list(p.value = pValue, statistic = statistic, nullHypothesis = nullHypothesis,
-                        nullHypothesis.name = distName, nullHypothesis.parameters = distParams,
-                        degreesOfFreedom = degreesOfFreedom, jobj = jobj)
-            class(ans) <- "summary.KSTest"
-            ans
-          })
-
-# Prints the summary of KSTest
-
-#' @rdname spark.kstest
-#' @param x summary object of KSTest returned by \code{summary}.
-#' @export
-#' @note print.summary.KSTest since 2.1.0
-print.summary.KSTest <- function(x, ...) {
-  jobj <- x$jobj
-  summaryStr <- callJMethod(jobj, "summary")
-  cat(summaryStr, "\n")
-  invisible(x)
-}
-
-#' Random Forest Model for Regression and Classification
-#'
-#' \code{spark.randomForest} fits a Random Forest Regression model or Classification model on
-#' a SparkDataFrame. Users can call \code{summary} to get a summary of the fitted Random Forest
-#' model, \code{predict} to make predictions on new data, and \code{write.ml}/\code{read.ml} to
-#' save/load fitted models.
-#' For more details, see
-#' \href{http://spark.apache.org/docs/latest/ml-classification-regression.html#random-forest-regression}{
-#' Random Forest Regression} and
-#' \href{http://spark.apache.org/docs/latest/ml-classification-regression.html#random-forest-classifier}{
-#' Random Forest Classification}.
-#'
-#' @param data a SparkDataFrame for training.
-#' @param formula a symbolic description of the model to be fitted. Currently only a few formula
-#' operators are supported, including '~', ':', '+', and '-'.
-#' @param type type of model to fit, one of "regression" or "classification".
-#' @param maxDepth Maximum depth of the tree (>= 0).
-#' @param maxBins Maximum number of bins used for discretizing continuous features and for choosing
-#' how to split on features at each node. More bins give higher granularity. Must be
-#' >= 2 and >= the number of categories in any categorical feature.
-#' @param numTrees Number of trees to train (>= 1).
-#' @param impurity Criterion used for information gain calculation.
-#' For regression, must be "variance". For classification, must be one of
-#' "entropy" or "gini"; default is "gini".
-#' @param featureSubsetStrategy The number of features to consider for splits at each tree node.
-#' Supported options: "auto", "all", "onethird", "sqrt", "log2", (0.0-1.0], [1-n].
-#' @param seed integer seed for random number generation.
-#' @param subsamplingRate Fraction of the training data used for learning each decision tree, in
-#' range (0, 1].
-#' @param minInstancesPerNode Minimum number of instances each child must have after a split.
-#' @param minInfoGain Minimum information gain for a split to be considered at a tree node.
-#' @param checkpointInterval Checkpoint interval (>= 1), or -1 to disable checkpointing.
-#' @param maxMemoryInMB Maximum memory in MB allocated to histogram aggregation.
-#' @param cacheNodeIds If FALSE, the algorithm will pass trees to executors to match instances with
-#' nodes. If TRUE, the algorithm will cache node IDs for each instance. Caching
-#' can speed up training of deeper trees. Users can set how often the cache
-#' should be checkpointed, or disable it, via checkpointInterval.
-#' @param ... additional arguments passed to the method.
-#' @aliases spark.randomForest,SparkDataFrame,formula-method
-#' @return \code{spark.randomForest} returns a fitted Random Forest model.
-#' @rdname spark.randomForest
-#' @name spark.randomForest
-#' @export
-#' @examples
-#' \dontrun{
-#' # fit a Random Forest Regression Model
-#' df <- createDataFrame(longley)
-#' model <- spark.randomForest(df, Employed ~ ., type = "regression", maxDepth = 5, maxBins = 16)
-#'
-#' # get the summary of the model
-#' summary(model)
-#'
-#' # make predictions
-#' predictions <- predict(model, df)
-#'
-#' # save and load the model
-#' path <- "path/to/model"
-#' write.ml(model, path)
-#' savedModel <- read.ml(path)
-#' summary(savedModel)
-#'
-#' # fit a Random Forest Classification Model
-#' df <- createDataFrame(iris)
-#' model <- spark.randomForest(df, Species ~ Petal_Length + Petal_Width, "classification")
-#' }
-#' @note spark.randomForest since 2.1.0
-setMethod("spark.randomForest", signature(data = "SparkDataFrame", formula = "formula"),
-          function(data, formula, type = c("regression", "classification"),
-                   maxDepth = 5, maxBins = 32, numTrees = 20, impurity = NULL,
-                   featureSubsetStrategy = "auto", seed = NULL, subsamplingRate = 1.0,
-                   minInstancesPerNode = 1, minInfoGain = 0.0, checkpointInterval = 10,
-                   maxMemoryInMB = 256, cacheNodeIds = FALSE) {
-            type <- match.arg(type)
-            formula <- paste(deparse(formula), collapse = "")
-            if (!is.null(seed)) {
-              seed <- as.character(as.integer(seed))
-            }
-            # dispatch to the regression or classification wrapper, filling in
-            # the impurity default appropriate for each model type
-            switch(type,
-                   regression = {
-                     if (is.null(impurity)) impurity <- "variance"
-                     impurity <- match.arg(impurity, "variance")
-                     jobj <- callJStatic("org.apache.spark.ml.r.RandomForestRegressorWrapper",
-                                         "fit", data@sdf, formula, as.integer(maxDepth),
-                                         as.integer(maxBins), as.integer(numTrees),
-                                         impurity, as.integer(minInstancesPerNode),
-                                         as.numeric(minInfoGain), as.integer(checkpointInterval),
-                                         as.character(featureSubsetStrategy), seed,
-                                         as.numeric(subsamplingRate),
-                                         as.integer(maxMemoryInMB), as.logical(cacheNodeIds))
-                     new("RandomForestRegressionModel", jobj = jobj)
-                   },
-                   classification = {
-                     if (is.null(impurity)) impurity <- "gini"
-                     impurity <- match.arg(impurity, c("gini", "entropy"))
-                     jobj <- callJStatic("org.apache.spark.ml.r.RandomForestClassifierWrapper",
-                                         "fit", data@sdf, formula, as.integer(maxDepth),
-                                         as.integer(maxBins), as.integer(numTrees),
-                                         impurity, as.integer(minInstancesPerNode),
-                                         as.numeric(minInfoGain), as.integer(checkpointInterval),
-                                         as.character(featureSubsetStrategy), seed,
-                                         as.numeric(subsamplingRate),
-                                         as.integer(maxMemoryInMB), as.logical(cacheNodeIds))
-                     new("RandomForestClassificationModel", jobj = jobj)
-                   }
-            )
-          })
-
-# Makes predictions from a Random Forest Regression model or Classification model
-
-#' @param newData a SparkDataFrame for testing.
-#' @return \code{predict} returns a SparkDataFrame containing predicted labels in a column named
-#' "prediction".
-#' @rdname spark.randomForest
-#' @aliases predict,RandomForestRegressionModel-method
-#' @export
-#' @note predict(RandomForestRegressionModel) since 2.1.0
-setMethod("predict", signature(object = "RandomForestRegressionModel"),
-          function(object, newData) {
-            predict_internal(object, newData)
-          })
-
-#' @rdname spark.randomForest
-#' @aliases predict,RandomForestClassificationModel-method
-#' @export
-#' @note predict(RandomForestClassificationModel) since 2.1.0
-setMethod("predict", signature(object = "RandomForestClassificationModel"),
-          function(object, newData) {
-            predict_internal(object, newData)
-          })
-
-# Save the Random Forest Regression or Classification model to the input path.
-
-#' @param object A fitted Random Forest regression model or classification model.
-#' @param path The directory where the model is saved.
-#' @param overwrite logical value indicating whether to overwrite if the output path already
-#' exists. Default is FALSE, which means an exception is thrown if the output path exists.
-#'
-#' @aliases write.ml,RandomForestRegressionModel,character-method
-#' @rdname spark.randomForest
-#' @export
-#' @note write.ml(RandomForestRegressionModel, character) since 2.1.0
-setMethod("write.ml", signature(object = "RandomForestRegressionModel", path = "character"),
-          function(object, path, overwrite = FALSE) {
-            write_internal(object, path, overwrite)
-          })
-
-#' @aliases write.ml,RandomForestClassificationModel,character-method
-#' @rdname spark.randomForest
-#' @export
-#' @note write.ml(RandomForestClassificationModel, character) since 2.1.0
-setMethod("write.ml", signature(object = "RandomForestClassificationModel", path = "character"),
-          function(object, path, overwrite = FALSE) {
-            write_internal(object, path, overwrite)
-          })
-
-# Create the summary of a tree ensemble model (e.g., Random Forest, GBT);
-# the result is a plain list tagged with a model-specific S3 class by the
-# summary methods below (a toy sketch of the pattern follows this doc block)
-summary.treeEnsemble <- function(model) {
-  jobj <- model@jobj
-  formula <- callJMethod(jobj, "formula")
-  numFeatures <- callJMethod(jobj, "numFeatures")
-  features <- callJMethod(jobj, "features")
-  featureImportances <- callJMethod(callJMethod(jobj, "featureImportances"), "toString")
-  numTrees <- callJMethod(jobj, "numTrees")
-  treeWeights <- callJMethod(jobj, "treeWeights")
-  list(formula = formula,
-       numFeatures = numFeatures,
-       features = features,
-       featureImportances = featureImportances,
-       numTrees = numTrees,
-       treeWeights = treeWeights,
-       jobj = jobj)
-}
-
-# Get the summary of a Random Forest Regression Model
-
-#' @return \code{summary} returns summary information of the fitted model, which is a list.
-#' The list of components includes \code{formula} (formula),
-#' \code{numFeatures} (number of features), \code{features} (list of features),
-#' \code{featureImportances} (feature importances), \code{numTrees} (number of trees),
-#' and \code{treeWeights} (tree weights).
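
summary.treeEnsemble above returns a plain list that the summary methods below tag with a model-specific S3 class, so that printing dispatches to the matching print.summary.* function. A minimal, self-contained base R sketch of that dispatch pattern; all names are illustrative and not part of SparkR:

    makeToySummary <- function(numTrees, treeWeights) {
      # build a plain list, then tag it with a class for S3 dispatch
      ans <- list(numTrees = numTrees, treeWeights = treeWeights)
      class(ans) <- "summary.toyEnsemble"
      ans
    }
    print.summary.toyEnsemble <- function(x, ...) {
      cat("Number of trees: ", x$numTrees)
      cat("\nTree weights: ", unlist(x$treeWeights), "\n")
      invisible(x)
    }
    s <- makeToySummary(3, list(1, 1, 1))
    print(s)  # dispatches to print.summary.toyEnsemble
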
-#' @rdname spark.randomForest
-#' @aliases summary,RandomForestRegressionModel-method
-#' @export
-#' @note summary(RandomForestRegressionModel) since 2.1.0
-setMethod("summary", signature(object = "RandomForestRegressionModel"),
-          function(object) {
-            ans <- summary.treeEnsemble(object)
-            class(ans) <- "summary.RandomForestRegressionModel"
-            ans
-          })
-
-# Get the summary of a Random Forest Classification Model
-
-#' @rdname spark.randomForest
-#' @aliases summary,RandomForestClassificationModel-method
-#' @export
-#' @note summary(RandomForestClassificationModel) since 2.1.0
-setMethod("summary", signature(object = "RandomForestClassificationModel"),
-          function(object) {
-            ans <- summary.treeEnsemble(object)
-            class(ans) <- "summary.RandomForestClassificationModel"
-            ans
-          })
-
-# Prints the summary of tree ensemble models (e.g., Random Forest, GBT)
-print.summary.treeEnsemble <- function(x) {
-  jobj <- x$jobj
-  cat("Formula: ", x$formula)
-  cat("\nNumber of features: ", x$numFeatures)
-  cat("\nFeatures: ", unlist(x$features))
-  cat("\nFeature importances: ", x$featureImportances)
-  cat("\nNumber of trees: ", x$numTrees)
-  cat("\nTree weights: ", unlist(x$treeWeights))
-
-  summaryStr <- callJMethod(jobj, "summary")
-  cat("\n", summaryStr, "\n")
-  invisible(x)
-}
-
-# Prints the summary of Random Forest Regression Model
-
-#' @param x summary object of Random Forest regression model or classification model
-#' returned by \code{summary}.
-#' @rdname spark.randomForest
-#' @export
-#' @note print.summary.RandomForestRegressionModel since 2.1.0
-print.summary.RandomForestRegressionModel <- function(x, ...) {
-  print.summary.treeEnsemble(x)
-}
-
-# Prints the summary of Random Forest Classification Model
-
-#' @rdname spark.randomForest
-#' @export
-#' @note print.summary.RandomForestClassificationModel since 2.1.0
-print.summary.RandomForestClassificationModel <- function(x, ...) {
-  print.summary.treeEnsemble(x)
-}
-
-#' Gradient Boosted Tree Model for Regression and Classification
-#'
-#' \code{spark.gbt} fits a Gradient Boosted Tree Regression model or Classification model on a
-#' SparkDataFrame. Users can call \code{summary} to get a summary of the fitted
-#' Gradient Boosted Tree model, \code{predict} to make predictions on new data, and
-#' \code{write.ml}/\code{read.ml} to save/load fitted models.
-#' For more details, see
-#' \href{http://spark.apache.org/docs/latest/ml-classification-regression.html#gradient-boosted-tree-regression}{
-#' GBT Regres
<TRUNCATED>