Repository: spark Updated Branches: refs/heads/master 731588056 -> 89cd3845b
[SPARK-19460][SPARKR] Update dataset used in R documentation, examples to reduce warning noise and confusions ## What changes were proposed in this pull request? Replace `iris` dataset with `Titanic` or other dataset in example and document. ## How was this patch tested? Manual and existing test Author: wm...@hotmail.com <wm...@hotmail.com> Closes #17032 from wangmiao1981/example. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/89cd3845 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/89cd3845 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/89cd3845 Branch: refs/heads/master Commit: 89cd3845b6edb165236a6498dcade033975ee276 Parents: 7315880 Author: wm...@hotmail.com <wm...@hotmail.com> Authored: Tue Feb 28 22:31:35 2017 -0800 Committer: Felix Cheung <felixche...@apache.org> Committed: Tue Feb 28 22:31:35 2017 -0800 ---------------------------------------------------------------------- R/pkg/R/mllib_classification.R | 15 ++++----- R/pkg/R/mllib_clustering.R | 15 +++++---- R/pkg/R/mllib_regression.R | 14 ++++---- R/pkg/R/mllib_tree.R | 18 +++++----- R/pkg/vignettes/sparkr-vignettes.Rmd | 47 ++++++++++++++------------- examples/src/main/r/ml/bisectingKmeans.R | 11 ++++--- examples/src/main/r/ml/glm.R | 20 +++++++----- examples/src/main/r/ml/kmeans.R | 10 +++--- examples/src/main/r/ml/ml.R | 9 ++--- 9 files changed, 85 insertions(+), 74 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/89cd3845/R/pkg/R/mllib_classification.R ---------------------------------------------------------------------- diff --git a/R/pkg/R/mllib_classification.R b/R/pkg/R/mllib_classification.R index 05bb952..4db9cc3 100644 --- a/R/pkg/R/mllib_classification.R +++ b/R/pkg/R/mllib_classification.R @@ -75,9 +75,9 @@ setClass("NaiveBayesModel", representation(jobj = "jobj")) #' @examples #' \dontrun{ #' sparkR.session() -#' df <- createDataFrame(iris) -#' training <- df[df$Species %in% c("versicolor", "virginica"), ] -#' model <- spark.svmLinear(training, Species ~ ., regParam = 0.5) +#' t <- as.data.frame(Titanic) +#' training <- createDataFrame(t) +#' model <- spark.svmLinear(training, Survived ~ ., regParam = 0.5) #' summary <- summary(model) #' #' # fitted values on training data @@ -220,9 +220,9 @@ function(object, path, overwrite = FALSE) { #' \dontrun{ #' sparkR.session() #' # binary logistic regression -#' df <- createDataFrame(iris) -#' training <- df[df$Species %in% c("versicolor", "virginica"), ] -#' model <- spark.logit(training, Species ~ ., regParam = 0.5) +#' t <- as.data.frame(Titanic) +#' training <- createDataFrame(t) +#' model <- spark.logit(training, Survived ~ ., regParam = 0.5) #' summary <- summary(model) #' #' # fitted values on training data @@ -239,8 +239,7 @@ function(object, path, overwrite = FALSE) { #' #' # multinomial logistic regression #' -#' df <- createDataFrame(iris) -#' model <- spark.logit(df, Species ~ ., regParam = 0.5) +#' model <- spark.logit(training, Class ~ ., regParam = 0.5) #' summary <- summary(model) #' #' } http://git-wip-us.apache.org/repos/asf/spark/blob/89cd3845/R/pkg/R/mllib_clustering.R ---------------------------------------------------------------------- diff --git a/R/pkg/R/mllib_clustering.R b/R/pkg/R/mllib_clustering.R index 8823f90..0ebdb5a 100644 --- a/R/pkg/R/mllib_clustering.R +++ b/R/pkg/R/mllib_clustering.R @@ -72,8 +72,9 @@ setClass("LDAModel", representation(jobj = "jobj")) #' @examples #' \dontrun{ #' sparkR.session() -#' df <- createDataFrame(iris) -#' model <- spark.bisectingKmeans(df, Sepal_Length ~ Sepal_Width, k = 4) +#' t <- as.data.frame(Titanic) +#' df <- createDataFrame(t) +#' model <- spark.bisectingKmeans(df, Class ~ Survived, k = 4) #' summary(model) #' #' # get fitted result from a bisecting k-means model @@ -82,7 +83,7 @@ setClass("LDAModel", representation(jobj = "jobj")) #' #' # fitted values on training data #' fitted <- predict(model, df) -#' head(select(fitted, "Sepal_Length", "prediction")) +#' head(select(fitted, "Class", "prediction")) #' #' # save fitted model to input path #' path <- "path/to/model" @@ -338,14 +339,14 @@ setMethod("write.ml", signature(object = "GaussianMixtureModel", path = "charact #' @examples #' \dontrun{ #' sparkR.session() -#' data(iris) -#' df <- createDataFrame(iris) -#' model <- spark.kmeans(df, Sepal_Length ~ Sepal_Width, k = 4, initMode = "random") +#' t <- as.data.frame(Titanic) +#' df <- createDataFrame(t) +#' model <- spark.kmeans(df, Class ~ Survived, k = 4, initMode = "random") #' summary(model) #' #' # fitted values on training data #' fitted <- predict(model, df) -#' head(select(fitted, "Sepal_Length", "prediction")) +#' head(select(fitted, "Class", "prediction")) #' #' # save fitted model to input path #' path <- "path/to/model" http://git-wip-us.apache.org/repos/asf/spark/blob/89cd3845/R/pkg/R/mllib_regression.R ---------------------------------------------------------------------- diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R index ac0578c..648d363 100644 --- a/R/pkg/R/mllib_regression.R +++ b/R/pkg/R/mllib_regression.R @@ -68,14 +68,14 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj")) #' @examples #' \dontrun{ #' sparkR.session() -#' data(iris) -#' df <- createDataFrame(iris) -#' model <- spark.glm(df, Sepal_Length ~ Sepal_Width, family = "gaussian") +#' t <- as.data.frame(Titanic) +#' df <- createDataFrame(t) +#' model <- spark.glm(df, Freq ~ Sex + Age, family = "gaussian") #' summary(model) #' #' # fitted values on training data #' fitted <- predict(model, df) -#' head(select(fitted, "Sepal_Length", "prediction")) +#' head(select(fitted, "Freq", "prediction")) #' #' # save fitted model to input path #' path <- "path/to/model" @@ -137,9 +137,9 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), #' @examples #' \dontrun{ #' sparkR.session() -#' data(iris) -#' df <- createDataFrame(iris) -#' model <- glm(Sepal_Length ~ Sepal_Width, df, family = "gaussian") +#' t <- as.data.frame(Titanic) +#' df <- createDataFrame(t) +#' model <- glm(Freq ~ Sex + Age, df, family = "gaussian") #' summary(model) #' } #' @note glm since 1.5.0 http://git-wip-us.apache.org/repos/asf/spark/blob/89cd3845/R/pkg/R/mllib_tree.R ---------------------------------------------------------------------- diff --git a/R/pkg/R/mllib_tree.R b/R/pkg/R/mllib_tree.R index 0d53fad..40a806c 100644 --- a/R/pkg/R/mllib_tree.R +++ b/R/pkg/R/mllib_tree.R @@ -143,14 +143,15 @@ print.summary.treeEnsemble <- function(x) { #' #' # fit a Gradient Boosted Tree Classification Model #' # label must be binary - Only binary classification is supported for GBT. -#' df <- createDataFrame(iris[iris$Species != "virginica", ]) -#' model <- spark.gbt(df, Species ~ Petal_Length + Petal_Width, "classification") +#' t <- as.data.frame(Titanic) +#' df <- createDataFrame(t) +#' model <- spark.gbt(df, Survived ~ Age + Freq, "classification") #' #' # numeric label is also supported -#' iris2 <- iris[iris$Species != "virginica", ] -#' iris2$NumericSpecies <- ifelse(iris2$Species == "setosa", 0, 1) -#' df <- createDataFrame(iris2) -#' model <- spark.gbt(df, NumericSpecies ~ ., type = "classification") +#' t2 <- as.data.frame(Titanic) +#' t2$NumericGender <- ifelse(t2$Sex == "Male", 0, 1) +#' df <- createDataFrame(t2) +#' model <- spark.gbt(df, NumericGender ~ ., type = "classification") #' } #' @note spark.gbt since 2.1.0 setMethod("spark.gbt", signature(data = "SparkDataFrame", formula = "formula"), @@ -351,8 +352,9 @@ setMethod("write.ml", signature(object = "GBTClassificationModel", path = "chara #' summary(savedModel) #' #' # fit a Random Forest Classification Model -#' df <- createDataFrame(iris) -#' model <- spark.randomForest(df, Species ~ Petal_Length + Petal_Width, "classification") +#' t <- as.data.frame(Titanic) +#' df <- createDataFrame(t) +#' model <- spark.randomForest(df, Survived ~ Freq + Age, "classification") #' } #' @note spark.randomForest since 2.1.0 setMethod("spark.randomForest", signature(data = "SparkDataFrame", formula = "formula"), http://git-wip-us.apache.org/repos/asf/spark/blob/89cd3845/R/pkg/vignettes/sparkr-vignettes.Rmd ---------------------------------------------------------------------- diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd index bc8bc3c..43c255c 100644 --- a/R/pkg/vignettes/sparkr-vignettes.Rmd +++ b/R/pkg/vignettes/sparkr-vignettes.Rmd @@ -565,11 +565,10 @@ We use a simple example to demonstrate `spark.logit` usage. In general, there ar and 3). Obtain the coefficient matrix of the fitted model using `summary` and use the model for prediction with `predict`. Binomial logistic regression -```{r, warning=FALSE} -df <- createDataFrame(iris) -# Create a DataFrame containing two classes -training <- df[df$Species %in% c("versicolor", "virginica"), ] -model <- spark.logit(training, Species ~ ., regParam = 0.00042) +```{r} +t <- as.data.frame(Titanic) +training <- createDataFrame(t) +model <- spark.logit(training, Survived ~ ., regParam = 0.04741301) summary(model) ``` @@ -579,10 +578,11 @@ fitted <- predict(model, training) ``` Multinomial logistic regression against three classes -```{r, warning=FALSE} -df <- createDataFrame(iris) +```{r} +t <- as.data.frame(Titanic) +training <- createDataFrame(t) # Note in this case, Spark infers it is multinomial logistic regression, so family = "multinomial" is optional. -model <- spark.logit(df, Species ~ ., regParam = 0.056) +model <- spark.logit(training, Class ~ ., regParam = 0.07815179) summary(model) ``` @@ -609,11 +609,12 @@ MLPC employs backpropagation for learning the model. We use the logistic loss fu `spark.mlp` requires at least two columns in `data`: one named `"label"` and the other one `"features"`. The `"features"` column should be in libSVM-format. -We use iris data set to show how to use `spark.mlp` in classification. -```{r, warning=FALSE} -df <- createDataFrame(iris) +We use Titanic data set to show how to use `spark.mlp` in classification. +```{r} +t <- as.data.frame(Titanic) +training <- createDataFrame(t) # fit a Multilayer Perceptron Classification Model -model <- spark.mlp(df, Species ~ ., blockSize = 128, layers = c(4, 3), solver = "l-bfgs", maxIter = 100, tol = 0.5, stepSize = 1, seed = 1, initialWeights = c(0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 9, 9, 9, 9, 9)) +model <- spark.mlp(training, Survived ~ Age + Sex, blockSize = 128, layers = c(2, 3), solver = "l-bfgs", maxIter = 100, tol = 0.5, stepSize = 1, seed = 1, initialWeights = c( 0, 0, 0, 5, 5, 5, 9, 9, 9)) ``` To avoid lengthy display, we only present partial results of the model summary. You can check the full result from your sparkR shell. @@ -630,7 +631,7 @@ options(ops) ``` ```{r} # make predictions use the fitted model -predictions <- predict(model, df) +predictions <- predict(model, training) head(select(predictions, predictions$prediction)) ``` @@ -769,12 +770,13 @@ predictions <- predict(rfModel, df) `spark.bisectingKmeans` is a kind of [hierarchical clustering](https://en.wikipedia.org/wiki/Hierarchical_clustering) using a divisive (or "top-down") approach: all observations start in one cluster, and splits are performed recursively as one moves down the hierarchy. -```{r, warning=FALSE} -df <- createDataFrame(iris) -model <- spark.bisectingKmeans(df, Sepal_Length ~ Sepal_Width, k = 4) +```{r} +t <- as.data.frame(Titanic) +training <- createDataFrame(t) +model <- spark.bisectingKmeans(training, Class ~ Survived, k = 4) summary(model) -fitted <- predict(model, df) -head(select(fitted, "Sepal_Length", "prediction")) +fitted <- predict(model, training) +head(select(fitted, "Class", "prediction")) ``` #### Gaussian Mixture Model @@ -912,9 +914,10 @@ testSummary ### Model Persistence The following example shows how to save/load an ML model by SparkR. -```{r, warning=FALSE} -irisDF <- createDataFrame(iris) -gaussianGLM <- spark.glm(irisDF, Sepal_Length ~ Sepal_Width + Species, family = "gaussian") +```{r} +t <- as.data.frame(Titanic) +training <- createDataFrame(t) +gaussianGLM <- spark.glm(training, Freq ~ Sex + Age, family = "gaussian") # Save and then load a fitted MLlib model modelPath <- tempfile(pattern = "ml", fileext = ".tmp") @@ -925,7 +928,7 @@ gaussianGLM2 <- read.ml(modelPath) summary(gaussianGLM2) # Check model prediction -gaussianPredictions <- predict(gaussianGLM2, irisDF) +gaussianPredictions <- predict(gaussianGLM2, training) head(gaussianPredictions) unlink(modelPath) http://git-wip-us.apache.org/repos/asf/spark/blob/89cd3845/examples/src/main/r/ml/bisectingKmeans.R ---------------------------------------------------------------------- diff --git a/examples/src/main/r/ml/bisectingKmeans.R b/examples/src/main/r/ml/bisectingKmeans.R index 5fb5bfb..b3eaa6d 100644 --- a/examples/src/main/r/ml/bisectingKmeans.R +++ b/examples/src/main/r/ml/bisectingKmeans.R @@ -25,20 +25,21 @@ library(SparkR) sparkR.session(appName = "SparkR-ML-bisectingKmeans-example") # $example on$ -irisDF <- createDataFrame(iris) +t <- as.data.frame(Titanic) +training <- createDataFrame(t) # Fit bisecting k-means model with four centers -model <- spark.bisectingKmeans(df, Sepal_Length ~ Sepal_Width, k = 4) +model <- spark.bisectingKmeans(training, Class ~ Survived, k = 4) # get fitted result from a bisecting k-means model fitted.model <- fitted(model, "centers") # Model summary -summary(fitted.model) +head(summary(fitted.model)) # fitted values on training data -fitted <- predict(model, df) -head(select(fitted, "Sepal_Length", "prediction")) +fitted <- predict(model, training) +head(select(fitted, "Class", "prediction")) # $example off$ sparkR.session.stop() http://git-wip-us.apache.org/repos/asf/spark/blob/89cd3845/examples/src/main/r/ml/glm.R ---------------------------------------------------------------------- diff --git a/examples/src/main/r/ml/glm.R b/examples/src/main/r/ml/glm.R index e41af97..ee13910 100644 --- a/examples/src/main/r/ml/glm.R +++ b/examples/src/main/r/ml/glm.R @@ -25,11 +25,12 @@ library(SparkR) sparkR.session(appName = "SparkR-ML-glm-example") # $example on$ -irisDF <- suppressWarnings(createDataFrame(iris)) +training <- read.df("data/mllib/sample_multiclass_classification_data.txt", source = "libsvm") # Fit a generalized linear model of family "gaussian" with spark.glm -gaussianDF <- irisDF -gaussianTestDF <- irisDF -gaussianGLM <- spark.glm(gaussianDF, Sepal_Length ~ Sepal_Width + Species, family = "gaussian") +df_list <- randomSplit(training, c(7,3), 2) +gaussianDF <- df_list[[1]] +gaussianTestDF <- df_list[[2]] +gaussianGLM <- spark.glm(gaussianDF, label ~ features, family = "gaussian") # Model summary summary(gaussianGLM) @@ -39,14 +40,15 @@ gaussianPredictions <- predict(gaussianGLM, gaussianTestDF) head(gaussianPredictions) # Fit a generalized linear model with glm (R-compliant) -gaussianGLM2 <- glm(Sepal_Length ~ Sepal_Width + Species, gaussianDF, family = "gaussian") +gaussianGLM2 <- glm(label ~ features, gaussianDF, family = "gaussian") summary(gaussianGLM2) # Fit a generalized linear model of family "binomial" with spark.glm -# Note: Filter out "setosa" from label column (two labels left) to match "binomial" family. -binomialDF <- filter(irisDF, irisDF$Species != "setosa") -binomialTestDF <- binomialDF -binomialGLM <- spark.glm(binomialDF, Species ~ Sepal_Length + Sepal_Width, family = "binomial") +training2 <- read.df("data/mllib/sample_binary_classification_data.txt", source = "libsvm") +df_list2 <- randomSplit(training2, c(7,3), 2) +binomialDF <- df_list2[[1]] +binomialTestDF <- df_list2[[2]] +binomialGLM <- spark.glm(binomialDF, label ~ features, family = "binomial") # Model summary summary(binomialGLM) http://git-wip-us.apache.org/repos/asf/spark/blob/89cd3845/examples/src/main/r/ml/kmeans.R ---------------------------------------------------------------------- diff --git a/examples/src/main/r/ml/kmeans.R b/examples/src/main/r/ml/kmeans.R index 288e2f9..824df20 100644 --- a/examples/src/main/r/ml/kmeans.R +++ b/examples/src/main/r/ml/kmeans.R @@ -26,10 +26,12 @@ sparkR.session(appName = "SparkR-ML-kmeans-example") # $example on$ # Fit a k-means model with spark.kmeans -irisDF <- suppressWarnings(createDataFrame(iris)) -kmeansDF <- irisDF -kmeansTestDF <- irisDF -kmeansModel <- spark.kmeans(kmeansDF, ~ Sepal_Length + Sepal_Width + Petal_Length + Petal_Width, +t <- as.data.frame(Titanic) +training <- createDataFrame(t) +df_list <- randomSplit(training, c(7,3), 2) +kmeansDF <- df_list[[1]] +kmeansTestDF <- df_list[[2]] +kmeansModel <- spark.kmeans(kmeansDF, ~ Class + Sex + Age + Freq, k = 3) # Model summary http://git-wip-us.apache.org/repos/asf/spark/blob/89cd3845/examples/src/main/r/ml/ml.R ---------------------------------------------------------------------- diff --git a/examples/src/main/r/ml/ml.R b/examples/src/main/r/ml/ml.R index b968194..41b7867 100644 --- a/examples/src/main/r/ml/ml.R +++ b/examples/src/main/r/ml/ml.R @@ -26,11 +26,12 @@ sparkR.session(appName = "SparkR-ML-example") ############################ model read/write ############################################## # $example on:read_write$ -irisDF <- suppressWarnings(createDataFrame(iris)) +training <- read.df("data/mllib/sample_multiclass_classification_data.txt", source = "libsvm") # Fit a generalized linear model of family "gaussian" with spark.glm -gaussianDF <- irisDF -gaussianTestDF <- irisDF -gaussianGLM <- spark.glm(gaussianDF, Sepal_Length ~ Sepal_Width + Species, family = "gaussian") +df_list <- randomSplit(training, c(7,3), 2) +gaussianDF <- df_list[[1]] +gaussianTestDF <- df_list[[2]] +gaussianGLM <- spark.glm(gaussianDF, label ~ features, family = "gaussian") # Save and then load a fitted MLlib model modelPath <- tempfile(pattern = "ml", fileext = ".tmp") --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org