Repository: spark Updated Branches: refs/heads/master 045fc3606 -> 7e8279fde
[SPARK-15254][DOC] Improve ML pipeline Cross Validation Scaladoc & PyDoc ## What changes were proposed in this pull request? Updated ML pipeline Cross Validation Scaladoc & PyDoc. ## How was this patch tested? Documentation update (If this patch involves UI changes, please attach a screenshot; otherwise, remove this) Author: krishnakalyan3 <krishnakaly...@gmail.com> Closes #13894 from krishnakalyan3/kfold-cv. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7e8279fd Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7e8279fd Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7e8279fd Branch: refs/heads/master Commit: 7e8279fde176b08687adf2b410693b35cfbd4b46 Parents: 045fc36 Author: krishnakalyan3 <krishnakaly...@gmail.com> Authored: Wed Jul 27 15:37:38 2016 +0200 Committer: Nick Pentreath <ni...@za.ibm.com> Committed: Wed Jul 27 15:37:38 2016 +0200 ---------------------------------------------------------------------- .../org/apache/spark/ml/tuning/CrossValidator.scala | 10 ++++++++-- python/pyspark/ml/tuning.py | 13 +++++++++++-- 2 files changed, 19 insertions(+), 4 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/7e8279fd/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala ---------------------------------------------------------------------- diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala index 5205578..6ea52ef 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala @@ -55,7 +55,11 @@ private[ml] trait CrossValidatorParams extends ValidatorParams { } /** - * K-fold cross validation. 
+ * K-fold cross validation performs model selection by splitting the dataset into a set of + * non-overlapping randomly partitioned folds which are used as separate training and test datasets. + * E.g., with k=3 folds, K-fold cross validation will generate 3 (training, test) dataset pairs, + * each of which uses 2/3 of the data for training and 1/3 for testing. Each fold is used as the + * test set exactly once. */ @Since("1.2.0") class CrossValidator @Since("1.2.0") (@Since("1.4.0") override val uid: String) @@ -188,7 +192,9 @@ object CrossValidator extends MLReadable[CrossValidator] { } /** - * Model from k-fold cross validation. + * CrossValidatorModel contains the model with the highest average cross-validation + * metric across folds and uses this model to transform input data. CrossValidatorModel + * also tracks the metrics for each param map evaluated. * * @param bestModel The best model selected from k-fold cross validation. * @param avgMetrics Average cross-validation metrics for each paramMap in http://git-wip-us.apache.org/repos/asf/spark/blob/7e8279fd/python/pyspark/ml/tuning.py ---------------------------------------------------------------------- diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py index 298314d..7f967e5 100644 --- a/python/pyspark/ml/tuning.py +++ b/python/pyspark/ml/tuning.py @@ -143,7 +143,13 @@ class ValidatorParams(HasSeed): class CrossValidator(Estimator, ValidatorParams): """ - K-fold cross validation. + + K-fold cross validation performs model selection by splitting the dataset into a set of + non-overlapping randomly partitioned folds which are used as separate training and test datasets. + E.g., with k=3 folds, K-fold cross validation will generate 3 (training, test) dataset pairs, + each of which uses 2/3 of the data for training and 1/3 for testing. Each fold is used as the + test set exactly once. 
+ >>> from pyspark.ml.classification import LogisticRegression >>> from pyspark.ml.evaluation import BinaryClassificationEvaluator @@ -260,7 +266,10 @@ class CrossValidator(Estimator, ValidatorParams): class CrossValidatorModel(Model, ValidatorParams): """ - Model from k-fold cross validation. + + CrossValidatorModel contains the model with the highest average cross-validation + metric across folds and uses this model to transform input data. CrossValidatorModel + also tracks the metrics for each param map evaluated. .. versionadded:: 1.4.0 """ --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org