spark git commit: [SPARK-20307][ML][SPARKR][FOLLOW-UP] RFormula should handle invalid for both features and label column.
Repository: spark Updated Branches: refs/heads/master 74ac1fb08 -> 69e5282d3 [SPARK-20307][ML][SPARKR][FOLLOW-UP] RFormula should handle invalid for both features and label column. ## What changes were proposed in this pull request? ```RFormula``` should handle invalid for both features and label column. #18496 only handle invalid values in features column. This PR add handling invalid values for label column and test cases. ## How was this patch tested? Add test cases. Author: Yanbo LiangCloses #18613 from yanboliang/spark-20307. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/69e5282d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/69e5282d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/69e5282d Branch: refs/heads/master Commit: 69e5282d3c2998611680d3e10f2830d4e9c5f750 Parents: 74ac1fb Author: Yanbo Liang Authored: Sat Jul 15 20:56:38 2017 +0800 Committer: Yanbo Liang Committed: Sat Jul 15 20:56:38 2017 +0800 -- R/pkg/tests/fulltests/test_mllib_tree.R | 2 +- .../org/apache/spark/ml/feature/RFormula.scala | 9 ++-- .../apache/spark/ml/feature/RFormulaSuite.scala | 49 +++- python/pyspark/ml/feature.py| 5 +- 4 files changed, 57 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/69e5282d/R/pkg/tests/fulltests/test_mllib_tree.R -- diff --git a/R/pkg/tests/fulltests/test_mllib_tree.R b/R/pkg/tests/fulltests/test_mllib_tree.R index 66a0693..e31a65f 100644 --- a/R/pkg/tests/fulltests/test_mllib_tree.R +++ b/R/pkg/tests/fulltests/test_mllib_tree.R @@ -225,7 +225,7 @@ test_that("spark.randomForest", { expect_error(collect(predictions)) model <- spark.randomForest(traindf, clicked ~ ., type = "classification", maxDepth = 10, maxBins = 10, numTrees = 10, - handleInvalid = "skip") + handleInvalid = "keep") predictions <- predict(model, testdf) expect_equal(class(collect(predictions)$clicked[1]), "character") 
http://git-wip-us.apache.org/repos/asf/spark/blob/69e5282d/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala index bb7acaf..c224454 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala @@ -134,16 +134,16 @@ class RFormula @Since("1.5.0") (@Since("1.5.0") override val uid: String) def getFormula: String = $(formula) /** - * Param for how to handle invalid data (unseen labels or NULL values). - * Options are 'skip' (filter out rows with invalid data), + * Param for how to handle invalid data (unseen or NULL values) in features and label column + * of string type. Options are 'skip' (filter out rows with invalid data), * 'error' (throw an error), or 'keep' (put invalid data in a special additional * bucket, at index numLabels). * Default: "error" * @group param */ @Since("2.3.0") - override val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", -"How to handle invalid data (unseen labels or NULL values). " + + override val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", "How to " + +"handle invalid data (unseen or NULL values) in features and label column of string type. 
" + "Options are 'skip' (filter out rows with invalid data), error (throw an error), " + "or 'keep' (put invalid data in a special additional bucket, at index numLabels).", ParamValidators.inArray(StringIndexer.supportedHandleInvalids)) @@ -265,6 +265,7 @@ class RFormula @Since("1.5.0") (@Since("1.5.0") override val uid: String) encoderStages += new StringIndexer() .setInputCol(resolvedFormula.label) .setOutputCol($(labelCol)) +.setHandleInvalid($(handleInvalid)) } val pipelineModel = new Pipeline(uid).setStages(encoderStages.toArray).fit(dataset) http://git-wip-us.apache.org/repos/asf/spark/blob/69e5282d/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala index 23570d6..5d09c90 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala +++
spark git commit: [SPARK-21267][DOCS][MINOR] Follow up to avoid referencing programming-guide redirector
Repository: spark Updated Branches: refs/heads/branch-2.2 1cb4369a5 -> 8e85ce625 [SPARK-21267][DOCS][MINOR] Follow up to avoid referencing programming-guide redirector ## What changes were proposed in this pull request? Update internal references from programming-guide to rdd-programming-guide See https://github.com/apache/spark-website/commit/5ddf243fd84a0f0f98a5193a207737cea9cdc083 and https://github.com/apache/spark/pull/18485#issuecomment-314789751 Let's keep the redirector even if it's problematic to build, but not rely on it internally. ## How was this patch tested? (Doc build) Author: Sean OwenCloses #18625 from srowen/SPARK-21267.2. (cherry picked from commit 74ac1fb081e9532d77278a4edca9f3f129fd62eb) Signed-off-by: Sean Owen Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8e85ce62 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8e85ce62 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8e85ce62 Branch: refs/heads/branch-2.2 Commit: 8e85ce625127f62b7e2abdfab81c7bcbebcc8448 Parents: 1cb4369 Author: Sean Owen Authored: Sat Jul 15 09:21:29 2017 +0100 Committer: Sean Owen Committed: Sat Jul 15 09:22:06 2017 +0100 -- R/pkg/R/DataFrame.R | 2 +- R/pkg/R/RDD.R | 2 +- docs/graphx-programming-guide.md| 2 +- docs/index.md | 2 +- docs/ml-guide.md| 2 +- docs/mllib-optimization.md | 2 +- docs/spark-standalone.md| 2 +- docs/streaming-programming-guide.md | 14 ++ docs/tuning.md | 6 +++--- 9 files changed, 20 insertions(+), 14 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8e85ce62/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index b606f1f..3859fa8 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -591,7 +591,7 @@ setMethod("cache", #' #' Persist this SparkDataFrame with the specified storage level. 
For details of the #' supported storage levels, refer to -#' \url{http://spark.apache.org/docs/latest/programming-guide.html#rdd-persistence}. +#' \url{http://spark.apache.org/docs/latest/rdd-programming-guide.html#rdd-persistence}. #' #' @param x the SparkDataFrame to persist. #' @param newLevel storage level chosen for the persistance. See available options in http://git-wip-us.apache.org/repos/asf/spark/blob/8e85ce62/R/pkg/R/RDD.R -- diff --git a/R/pkg/R/RDD.R b/R/pkg/R/RDD.R index 7ad3993..15ca212 100644 --- a/R/pkg/R/RDD.R +++ b/R/pkg/R/RDD.R @@ -227,7 +227,7 @@ setMethod("cacheRDD", #' #' Persist this RDD with the specified storage level. For details of the #' supported storage levels, refer to -#'\url{http://spark.apache.org/docs/latest/programming-guide.html#rdd-persistence}. +#'\url{http://spark.apache.org/docs/latest/rdd-programming-guide.html#rdd-persistence}. #' #' @param x The RDD to persist #' @param newLevel The new storage level to be assigned http://git-wip-us.apache.org/repos/asf/spark/blob/8e85ce62/docs/graphx-programming-guide.md -- diff --git a/docs/graphx-programming-guide.md b/docs/graphx-programming-guide.md index 76aa7b4..46225dc 100644 --- a/docs/graphx-programming-guide.md +++ b/docs/graphx-programming-guide.md @@ -27,7 +27,7 @@ description: GraphX graph processing library guide for Spark SPARK_VERSION_SHORT [EdgeContext]: api/scala/index.html#org.apache.spark.graphx.EdgeContext [GraphOps.collectNeighborIds]: api/scala/index.html#org.apache.spark.graphx.GraphOps@collectNeighborIds(EdgeDirection):VertexRDD[Array[VertexId]] [GraphOps.collectNeighbors]: api/scala/index.html#org.apache.spark.graphx.GraphOps@collectNeighbors(EdgeDirection):VertexRDD[Array[(VertexId,VD)]] -[RDD Persistence]: programming-guide.html#rdd-persistence +[RDD Persistence]: rdd-programming-guide.html#rdd-persistence [Graph.cache]: api/scala/index.html#org.apache.spark.graphx.Graph@cache():Graph[VD,ED] [GraphOps.pregel]: 
api/scala/index.html#org.apache.spark.graphx.GraphOps@pregel[A](A,Int,EdgeDirection)((VertexId,VD,A)⇒VD,(EdgeTriplet[VD,ED])⇒Iterator[(VertexId,A)],(A,A)⇒A)(ClassTag[A]):Graph[VD,ED] [PartitionStrategy]: api/scala/index.html#org.apache.spark.graphx.PartitionStrategy$ http://git-wip-us.apache.org/repos/asf/spark/blob/8e85ce62/docs/index.md -- diff --git a/docs/index.md b/docs/index.md index 51641c9..5ecfcf7 100644 --- a/docs/index.md +++ b/docs/index.md @@ -88,7 +88,7
spark git commit: [SPARK-21267][DOCS][MINOR] Follow up to avoid referencing programming-guide redirector
Repository: spark Updated Branches: refs/heads/master ac5d5d795 -> 74ac1fb08 [SPARK-21267][DOCS][MINOR] Follow up to avoid referencing programming-guide redirector ## What changes were proposed in this pull request? Update internal references from programming-guide to rdd-programming-guide See https://github.com/apache/spark-website/commit/5ddf243fd84a0f0f98a5193a207737cea9cdc083 and https://github.com/apache/spark/pull/18485#issuecomment-314789751 Let's keep the redirector even if it's problematic to build, but not rely on it internally. ## How was this patch tested? (Doc build) Author: Sean OwenCloses #18625 from srowen/SPARK-21267.2. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/74ac1fb0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/74ac1fb0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/74ac1fb0 Branch: refs/heads/master Commit: 74ac1fb081e9532d77278a4edca9f3f129fd62eb Parents: ac5d5d7 Author: Sean Owen Authored: Sat Jul 15 09:21:29 2017 +0100 Committer: Sean Owen Committed: Sat Jul 15 09:21:29 2017 +0100 -- R/pkg/R/DataFrame.R | 2 +- R/pkg/R/RDD.R | 2 +- docs/graphx-programming-guide.md| 2 +- docs/index.md | 2 +- docs/ml-guide.md| 2 +- docs/mllib-optimization.md | 2 +- docs/spark-standalone.md| 2 +- docs/streaming-programming-guide.md | 14 ++ docs/tuning.md | 6 +++--- 9 files changed, 20 insertions(+), 14 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/74ac1fb0/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index e7a166c..5d6f9c0 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -593,7 +593,7 @@ setMethod("cache", #' #' Persist this SparkDataFrame with the specified storage level. For details of the #' supported storage levels, refer to -#' \url{http://spark.apache.org/docs/latest/programming-guide.html#rdd-persistence}. +#' \url{http://spark.apache.org/docs/latest/rdd-programming-guide.html#rdd-persistence}. 
#' #' @param x the SparkDataFrame to persist. #' @param newLevel storage level chosen for the persistance. See available options in http://git-wip-us.apache.org/repos/asf/spark/blob/74ac1fb0/R/pkg/R/RDD.R -- diff --git a/R/pkg/R/RDD.R b/R/pkg/R/RDD.R index 7ad3993..15ca212 100644 --- a/R/pkg/R/RDD.R +++ b/R/pkg/R/RDD.R @@ -227,7 +227,7 @@ setMethod("cacheRDD", #' #' Persist this RDD with the specified storage level. For details of the #' supported storage levels, refer to -#'\url{http://spark.apache.org/docs/latest/programming-guide.html#rdd-persistence}. +#'\url{http://spark.apache.org/docs/latest/rdd-programming-guide.html#rdd-persistence}. #' #' @param x The RDD to persist #' @param newLevel The new storage level to be assigned http://git-wip-us.apache.org/repos/asf/spark/blob/74ac1fb0/docs/graphx-programming-guide.md -- diff --git a/docs/graphx-programming-guide.md b/docs/graphx-programming-guide.md index 76aa7b4..46225dc 100644 --- a/docs/graphx-programming-guide.md +++ b/docs/graphx-programming-guide.md @@ -27,7 +27,7 @@ description: GraphX graph processing library guide for Spark SPARK_VERSION_SHORT [EdgeContext]: api/scala/index.html#org.apache.spark.graphx.EdgeContext [GraphOps.collectNeighborIds]: api/scala/index.html#org.apache.spark.graphx.GraphOps@collectNeighborIds(EdgeDirection):VertexRDD[Array[VertexId]] [GraphOps.collectNeighbors]: api/scala/index.html#org.apache.spark.graphx.GraphOps@collectNeighbors(EdgeDirection):VertexRDD[Array[(VertexId,VD)]] -[RDD Persistence]: programming-guide.html#rdd-persistence +[RDD Persistence]: rdd-programming-guide.html#rdd-persistence [Graph.cache]: api/scala/index.html#org.apache.spark.graphx.Graph@cache():Graph[VD,ED] [GraphOps.pregel]: api/scala/index.html#org.apache.spark.graphx.GraphOps@pregel[A](A,Int,EdgeDirection)((VertexId,VD,A)⇒VD,(EdgeTriplet[VD,ED])⇒Iterator[(VertexId,A)],(A,A)⇒A)(ClassTag[A]):Graph[VD,ED] [PartitionStrategy]: api/scala/index.html#org.apache.spark.graphx.PartitionStrategy$
http://git-wip-us.apache.org/repos/asf/spark/blob/74ac1fb0/docs/index.md -- diff --git a/docs/index.md b/docs/index.md index 07b6b17..2d4607b 100644 --- a/docs/index.md +++ b/docs/index.md @@ -87,7 +87,7 @@ options for deployment: **Programming Guides:** * [Quick Start](quick-start.html): a quick introduction to the Spark