spark git commit: [SPARK-17077][SQL] Cardinality estimation for project operator
Repository: spark Updated Branches: refs/heads/master 19d9d4c85 -> 3ccabdfb4 [SPARK-17077][SQL] Cardinality estimation for project operator ## What changes were proposed in this pull request? Support cardinality estimation for project operator. ## How was this patch tested? Add a test suite and a base class in the catalyst package. Author: Zhenhua WangCloses #16430 from wzhfy/projectEstimation. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3ccabdfb Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3ccabdfb Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3ccabdfb Branch: refs/heads/master Commit: 3ccabdfb4d760d684b1e0c0ed448a57331f209f2 Parents: 19d9d4c Author: Zhenhua Wang Authored: Sun Jan 8 21:15:52 2017 -0800 Committer: Reynold Xin Committed: Sun Jan 8 21:15:52 2017 -0800 -- .../sql/catalyst/expressions/AttributeMap.scala | 2 + .../plans/logical/basicLogicalOperators.scala | 4 ++ .../statsEstimation/EstimationUtils.scala | 54 .../statsEstimation/ProjectEstimation.scala | 44 .../ProjectEstimationSuite.scala| 51 ++ .../StatsEstimationTestBase.scala | 41 +++ 6 files changed, 196 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3ccabdfb/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/AttributeMap.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/AttributeMap.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/AttributeMap.scala index 96a11e3..1504a52 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/AttributeMap.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/AttributeMap.scala @@ -33,6 +33,8 @@ class AttributeMap[A](baseMap: Map[ExprId, (Attribute, A)]) override def get(k: Attribute): Option[A] = baseMap.get(k.exprId).map(_._2) + override def contains(k: Attribute): Boolean = get(k).isDefined + override def 
+ [B1 >: A](kv: (Attribute, B1)): Map[Attribute, B1] = baseMap.values.toMap + kv override def iterator: Iterator[(Attribute, A)] = baseMap.valuesIterator http://git-wip-us.apache.org/repos/asf/spark/blob/3ccabdfb/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala index c977e78..9b52a9c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala @@ -22,6 +22,7 @@ import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression import org.apache.spark.sql.catalyst.plans._ +import org.apache.spark.sql.catalyst.plans.logical.statsEstimation.ProjectEstimation import org.apache.spark.sql.types._ import org.apache.spark.util.Utils @@ -53,6 +54,9 @@ case class Project(projectList: Seq[NamedExpression], child: LogicalPlan) extend override def validConstraints: Set[Expression] = child.constraints.union(getAliasedConstraints(projectList)) + + override lazy val statistics: Statistics = +ProjectEstimation.estimate(this).getOrElse(super.statistics) } /** http://git-wip-us.apache.org/repos/asf/spark/blob/3ccabdfb/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala new file mode 100644 index 000..f099e32 --- /dev/null +++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version
spark git commit: [SPARK-18903][SPARKR][BACKPORT-2.1] Add API to get SparkUI URL
Repository: spark Updated Branches: refs/heads/branch-2.1 8779e6a46 -> 80a3e13e5 [SPARK-18903][SPARKR][BACKPORT-2.1] Add API to get SparkUI URL ## What changes were proposed in this pull request? backport to 2.1 Author: Felix CheungCloses #16507 from felixcheung/portsparkuir21. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/80a3e13e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/80a3e13e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/80a3e13e Branch: refs/heads/branch-2.1 Commit: 80a3e13e58036c2461c4b721cb298ffd13b7823f Parents: 8779e6a Author: Felix Cheung Authored: Sun Jan 8 20:42:18 2017 -0800 Committer: Felix Cheung Committed: Sun Jan 8 20:42:18 2017 -0800 -- R/pkg/NAMESPACE | 1 + R/pkg/R/sparkR.R | 24 R/pkg/inst/tests/testthat/test_sparkSQL.R | 5 - 3 files changed, 29 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/80a3e13e/R/pkg/NAMESPACE -- diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 377f942..c3ec3f4 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -16,6 +16,7 @@ export("sparkR.stop") export("sparkR.session.stop") export("sparkR.conf") export("sparkR.version") +export("sparkR.uiWebUrl") export("print.jobj") export("sparkR.newJObject") http://git-wip-us.apache.org/repos/asf/spark/blob/80a3e13e/R/pkg/R/sparkR.R -- diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index c57cc8f..870e76b 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -410,6 +410,30 @@ sparkR.session <- function( sparkSession } +#' Get the URL of the SparkUI instance for the current active SparkSession +#' +#' Get the URL of the SparkUI instance for the current active SparkSession. +#' +#' @return the SparkUI URL, or NA if it is disabled, or not started. 
+#' @rdname sparkR.uiWebUrl +#' @name sparkR.uiWebUrl +#' @export +#' @examples +#'\dontrun{ +#' sparkR.session() +#' url <- sparkR.uiWebUrl() +#' } +#' @note sparkR.uiWebUrl since 2.1.1 +sparkR.uiWebUrl <- function() { + sc <- sparkR.callJMethod(getSparkContext(), "sc") + u <- callJMethod(sc, "uiWebUrl") + if (callJMethod(u, "isDefined")) { +callJMethod(u, "get") + } else { +NA + } +} + #' Assigns a group ID to all the jobs started by this thread until the group ID is set to a #' different value or cleared. #' http://git-wip-us.apache.org/repos/asf/spark/blob/80a3e13e/R/pkg/inst/tests/testthat/test_sparkSQL.R -- diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 2e95737..4490f31 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -2613,7 +2613,7 @@ test_that("randomSplit", { expect_true(all(sapply(abs(counts / num - weights / sum(weights)), function(e) { e < 0.05 }))) }) -test_that("Setting and getting config on SparkSession", { +test_that("Setting and getting config on SparkSession, sparkR.conf(), sparkR.uiWebUrl()", { # first, set it to a random but known value conf <- callJMethod(sparkSession, "conf") property <- paste0("spark.testing.", as.character(runif(1))) @@ -2637,6 +2637,9 @@ test_that("Setting and getting config on SparkSession", { expect_equal(appNameValue, "sparkSession test") expect_equal(testValue, value) expect_error(sparkR.conf("completely.dummy"), "Config 'completely.dummy' is not set") + + url <- sparkR.uiWebUrl() + expect_equal(substr(url, 1, 7), "http://") }) test_that("enableHiveSupport on SparkSession", { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-19126][DOCS] Update Join Documentation Across Languages
Repository: spark Updated Branches: refs/heads/master 1f6ded645 -> 19d9d4c85 [SPARK-19126][DOCS] Update Join Documentation Across Languages ## What changes were proposed in this pull request? - [X] Make sure all join types are clearly mentioned - [X] Make join labeling/style consistent - [X] Make join label ordering docs the same - [X] Improve join documentation according to above for Scala - [X] Improve join documentation according to above for Python - [X] Improve join documentation according to above for R ## How was this patch tested? No tests b/c docs. Please review http://spark.apache.org/contributing.html before opening a pull request. Author: anabranchCloses #16504 from anabranch/SPARK-19126. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/19d9d4c8 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/19d9d4c8 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/19d9d4c8 Branch: refs/heads/master Commit: 19d9d4c855eab8f647a5ec66b079172de81221d0 Parents: 1f6ded6 Author: anabranch Authored: Sun Jan 8 20:37:46 2017 -0800 Committer: Felix Cheung Committed: Sun Jan 8 20:37:46 2017 -0800 -- R/pkg/R/DataFrame.R | 19 +++ python/pyspark/sql/dataframe.py | 5 +++-- .../scala/org/apache/spark/sql/Dataset.scala | 16 3 files changed, 26 insertions(+), 14 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/19d9d4c8/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 7737ffe..c56648a 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2313,9 +2313,9 @@ setMethod("dropDuplicates", #' @param joinExpr (Optional) The expression used to perform the join. joinExpr must be a #' Column expression. If joinExpr is omitted, the default, inner join is attempted and an error is #' thrown if it would be a Cartesian Product. For Cartesian join, use crossJoin instead. -#' @param joinType The type of join to perform. 
The following join types are available: -#' 'inner', 'outer', 'full', 'fullouter', leftouter', 'left_outer', 'left', -#' 'right_outer', 'rightouter', 'right', and 'leftsemi'. The default joinType is "inner". +#' @param joinType The type of join to perform, default 'inner'. +#' Must be one of: 'inner', 'cross', 'outer', 'full', 'full_outer', +#' 'left', 'left_outer', 'right', 'right_outer', 'left_semi', or 'left_anti'. #' @return A SparkDataFrame containing the result of the join operation. #' @family SparkDataFrame functions #' @aliases join,SparkDataFrame,SparkDataFrame-method @@ -2344,15 +2344,18 @@ setMethod("join", if (is.null(joinType)) { sdf <- callJMethod(x@sdf, "join", y@sdf, joinExpr@jc) } else { -if (joinType %in% c("inner", "outer", "full", "fullouter", -"leftouter", "left_outer", "left", -"rightouter", "right_outer", "right", "leftsemi")) { +if (joinType %in% c("inner", "cross", +"outer", "full", "fullouter", "full_outer", +"left", "leftouter", "left_outer", +"right", "rightouter", "right_outer", +"left_semi", "leftsemi", "left_anti", "leftanti")) { joinType <- gsub("_", "", joinType) sdf <- callJMethod(x@sdf, "join", y@sdf, joinExpr@jc, joinType) } else { stop("joinType must be one of the following types: ", - "'inner', 'outer', 'full', 'fullouter', 'leftouter', 'left_outer', 'left', - 'rightouter', 'right_outer', 'right', 'leftsemi'") + "'inner', 'cross', 'outer', 'full', 'full_outer',", + "'left', 'left_outer', 'right', 'right_outer',", + "'left_semi', or 'left_anti'.") } } } http://git-wip-us.apache.org/repos/asf/spark/blob/19d9d4c8/python/pyspark/sql/dataframe.py -- diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index b9d9038..10e42d0 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -730,8 +730,9 @@ class DataFrame(object): a join expression (Column), or a list of Columns. 
If `on` is a string or a list of strings indicating the name of the join column(s), the column(s) must exist on both sides, and this performs an equi-join. -:param how: str, default 'inner'. -One
spark git commit: [SPARK-19126][DOCS] Update Join Documentation Across Languages
Repository: spark Updated Branches: refs/heads/branch-2.1 8690d4bd1 -> 8779e6a46 [SPARK-19126][DOCS] Update Join Documentation Across Languages ## What changes were proposed in this pull request? - [X] Make sure all join types are clearly mentioned - [X] Make join labeling/style consistent - [X] Make join label ordering docs the same - [X] Improve join documentation according to above for Scala - [X] Improve join documentation according to above for Python - [X] Improve join documentation according to above for R ## How was this patch tested? No tests b/c docs. Please review http://spark.apache.org/contributing.html before opening a pull request. Author: anabranchCloses #16504 from anabranch/SPARK-19126. (cherry picked from commit 19d9d4c855eab8f647a5ec66b079172de81221d0) Signed-off-by: Felix Cheung Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8779e6a4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8779e6a4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8779e6a4 Branch: refs/heads/branch-2.1 Commit: 8779e6a4685f50a7062842f0d5a606c3a3b092d5 Parents: 8690d4b Author: anabranch Authored: Sun Jan 8 20:37:46 2017 -0800 Committer: Felix Cheung Committed: Sun Jan 8 20:38:01 2017 -0800 -- R/pkg/R/DataFrame.R | 19 +++ python/pyspark/sql/dataframe.py | 5 +++-- .../scala/org/apache/spark/sql/Dataset.scala | 16 3 files changed, 26 insertions(+), 14 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8779e6a4/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 9a51d53..058a77e 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2307,9 +2307,9 @@ setMethod("dropDuplicates", #' @param joinExpr (Optional) The expression used to perform the join. joinExpr must be a #' Column expression. If joinExpr is omitted, the default, inner join is attempted and an error is #' thrown if it would be a Cartesian Product. 
For Cartesian join, use crossJoin instead. -#' @param joinType The type of join to perform. The following join types are available: -#' 'inner', 'outer', 'full', 'fullouter', leftouter', 'left_outer', 'left', -#' 'right_outer', 'rightouter', 'right', and 'leftsemi'. The default joinType is "inner". +#' @param joinType The type of join to perform, default 'inner'. +#' Must be one of: 'inner', 'cross', 'outer', 'full', 'full_outer', +#' 'left', 'left_outer', 'right', 'right_outer', 'left_semi', or 'left_anti'. #' @return A SparkDataFrame containing the result of the join operation. #' @family SparkDataFrame functions #' @aliases join,SparkDataFrame,SparkDataFrame-method @@ -2338,15 +2338,18 @@ setMethod("join", if (is.null(joinType)) { sdf <- callJMethod(x@sdf, "join", y@sdf, joinExpr@jc) } else { -if (joinType %in% c("inner", "outer", "full", "fullouter", -"leftouter", "left_outer", "left", -"rightouter", "right_outer", "right", "leftsemi")) { +if (joinType %in% c("inner", "cross", +"outer", "full", "fullouter", "full_outer", +"left", "leftouter", "left_outer", +"right", "rightouter", "right_outer", +"left_semi", "leftsemi", "left_anti", "leftanti")) { joinType <- gsub("_", "", joinType) sdf <- callJMethod(x@sdf, "join", y@sdf, joinExpr@jc, joinType) } else { stop("joinType must be one of the following types: ", - "'inner', 'outer', 'full', 'fullouter', 'leftouter', 'left_outer', 'left', - 'rightouter', 'right_outer', 'right', 'leftsemi'") + "'inner', 'cross', 'outer', 'full', 'full_outer',", + "'left', 'left_outer', 'right', 'right_outer',", + "'left_semi', or 'left_anti'.") } } } http://git-wip-us.apache.org/repos/asf/spark/blob/8779e6a4/python/pyspark/sql/dataframe.py -- diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index b9d9038..10e42d0 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -730,8 +730,9 @@ class DataFrame(object): a join expression (Column), or a list of Columns. 
If `on` is a string or a list of strings indicating the name of the join column(s), the
spark git commit: [SPARK-19127][DOCS] Update Rank Function Documentation
Repository: spark Updated Branches: refs/heads/branch-2.1 ecc16220d -> 8690d4bd1 [SPARK-19127][DOCS] Update Rank Function Documentation ## What changes were proposed in this pull request? - [X] Fix inconsistencies in function reference for dense rank and dense - [X] Make all languages equivalent in their reference to `dense_rank` and `rank`. ## How was this patch tested? N/A for docs. Please review http://spark.apache.org/contributing.html before opening a pull request. Author: anabranchCloses #16505 from anabranch/SPARK-19127. (cherry picked from commit 1f6ded6455d07ec8828fc9662ddffe55cbba4238) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8690d4bd Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8690d4bd Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8690d4bd Branch: refs/heads/branch-2.1 Commit: 8690d4bd150579e546aec7866b16a77bad1017f5 Parents: ecc1622 Author: anabranch Authored: Sun Jan 8 17:53:53 2017 -0800 Committer: Reynold Xin Committed: Sun Jan 8 17:53:59 2017 -0800 -- R/pkg/R/functions.R | 10 ++ python/pyspark/sql/functions.py | 16 ++-- .../main/scala/org/apache/spark/sql/functions.scala | 16 ++-- 3 files changed, 26 insertions(+), 16 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8690d4bd/R/pkg/R/functions.R -- diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index bf5c963..6ffa0f5 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -3150,7 +3150,8 @@ setMethod("cume_dist", #' The difference between rank and dense_rank is that dense_rank leaves no gaps in ranking #' sequence when there are ties. That is, if you were ranking a competition using dense_rank #' and had three people tie for second place, you would say that all three were in second -#' place and that the next person came in third. +#' place and that the next person came in third. 
Rank would give me sequential numbers, making +#' the person that came in third place (after the ties) would register as coming in fifth. #' #' This is equivalent to the \code{DENSE_RANK} function in SQL. #' @@ -3321,10 +3322,11 @@ setMethod("percent_rank", #' #' Window function: returns the rank of rows within a window partition. #' -#' The difference between rank and denseRank is that denseRank leaves no gaps in ranking -#' sequence when there are ties. That is, if you were ranking a competition using denseRank +#' The difference between rank and dense_rank is that dense_rank leaves no gaps in ranking +#' sequence when there are ties. That is, if you were ranking a competition using dense_rank #' and had three people tie for second place, you would say that all three were in second -#' place and that the next person came in third. +#' place and that the next person came in third. Rank would give me sequential numbers, making +#' the person that came in third place (after the ties) would register as coming in fifth. #' #' This is equivalent to the RANK function in SQL. #' http://git-wip-us.apache.org/repos/asf/spark/blob/8690d4bd/python/pyspark/sql/functions.py -- diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index d8abafc..7fe901a 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -157,17 +157,21 @@ _window_functions = { 'dense_rank': """returns the rank of rows within a window partition, without any gaps. -The difference between rank and denseRank is that denseRank leaves no gaps in ranking -sequence when there are ties. That is, if you were ranking a competition using denseRank +The difference between rank and dense_rank is that dense_rank leaves no gaps in ranking +sequence when there are ties. 
That is, if you were ranking a competition using dense_rank and had three people tie for second place, you would say that all three were in second -place and that the next person came in third.""", +place and that the next person came in third. Rank would give me sequential numbers, making +the person that came in third place (after the ties) would register as coming in fifth. + +This is equivalent to the DENSE_RANK function in SQL.""", 'rank': """returns the rank of rows within a window partition. -The difference between rank and denseRank is that denseRank leaves no gaps in ranking -sequence when there are ties. That is, if you
spark git commit: [SPARK-19127][DOCS] Update Rank Function Documentation
Repository: spark Updated Branches: refs/heads/master 4351e6220 -> 1f6ded645 [SPARK-19127][DOCS] Update Rank Function Documentation ## What changes were proposed in this pull request? - [X] Fix inconsistencies in function reference for dense rank and dense - [X] Make all languages equivalent in their reference to `dense_rank` and `rank`. ## How was this patch tested? N/A for docs. Please review http://spark.apache.org/contributing.html before opening a pull request. Author: anabranchCloses #16505 from anabranch/SPARK-19127. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1f6ded64 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1f6ded64 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1f6ded64 Branch: refs/heads/master Commit: 1f6ded6455d07ec8828fc9662ddffe55cbba4238 Parents: 4351e62 Author: anabranch Authored: Sun Jan 8 17:53:53 2017 -0800 Committer: Reynold Xin Committed: Sun Jan 8 17:53:53 2017 -0800 -- R/pkg/R/functions.R | 10 ++ python/pyspark/sql/functions.py | 16 ++-- .../main/scala/org/apache/spark/sql/functions.scala | 16 ++-- 3 files changed, 26 insertions(+), 16 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1f6ded64/R/pkg/R/functions.R -- diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index bf5c963..6ffa0f5 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -3150,7 +3150,8 @@ setMethod("cume_dist", #' The difference between rank and dense_rank is that dense_rank leaves no gaps in ranking #' sequence when there are ties. That is, if you were ranking a competition using dense_rank #' and had three people tie for second place, you would say that all three were in second -#' place and that the next person came in third. +#' place and that the next person came in third. Rank would give me sequential numbers, making +#' the person that came in third place (after the ties) would register as coming in fifth. 
#' #' This is equivalent to the \code{DENSE_RANK} function in SQL. #' @@ -3321,10 +3322,11 @@ setMethod("percent_rank", #' #' Window function: returns the rank of rows within a window partition. #' -#' The difference between rank and denseRank is that denseRank leaves no gaps in ranking -#' sequence when there are ties. That is, if you were ranking a competition using denseRank +#' The difference between rank and dense_rank is that dense_rank leaves no gaps in ranking +#' sequence when there are ties. That is, if you were ranking a competition using dense_rank #' and had three people tie for second place, you would say that all three were in second -#' place and that the next person came in third. +#' place and that the next person came in third. Rank would give me sequential numbers, making +#' the person that came in third place (after the ties) would register as coming in fifth. #' #' This is equivalent to the RANK function in SQL. #' http://git-wip-us.apache.org/repos/asf/spark/blob/1f6ded64/python/pyspark/sql/functions.py -- diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index d8abafc..7fe901a 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -157,17 +157,21 @@ _window_functions = { 'dense_rank': """returns the rank of rows within a window partition, without any gaps. -The difference between rank and denseRank is that denseRank leaves no gaps in ranking -sequence when there are ties. That is, if you were ranking a competition using denseRank +The difference between rank and dense_rank is that dense_rank leaves no gaps in ranking +sequence when there are ties. That is, if you were ranking a competition using dense_rank and had three people tie for second place, you would say that all three were in second -place and that the next person came in third.""", +place and that the next person came in third. 
Rank would give me sequential numbers, making +the person that came in third place (after the ties) would register as coming in fifth. + +This is equivalent to the DENSE_RANK function in SQL.""", 'rank': """returns the rank of rows within a window partition. -The difference between rank and denseRank is that denseRank leaves no gaps in ranking -sequence when there are ties. That is, if you were ranking a competition using denseRank +The difference between rank and dense_rank is that dense_rank leaves no
spark git commit: [SPARK-19093][SQL] Cached tables are not used in SubqueryExpression
Repository: spark Updated Branches: refs/heads/master cd1d00ada -> 4351e6220 [SPARK-19093][SQL] Cached tables are not used in SubqueryExpression ## What changes were proposed in this pull request? Consider the plans inside subquery expressions while looking up cache manager to make use of cached data. Currently CacheManager.useCachedData does not consider the subquery expressions in the plan. SQL ``` select * from rows where not exists (select * from rows) ``` Before the fix ``` == Optimized Logical Plan == Join LeftAnti :- InMemoryRelation [_1#3775, _2#3776], true, 1, StorageLevel(disk, memory, deserialized, 1 replicas) : +- *FileScan parquet [_1#3775,_2#3776] Batched: true, Format: Parquet, Location: InMemoryFileIndex[dbfs:/tmp/rows], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<_1:string,_2:string> +- Project [_1#3775 AS _1#3775#4001, _2#3776 AS _2#3776#4002] +- Relation[_1#3775,_2#3776] parquet ``` After ``` == Optimized Logical Plan == Join LeftAnti :- InMemoryRelation [_1#256, _2#257], true, 1, StorageLevel(disk, memory, deserialized, 1 replicas) : +- *FileScan parquet [_1#256,_2#257] Batched: true, Format: Parquet, Location: InMemoryFileIndex[file:/tmp/rows], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<_1:string,_2:string> +- Project [_1#256 AS _1#256#298, _2#257 AS _2#257#299] +- InMemoryRelation [_1#256, _2#257], true, 1, StorageLevel(disk, memory, deserialized, 1 replicas) +- *FileScan parquet [_1#256,_2#257] Batched: true, Format: Parquet, Location: InMemoryFileIndex[file:/tmp/rows], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<_1:string,_2:string> ``` Query2 ``` SELECT * FROM t1 WHERE c1 IN (SELECT c1 FROM t2 WHERE c1 IN (SELECT c1 FROM t3 WHERE c1 = 1)) ``` Before ``` == Analyzed Logical Plan == c1: int Project [c1#3] +- Filter predicate-subquery#47 [(c1#3 = c1#10)] : +- Project [c1#10] : +- Filter predicate-subquery#46 [(c1#10 = c1#17)] :: +- Project [c1#17] :: +- Filter (c1#17 = 1) ::+- SubqueryAlias t3, 
`t3` :: +- Project [value#15 AS c1#17] :: +- LocalRelation [value#15] :+- SubqueryAlias t2, `t2` : +- Project [value#8 AS c1#10] : +- LocalRelation [value#8] +- SubqueryAlias t1, `t1` +- Project [value#1 AS c1#3] +- LocalRelation [value#1] == Optimized Logical Plan == Join LeftSemi, (c1#3 = c1#10) :- InMemoryRelation [c1#3], true, 1, StorageLevel(disk, memory, deserialized, 1 replicas), t1 : +- LocalTableScan [c1#3] +- Project [value#8 AS c1#10] +- Join LeftSemi, (value#8 = c1#17) :- LocalRelation [value#8] +- Project [value#15 AS c1#17] +- Filter (value#15 = 1) +- LocalRelation [value#15] ``` After ``` == Analyzed Logical Plan == c1: int Project [c1#3] +- Filter predicate-subquery#47 [(c1#3 = c1#10)] : +- Project [c1#10] : +- Filter predicate-subquery#46 [(c1#10 = c1#17)] :: +- Project [c1#17] :: +- Filter (c1#17 = 1) ::+- SubqueryAlias t3, `t3` :: +- Project [value#15 AS c1#17] :: +- LocalRelation [value#15] :+- SubqueryAlias t2, `t2` : +- Project [value#8 AS c1#10] : +- LocalRelation [value#8] +- SubqueryAlias t1, `t1` +- Project [value#1 AS c1#3] +- LocalRelation [value#1] == Optimized Logical Plan == Join LeftSemi, (c1#3 = c1#10) :- InMemoryRelation [c1#3], true, 1, StorageLevel(disk, memory, deserialized, 1 replicas), t1 : +- LocalTableScan [c1#3] +- Join LeftSemi, (c1#10 = c1#17) :- InMemoryRelation [c1#10], true, 1, StorageLevel(disk, memory, deserialized, 1 replicas), t2 : +- LocalTableScan [c1#10] +- Filter (c1#17 = 1) +- InMemoryRelation [c1#17], true, 1, StorageLevel(disk, memory, deserialized, 1 replicas), t1 +- LocalTableScan [c1#3] ``` ## How was this patch tested? Added new tests in CachedTableSuite. Author: Dilip BiswalCloses #16493 from dilipbiswal/SPARK-19093. 
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4351e622 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4351e622 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4351e622 Branch: refs/heads/master Commit: 4351e62207957bec663108a571cff2bfaaa9e7d5 Parents: cd1d00a Author: Dilip Biswal Authored: Sun Jan 8 23:09:07 2017 +0100 Committer: Herman van Hovell Committed: Sun Jan 8 23:09:07 2017 +0100 -- .../spark/sql/execution/CacheManager.scala | 7 +- .../org/apache/spark/sql/CachedTableSuite.scala | 72 2 files changed, 78 insertions(+), 1 deletion(-)
spark git commit: [SPARK-19026] SPARK_LOCAL_DIRS(multiple directories on different disks) cannot be deleted
Repository: spark Updated Branches: refs/heads/master 6b6b555a1 -> cd1d00ada [SPARK-19026] SPARK_LOCAL_DIRS(multiple directories on different disks) cannot be deleted JIRA Issue: https://issues.apache.org/jira/browse/SPARK-19026 SPARK_LOCAL_DIRS (Standalone) can be a comma-separated list of multiple directories on different disks, e.g. SPARK_LOCAL_DIRS=/dir1,/dir2,/dir3, if there is a IOExecption when create sub directory on dir3 , the sub directory which have been created successfully on dir1 and dir2 cannot be deleted anymore when the application finishes. So we should catch the IOExecption at Utils.createDirectory , otherwise the variable "appDirectories(appId)" which the function maybeCleanupApplication calls will not be set then dir1 and dir2 will not be cleaned up . Author: zuotingbingCloses #16439 from zuotingbing/master. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/cd1d00ad Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/cd1d00ad Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/cd1d00ad Branch: refs/heads/master Commit: cd1d00adaff65e8adfebc2342dd422c53f98166b Parents: 6b6b555 Author: zuotingbing Authored: Sun Jan 8 09:29:01 2017 + Committer: Sean Owen Committed: Sun Jan 8 09:29:01 2017 + -- .../org/apache/spark/deploy/worker/Worker.scala | 25 +++- 1 file changed, 19 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/cd1d00ad/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala index f963a46..e48817e 100755 --- a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala @@ -445,12 +445,25 @@ private[deploy] class Worker( // Create local dirs for the executor. 
These are passed to the executor via the // SPARK_EXECUTOR_DIRS environment variable, and deleted by the Worker when the // application finishes. - val appLocalDirs = appDirectories.getOrElse(appId, -Utils.getOrCreateLocalRootDirs(conf).map { dir => - val appDir = Utils.createDirectory(dir, namePrefix = "executor") - Utils.chmod700(appDir) - appDir.getAbsolutePath() -}.toSeq) + val appLocalDirs = appDirectories.getOrElse(appId, { +val localRootDirs = Utils.getOrCreateLocalRootDirs(conf) +val dirs = localRootDirs.flatMap { dir => + try { +val appDir = Utils.createDirectory(dir, namePrefix = "executor") +Utils.chmod700(appDir) +Some(appDir.getAbsolutePath()) + } catch { +case e: IOException => + logWarning(s"${e.getMessage}. Ignoring this directory.") + None + } +}.toSeq +if (dirs.isEmpty) { + throw new IOException("No subfolder can be created in " + +s"${localRootDirs.mkString(",")}.") +} +dirs + }) appDirectories(appId) = appLocalDirs val manager = new ExecutorRunner( appId, - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[4/5] spark git commit: [SPARK-18862][SPARKR][ML] Split SparkR mllib.R into multiple files
http://git-wip-us.apache.org/repos/asf/spark/blob/6b6b555a/R/pkg/R/mllib.R -- diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R deleted file mode 100644 index d736bbb..000 --- a/R/pkg/R/mllib.R +++ /dev/null @@ -1,2114 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -#http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# mllib.R: Provides methods for MLlib integration - -# Integration with R's standard functions. -# Most of MLlib's argorithms are provided in two flavours: -# - a specialization of the default R methods (glm). These methods try to respect -# the inputs and the outputs of R's method to the largest extent, but some small differences -# may exist. -# - a set of methods that reflect the arguments of the other languages supported by Spark. These -# methods are prefixed with the `spark.` prefix: spark.glm, spark.kmeans, etc. 
- -#' S4 class that represents a generalized linear model -#' -#' @param jobj a Java object reference to the backing Scala GeneralizedLinearRegressionWrapper -#' @export -#' @note GeneralizedLinearRegressionModel since 2.0.0 -setClass("GeneralizedLinearRegressionModel", representation(jobj = "jobj")) - -#' S4 class that represents a NaiveBayesModel -#' -#' @param jobj a Java object reference to the backing Scala NaiveBayesWrapper -#' @export -#' @note NaiveBayesModel since 2.0.0 -setClass("NaiveBayesModel", representation(jobj = "jobj")) - -#' S4 class that represents an LDAModel -#' -#' @param jobj a Java object reference to the backing Scala LDAWrapper -#' @export -#' @note LDAModel since 2.1.0 -setClass("LDAModel", representation(jobj = "jobj")) - -#' S4 class that represents a AFTSurvivalRegressionModel -#' -#' @param jobj a Java object reference to the backing Scala AFTSurvivalRegressionWrapper -#' @export -#' @note AFTSurvivalRegressionModel since 2.0.0 -setClass("AFTSurvivalRegressionModel", representation(jobj = "jobj")) - -#' S4 class that represents a KMeansModel -#' -#' @param jobj a Java object reference to the backing Scala KMeansModel -#' @export -#' @note KMeansModel since 2.0.0 -setClass("KMeansModel", representation(jobj = "jobj")) - -#' S4 class that represents a MultilayerPerceptronClassificationModel -#' -#' @param jobj a Java object reference to the backing Scala MultilayerPerceptronClassifierWrapper -#' @export -#' @note MultilayerPerceptronClassificationModel since 2.1.0 -setClass("MultilayerPerceptronClassificationModel", representation(jobj = "jobj")) - -#' S4 class that represents an IsotonicRegressionModel -#' -#' @param jobj a Java object reference to the backing Scala IsotonicRegressionModel -#' @export -#' @note IsotonicRegressionModel since 2.1.0 -setClass("IsotonicRegressionModel", representation(jobj = "jobj")) - -#' S4 class that represents a GaussianMixtureModel -#' -#' @param jobj a Java object reference to the backing Scala 
GaussianMixtureModel -#' @export -#' @note GaussianMixtureModel since 2.1.0 -setClass("GaussianMixtureModel", representation(jobj = "jobj")) - -#' S4 class that represents an ALSModel -#' -#' @param jobj a Java object reference to the backing Scala ALSWrapper -#' @export -#' @note ALSModel since 2.1.0 -setClass("ALSModel", representation(jobj = "jobj")) - -#' S4 class that represents an KSTest -#' -#' @param jobj a Java object reference to the backing Scala KSTestWrapper -#' @export -#' @note KSTest since 2.1.0 -setClass("KSTest", representation(jobj = "jobj")) - -#' S4 class that represents an LogisticRegressionModel -#' -#' @param jobj a Java object reference to the backing Scala LogisticRegressionModel -#' @export -#' @note LogisticRegressionModel since 2.1.0 -setClass("LogisticRegressionModel", representation(jobj = "jobj")) - -#' S4 class that represents a RandomForestRegressionModel -#' -#' @param jobj a Java object reference to the backing Scala RandomForestRegressionModel -#' @export -#' @note RandomForestRegressionModel since 2.1.0 -setClass("RandomForestRegressionModel", representation(jobj = "jobj")) - -#' S4 class that represents a RandomForestClassificationModel -#' -#' @param jobj a Java object reference to the backing Scala RandomForestClassificationModel -#' @export -#' @note RandomForestClassificationModel since 2.1.0
[5/5] spark git commit: [SPARK-18862][SPARKR][ML] Split SparkR mllib.R into multiple files
[SPARK-18862][SPARKR][ML] Split SparkR mllib.R into multiple files ## What changes were proposed in this pull request? SparkR ```mllib.R``` is getting bigger as we add more ML wrappers, I'd like to split it into multiple files to make us easy to maintain: * mllib_classification.R * mllib_clustering.R * mllib_recommendation.R * mllib_regression.R * mllib_stat.R * mllib_tree.R * mllib_utils.R Note: Only reorg, no actual code change. ## How was this patch tested? Existing tests. Author: Yanbo LiangCloses #16312 from yanboliang/spark-18862. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6b6b555a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6b6b555a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6b6b555a Branch: refs/heads/master Commit: 6b6b555a1e667a9f03dfe4a21e56c513a353a58d Parents: 923e594 Author: Yanbo Liang Authored: Sun Jan 8 01:10:36 2017 -0800 Committer: Yanbo Liang Committed: Sun Jan 8 01:10:36 2017 -0800 -- R/pkg/DESCRIPTION |8 +- R/pkg/R/mllib.R | 2114 -- R/pkg/R/mllib_classification.R | 417 R/pkg/R/mllib_clustering.R | 456 R/pkg/R/mllib_recommendation.R | 162 ++ R/pkg/R/mllib_regression.R | 448 R/pkg/R/mllib_stat.R| 127 ++ R/pkg/R/mllib_tree.R| 496 R/pkg/R/mllib_utils.R | 119 + R/pkg/inst/tests/testthat/test_mllib.R | 1170 -- .../tests/testthat/test_mllib_classification.R | 341 +++ .../inst/tests/testthat/test_mllib_clustering.R | 224 ++ .../tests/testthat/test_mllib_recommendation.R | 65 + .../inst/tests/testthat/test_mllib_regression.R | 417 R/pkg/inst/tests/testthat/test_mllib_stat.R | 53 + R/pkg/inst/tests/testthat/test_mllib_tree.R | 203 ++ 16 files changed, 3535 insertions(+), 3285 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6b6b555a/R/pkg/DESCRIPTION -- diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 6a36748..cc471ed 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -41,7 +41,13 @@ Collate: 'functions.R' 'install.R' 
'jvm.R' -'mllib.R' +'mllib_classification.R' +'mllib_clustering.R' +'mllib_recommendation.R' +'mllib_regression.R' +'mllib_stat.R' +'mllib_tree.R' +'mllib_utils.R' 'serialize.R' 'sparkR.R' 'stats.R' - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[1/5] spark git commit: [SPARK-18862][SPARKR][ML] Split SparkR mllib.R into multiple files
Repository: spark Updated Branches: refs/heads/master 923e59484 -> 6b6b555a1 http://git-wip-us.apache.org/repos/asf/spark/blob/6b6b555a/R/pkg/inst/tests/testthat/test_mllib_classification.R -- diff --git a/R/pkg/inst/tests/testthat/test_mllib_classification.R b/R/pkg/inst/tests/testthat/test_mllib_classification.R new file mode 100644 index 000..2e0dea3 --- /dev/null +++ b/R/pkg/inst/tests/testthat/test_mllib_classification.R @@ -0,0 +1,341 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +library(testthat) + +context("MLlib classification algorithms, except for tree-based algorithms") + +# Tests for MLlib classification algorithms in SparkR +sparkSession <- sparkR.session(enableHiveSupport = FALSE) + +absoluteSparkPath <- function(x) { + sparkHome <- sparkR.conf("spark.home") + file.path(sparkHome, x) +} + +test_that("spark.logit", { + # R code to reproduce the result. 
+ # nolint start + #' library(glmnet) + #' iris.x = as.matrix(iris[, 1:4]) + #' iris.y = as.factor(as.character(iris[, 5])) + #' logit = glmnet(iris.x, iris.y, family="multinomial", alpha=0, lambda=0.5) + #' coef(logit) + # + # $setosa + # 5 x 1 sparse Matrix of class "dgCMatrix" + # s0 + # 1.0981324 + # Sepal.Length -0.2909860 + # Sepal.Width 0.5510907 + # Petal.Length -0.1915217 + # Petal.Width -0.4211946 + # + # $versicolor + # 5 x 1 sparse Matrix of class "dgCMatrix" + # s0 + # 1.520061e+00 + # Sepal.Length 2.524501e-02 + # Sepal.Width -5.310313e-01 + # Petal.Length 3.656543e-02 + # Petal.Width -3.144464e-05 + # + # $virginica + # 5 x 1 sparse Matrix of class "dgCMatrix" + # s0 + # -2.61819385 + # Sepal.Length 0.26574097 + # Sepal.Width -0.02005932 + # Petal.Length 0.15495629 + # Petal.Width 0.42122607 + # nolint end + + # Test multinomial logistic regression againt three classes + df <- suppressWarnings(createDataFrame(iris)) + model <- spark.logit(df, Species ~ ., regParam = 0.5) + summary <- summary(model) + versicolorCoefsR <- c(1.52, 0.03, -0.53, 0.04, 0.00) + virginicaCoefsR <- c(-2.62, 0.27, -0.02, 0.16, 0.42) + setosaCoefsR <- c(1.10, -0.29, 0.55, -0.19, -0.42) + versicolorCoefs <- unlist(summary$coefficients[, "versicolor"]) + virginicaCoefs <- unlist(summary$coefficients[, "virginica"]) + setosaCoefs <- unlist(summary$coefficients[, "setosa"]) + expect_true(all(abs(versicolorCoefsR - versicolorCoefs) < 0.1)) + expect_true(all(abs(virginicaCoefsR - virginicaCoefs) < 0.1)) + expect_true(all(abs(setosaCoefs - setosaCoefs) < 0.1)) + + # Test model save and load + modelPath <- tempfile(pattern = "spark-logit", fileext = ".tmp") + write.ml(model, modelPath) + expect_error(write.ml(model, modelPath)) + write.ml(model, modelPath, overwrite = TRUE) + model2 <- read.ml(modelPath) + coefs <- summary(model)$coefficients + coefs2 <- summary(model2)$coefficients + expect_equal(coefs, coefs2) + unlink(modelPath) + + # R code to reproduce the result. 
+ # nolint start + #' library(glmnet) + #' iris2 <- iris[iris$Species %in% c("versicolor", "virginica"), ] + #' iris.x = as.matrix(iris2[, 1:4]) + #' iris.y = as.factor(as.character(iris2[, 5])) + #' logit = glmnet(iris.x, iris.y, family="multinomial", alpha=0, lambda=0.5) + #' coef(logit) + # + # $versicolor + # 5 x 1 sparse Matrix of class "dgCMatrix" + # s0 + # 3.93844796 + # Sepal.Length -0.13538675 + # Sepal.Width -0.02386443 + # Petal.Length -0.35076451 + # Petal.Width -0.77971954 + # + # $virginica + # 5 x 1 sparse Matrix of class "dgCMatrix" + # s0 + # -3.93844796 + # Sepal.Length 0.13538675 + # Sepal.Width 0.02386443 + # Petal.Length 0.35076451 + # Petal.Width 0.77971954 + # + #' logit = glmnet(iris.x, iris.y, family="binomial", alpha=0, lambda=0.5) + #' coef(logit) + # + # 5 x 1 sparse Matrix of class "dgCMatrix" + # s0 + # (Intercept) -6.0824412 + # Sepal.Length 0.2458260 + # Sepal.Width 0.1642093 + # Petal.Length 0.4759487 + # Petal.Width 1.0383948 + # + # nolint end + + # Test multinomial logistic regression againt two classes + df <- suppressWarnings(createDataFrame(iris))
[3/5] spark git commit: [SPARK-18862][SPARKR][ML] Split SparkR mllib.R into multiple files
http://git-wip-us.apache.org/repos/asf/spark/blob/6b6b555a/R/pkg/R/mllib_classification.R -- diff --git a/R/pkg/R/mllib_classification.R b/R/pkg/R/mllib_classification.R new file mode 100644 index 000..8da8449 --- /dev/null +++ b/R/pkg/R/mllib_classification.R @@ -0,0 +1,417 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +# mllib_regression.R: Provides methods for MLlib classification algorithms +# (except for tree-based algorithms) integration + +#' S4 class that represents an LogisticRegressionModel +#' +#' @param jobj a Java object reference to the backing Scala LogisticRegressionModel +#' @export +#' @note LogisticRegressionModel since 2.1.0 +setClass("LogisticRegressionModel", representation(jobj = "jobj")) + +#' S4 class that represents a MultilayerPerceptronClassificationModel +#' +#' @param jobj a Java object reference to the backing Scala MultilayerPerceptronClassifierWrapper +#' @export +#' @note MultilayerPerceptronClassificationModel since 2.1.0 +setClass("MultilayerPerceptronClassificationModel", representation(jobj = "jobj")) + +#' S4 class that represents a NaiveBayesModel +#' +#' @param jobj a Java object reference to the backing Scala NaiveBayesWrapper +#' @export +#' @note NaiveBayesModel since 2.0.0 +setClass("NaiveBayesModel", representation(jobj = "jobj")) + +#' Logistic Regression Model +#' +#' Fits an logistic regression model against a Spark DataFrame. It supports "binomial": Binary logistic regression +#' with pivoting; "multinomial": Multinomial logistic (softmax) regression without pivoting, similar to glmnet. +#' Users can print, make predictions on the produced model and save the model to the input path. +#' +#' @param data SparkDataFrame for training. +#' @param formula A symbolic description of the model to be fitted. Currently only a few formula +#'operators are supported, including '~', '.', ':', '+', and '-'. +#' @param regParam the regularization parameter. +#' @param elasticNetParam the ElasticNet mixing parameter. For alpha = 0.0, the penalty is an L2 penalty. +#'For alpha = 1.0, it is an L1 penalty. For 0.0 < alpha < 1.0, the penalty is a combination +#'of L1 and L2. Default is 0.0 which is an L2 penalty. +#' @param maxIter maximum iteration number. +#' @param tol convergence tolerance of iterations. 
+#' @param family the name of family which is a description of the label distribution to be used in the model. +#' Supported options: +#' \itemize{ +#' \item{"auto": Automatically select the family based on the number of classes: +#' If number of classes == 1 || number of classes == 2, set to "binomial". +#' Else, set to "multinomial".} +#' \item{"binomial": Binary logistic regression with pivoting.} +#' \item{"multinomial": Multinomial logistic (softmax) regression without pivoting.} +#' } +#' @param standardization whether to standardize the training features before fitting the model. The coefficients +#'of models will be always returned on the original scale, so it will be transparent for +#'users. Note that with/without standardization, the models should be always converged +#'to the same solution when no regularization is applied. Default is TRUE, same as glmnet. +#' @param thresholds in binary classification, in range [0, 1]. If the estimated probability of class label 1 +#' is > threshold, then predict 1, else 0. A high threshold encourages the model to predict 0 +#' more often; a low threshold encourages the model to predict 1 more often. Note: Setting this with +#' threshold p is equivalent to setting thresholds c(1-p, p). In multiclass (or binary) classification to adjust the probability of +#' predicting each class. Array must have length equal to the number of classes, with values > 0, +#' excepting that at most one value may be 0. The
[2/5] spark git commit: [SPARK-18862][SPARKR][ML] Split SparkR mllib.R into multiple files
http://git-wip-us.apache.org/repos/asf/spark/blob/6b6b555a/R/pkg/R/mllib_tree.R -- diff --git a/R/pkg/R/mllib_tree.R b/R/pkg/R/mllib_tree.R new file mode 100644 index 000..0d53fad --- /dev/null +++ b/R/pkg/R/mllib_tree.R @@ -0,0 +1,496 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +# mllib_tree.R: Provides methods for MLlib tree-based algorithms integration + +#' S4 class that represents a GBTRegressionModel +#' +#' @param jobj a Java object reference to the backing Scala GBTRegressionModel +#' @export +#' @note GBTRegressionModel since 2.1.0 +setClass("GBTRegressionModel", representation(jobj = "jobj")) + +#' S4 class that represents a GBTClassificationModel +#' +#' @param jobj a Java object reference to the backing Scala GBTClassificationModel +#' @export +#' @note GBTClassificationModel since 2.1.0 +setClass("GBTClassificationModel", representation(jobj = "jobj")) + +#' S4 class that represents a RandomForestRegressionModel +#' +#' @param jobj a Java object reference to the backing Scala RandomForestRegressionModel +#' @export +#' @note RandomForestRegressionModel since 2.1.0 +setClass("RandomForestRegressionModel", representation(jobj = "jobj")) + +#' S4 class that represents a RandomForestClassificationModel +#' +#' @param jobj a Java object reference to the backing Scala RandomForestClassificationModel +#' @export +#' @note RandomForestClassificationModel since 2.1.0 +setClass("RandomForestClassificationModel", representation(jobj = "jobj")) + +# Create the summary of a tree ensemble model (eg. Random Forest, GBT) +summary.treeEnsemble <- function(model) { + jobj <- model@jobj + formula <- callJMethod(jobj, "formula") + numFeatures <- callJMethod(jobj, "numFeatures") + features <- callJMethod(jobj, "features") + featureImportances <- callJMethod(callJMethod(jobj, "featureImportances"), "toString") + numTrees <- callJMethod(jobj, "numTrees") + treeWeights <- callJMethod(jobj, "treeWeights") + list(formula = formula, + numFeatures = numFeatures, + features = features, + featureImportances = featureImportances, + numTrees = numTrees, + treeWeights = treeWeights, + jobj = jobj) +} + +# Prints the summary of tree ensemble models (eg. 
Random Forest, GBT) +print.summary.treeEnsemble <- function(x) { + jobj <- x$jobj + cat("Formula: ", x$formula) + cat("\nNumber of features: ", x$numFeatures) + cat("\nFeatures: ", unlist(x$features)) + cat("\nFeature importances: ", x$featureImportances) + cat("\nNumber of trees: ", x$numTrees) + cat("\nTree weights: ", unlist(x$treeWeights)) + + summaryStr <- callJMethod(jobj, "summary") + cat("\n", summaryStr, "\n") + invisible(x) +} + +#' Gradient Boosted Tree Model for Regression and Classification +#' +#' \code{spark.gbt} fits a Gradient Boosted Tree Regression model or Classification model on a +#' SparkDataFrame. Users can call \code{summary} to get a summary of the fitted +#' Gradient Boosted Tree model, \code{predict} to make predictions on new data, and +#' \code{write.ml}/\code{read.ml} to save/load fitted models. +#' For more details, see +#' \href{http://spark.apache.org/docs/latest/ml-classification-regression.html#gradient-boosted-tree-regression}{ +#' GBT Regression} and +#' \href{http://spark.apache.org/docs/latest/ml-classification-regression.html#gradient-boosted-tree-classifier}{ +#' GBT Classification} +#' +#' @param data a SparkDataFrame for training. +#' @param formula a symbolic description of the model to be fitted. Currently only a few formula +#'operators are supported, including '~', ':', '+', and '-'. +#' @param type type of model, one of "regression" or "classification", to fit +#' @param maxDepth Maximum depth of the tree (>= 0). +#' @param maxBins Maximum number of bins used for discretizing continuous features and for choosing +#'how to split on features at each node. More bins give higher granularity. Must be +#'>= 2 and >= number of categories in any categorical feature. +#' @param maxIter Param for maximum number of iterations (>= 0). +#' @param stepSize Param for Step size to be used for each iteration of