spark git commit: [SPARK-15706][SQL] Fix Wrong Answer when using IF NOT EXISTS in INSERT OVERWRITE for DYNAMIC PARTITION
Repository: spark Updated Branches: refs/heads/master 5ada60614 -> e5d703bca [SPARK-15706][SQL] Fix Wrong Answer when using IF NOT EXISTS in INSERT OVERWRITE for DYNAMIC PARTITION What changes were proposed in this pull request? `IF NOT EXISTS` in `INSERT OVERWRITE` should not support dynamic partitions. If we specify `IF NOT EXISTS`, the inserted statement is not shown in the table. This PR is to issue an exception in this case, just like what Hive does. Also issue an exception if users specify `IF NOT EXISTS` if users do not specify any `PARTITION` specification. How was this patch tested? Added test cases into `PlanParserSuite` and `InsertIntoHiveTableSuite` Author: gatorsmile Closes #13447 from gatorsmile/insertIfNotExist. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e5d703bc Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e5d703bc Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e5d703bc Branch: refs/heads/master Commit: e5d703bca85c65ce329b1e202283cfa35d109146 Parents: 5ada606 Author: gatorsmile Authored: Thu Jun 16 22:54:02 2016 -0700 Committer: Yin Huai Committed: Thu Jun 16 22:54:02 2016 -0700 -- .../apache/spark/sql/catalyst/parser/SqlBase.g4 | 2 +- .../spark/sql/catalyst/parser/AstBuilder.scala | 6 ++ .../plans/logical/basicLogicalOperators.scala | 1 + .../sql/catalyst/parser/PlanParserSuite.scala | 13 ++-- .../sql/hive/InsertIntoHiveTableSuite.scala | 68 5 files changed, 85 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e5d703bc/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 -- diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 index b603196..23e925e 100644 --- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 +++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 @@ -203,7 +203,7 @@ query ; insertInto -: INSERT OVERWRITE TABLE tableIdentifier partitionSpec? (IF NOT EXISTS)? +: INSERT OVERWRITE TABLE tableIdentifier (partitionSpec (IF NOT EXISTS)?)? | INSERT INTO TABLE? tableIdentifier partitionSpec? ; http://git-wip-us.apache.org/repos/asf/spark/blob/e5d703bc/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index e380643..c7420a1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -171,6 +171,12 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with Logging { val tableIdent = visitTableIdentifier(ctx.tableIdentifier) val partitionKeys = Option(ctx.partitionSpec).map(visitPartitionSpec).getOrElse(Map.empty) +val dynamicPartitionKeys = partitionKeys.filter(_._2.isEmpty) +if (ctx.EXISTS != null && dynamicPartitionKeys.nonEmpty) { + throw new ParseException(s"Dynamic partitions do not support IF NOT EXISTS. 
Specified " + +"partitions with value: " + dynamicPartitionKeys.keys.mkString("[", ",", "]"), ctx) +} + InsertIntoTable( UnresolvedRelation(tableIdent, None), partitionKeys, http://git-wip-us.apache.org/repos/asf/spark/blob/e5d703bc/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala index 898784d..6c3eb3a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala @@ -377,6 +377,7 @@ case class InsertIntoTable( } assert(overwrite || !ifNotExists) + assert(partition.values.forall(_.nonEmpty) || !ifNotExists) override lazy val resolved: Boolean = childrenResolved && table.resolved && expectedColumns.forall { expected => child.output.size == expected.size && child.output.zip(expected).forall { http://git-wip-us.apache.org/repos/asf/spark/blob/e5d703bc/sql/catalyst/src/test/scala/or
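A minimal Scala sketch of the parser behavior after this change. It assumes a Hive-enabled `SparkSession` named `spark`, a table `records` partitioned by a column `part`, and a `source` table to select from; all of these names are placeholders, not taken from the patch.

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().enableHiveSupport().getOrCreate()

// Static partition spec: IF NOT EXISTS is still accepted by the grammar.
spark.sql(
  "INSERT OVERWRITE TABLE records PARTITION (part = 1) IF NOT EXISTS " +
    "SELECT key, value FROM source")

// Dynamic partition spec: the AstBuilder now throws a ParseException instead of
// silently producing a wrong answer.
// spark.sql(
//   "INSERT OVERWRITE TABLE records PARTITION (part) IF NOT EXISTS " +
//     "SELECT key, value, part FROM source")

// No PARTITION clause at all: IF NOT EXISTS no longer parses, because the
// optional (IF NOT EXISTS)? now sits inside the partitionSpec group.
// spark.sql("INSERT OVERWRITE TABLE records IF NOT EXISTS SELECT key, value FROM source")
```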
spark git commit: [SPARK-15706][SQL] Fix Wrong Answer when using IF NOT EXISTS in INSERT OVERWRITE for DYNAMIC PARTITION
Repository: spark Updated Branches: refs/heads/branch-2.0 3994372f4 -> b82abde06 [SPARK-15706][SQL] Fix Wrong Answer when using IF NOT EXISTS in INSERT OVERWRITE for DYNAMIC PARTITION What changes were proposed in this pull request? `IF NOT EXISTS` in `INSERT OVERWRITE` should not support dynamic partitions. If we specify `IF NOT EXISTS`, the inserted statement is not shown in the table. This PR is to issue an exception in this case, just like what Hive does. Also issue an exception if users specify `IF NOT EXISTS` if users do not specify any `PARTITION` specification. How was this patch tested? Added test cases into `PlanParserSuite` and `InsertIntoHiveTableSuite` Author: gatorsmile Closes #13447 from gatorsmile/insertIfNotExist. (cherry picked from commit e5d703bca85c65ce329b1e202283cfa35d109146) Signed-off-by: Yin Huai Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b82abde0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b82abde0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b82abde0 Branch: refs/heads/branch-2.0 Commit: b82abde060d97bd95f4fba547545a830602a35fa Parents: 3994372 Author: gatorsmile Authored: Thu Jun 16 22:54:02 2016 -0700 Committer: Yin Huai Committed: Thu Jun 16 22:54:17 2016 -0700 -- .../apache/spark/sql/catalyst/parser/SqlBase.g4 | 2 +- .../spark/sql/catalyst/parser/AstBuilder.scala | 6 ++ .../plans/logical/basicLogicalOperators.scala | 1 + .../sql/catalyst/parser/PlanParserSuite.scala | 13 ++-- .../sql/hive/InsertIntoHiveTableSuite.scala | 68 5 files changed, 85 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b82abde0/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 -- diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 index b603196..23e925e 100644 --- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 +++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 @@ -203,7 +203,7 @@ query ; insertInto -: INSERT OVERWRITE TABLE tableIdentifier partitionSpec? (IF NOT EXISTS)? +: INSERT OVERWRITE TABLE tableIdentifier (partitionSpec (IF NOT EXISTS)?)? | INSERT INTO TABLE? tableIdentifier partitionSpec? ; http://git-wip-us.apache.org/repos/asf/spark/blob/b82abde0/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index e380643..c7420a1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -171,6 +171,12 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with Logging { val tableIdent = visitTableIdentifier(ctx.tableIdentifier) val partitionKeys = Option(ctx.partitionSpec).map(visitPartitionSpec).getOrElse(Map.empty) +val dynamicPartitionKeys = partitionKeys.filter(_._2.isEmpty) +if (ctx.EXISTS != null && dynamicPartitionKeys.nonEmpty) { + throw new ParseException(s"Dynamic partitions do not support IF NOT EXISTS. 
Specified " + +"partitions with value: " + dynamicPartitionKeys.keys.mkString("[", ",", "]"), ctx) +} + InsertIntoTable( UnresolvedRelation(tableIdent, None), partitionKeys, http://git-wip-us.apache.org/repos/asf/spark/blob/b82abde0/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala index 898784d..6c3eb3a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala @@ -377,6 +377,7 @@ case class InsertIntoTable( } assert(overwrite || !ifNotExists) + assert(partition.values.forall(_.nonEmpty) || !ifNotExists) override lazy val resolved: Boolean = childrenResolved && table.resolved && expectedColumns.forall { expected => child.output.size == expected.size && child.output.zip(expect
spark git commit: [SPARK-15822] [SQL] Prevent byte array backed classes from referencing freed memory
Repository: spark Updated Branches: refs/heads/branch-2.0 f530331e6 -> 3994372f4 [SPARK-15822] [SQL] Prevent byte array backed classes from referencing freed memory ## What changes were proposed in this pull request? `UTF8String` and all `Unsafe*` classes are backed by either on-heap or off-heap byte arrays. The code generated version `SortMergeJoin` buffers the left hand side join keys during iteration. This was actually problematic in off-heap mode when one of the keys is a `UTF8String` (or any other 'Unsafe*` object) and the left hand side iterator was exhausted (and released its memory); the buffered keys would reference freed memory. This causes Seg-faults and all kinds of other undefined behavior when we would use one these buffered keys. This PR fixes this problem by creating copies of the buffered variables. I have added a general method to the `CodeGenerator` for this. I have checked all places in which this could happen, and only `SortMergeJoin` had this problem. This PR is largely based on the work of robbinspg and he should be credited for this. closes https://github.com/apache/spark/pull/13707 ## How was this patch tested? Manually tested on problematic workloads. Author: Pete Robbins Author: Herman van Hovell Closes #13723 from hvanhovell/SPARK-15822-2. (cherry picked from commit 5ada606144c7bf38a797764619d7d1ff677802b3) Signed-off-by: Davies Liu Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3994372f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3994372f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3994372f Branch: refs/heads/branch-2.0 Commit: 3994372f48eefe080ce7a80750ccf960e3a7968b Parents: f530331 Author: Pete Robbins Authored: Thu Jun 16 22:27:32 2016 -0700 Committer: Davies Liu Committed: Thu Jun 16 22:27:43 2016 -0700 -- .../expressions/codegen/CodeGenerator.scala | 16 .../sql/execution/joins/SortMergeJoinExec.scala | 8 +--- 2 files changed, 17 insertions(+), 7 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3994372f/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala index ff97cd3..6392ff4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala @@ -130,6 +130,22 @@ class CodegenContext { mutableStates += ((javaType, variableName, initCode)) } + /** + * Add buffer variable which stores data coming from an [[InternalRow]]. This methods guarantees + * that the variable is safely stored, which is important for (potentially) byte array backed + * data types like: UTF8String, ArrayData, MapData & InternalRow. + */ + def addBufferedState(dataType: DataType, variableName: String, initCode: String): ExprCode = { +val value = freshName(variableName) +addMutableState(javaType(dataType), value, "") +val code = dataType match { + case StringType => s"$value = $initCode.clone();" + case _: StructType | _: ArrayType | _: MapType => s"$value = $initCode.copy();" + case _ => s"$value = $initCode;" +} +ExprCode(code, "false", value) + } + def declareMutableStates(): String = { // It's possible that we add same mutable state twice, e.g. 
the `mergeExpressions` in // `TypedAggregateExpression`, we should call `distinct` here to remove the duplicated ones. http://git-wip-us.apache.org/repos/asf/spark/blob/3994372f/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala index 32f0bc5..fac6b8d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala @@ -336,13 +336,7 @@ case class SortMergeJoinExec( private def copyKeys(ctx: CodegenContext, vars: Seq[ExprCode]): Seq[ExprCode] = { vars.zipWithIndex.map { case (ev, i) => - val value = ctx.freshName("value") - ctx.addMutableState(ctx.javaType(leftKeys(i).dataType), value, "") - val code = -s""" - |$value = ${ev.value}; - """.st
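A short sketch of how a code-generating operator can use the new `addBufferedState` helper. The method signature comes from the diff above; the surrounding function and variable names are illustrative placeholders, not code from `SortMergeJoinExec`.

```scala
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}
import org.apache.spark.sql.types.StringType

// Buffer a string-typed key value in generated code. For StringType the helper
// emits `<name> = <init>.clone();`, for struct/array/map types it emits `.copy()`,
// and primitives get a plain assignment, so the buffered value no longer points
// into memory that the producing iterator may already have freed.
def bufferKey(ctx: CodegenContext, keyInitCode: String): ExprCode =
  ctx.addBufferedState(StringType, "bufferedKey", keyInitCode)
```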
spark git commit: [SPARK-15822] [SQL] Prevent byte array backed classes from referencing freed memory
Repository: spark Updated Branches: refs/heads/master 513a03e41 -> 5ada60614 [SPARK-15822] [SQL] Prevent byte array backed classes from referencing freed memory ## What changes were proposed in this pull request? `UTF8String` and all `Unsafe*` classes are backed by either on-heap or off-heap byte arrays. The code generated version `SortMergeJoin` buffers the left hand side join keys during iteration. This was actually problematic in off-heap mode when one of the keys is a `UTF8String` (or any other 'Unsafe*` object) and the left hand side iterator was exhausted (and released its memory); the buffered keys would reference freed memory. This causes Seg-faults and all kinds of other undefined behavior when we would use one these buffered keys. This PR fixes this problem by creating copies of the buffered variables. I have added a general method to the `CodeGenerator` for this. I have checked all places in which this could happen, and only `SortMergeJoin` had this problem. This PR is largely based on the work of robbinspg and he should be credited for this. closes https://github.com/apache/spark/pull/13707 ## How was this patch tested? Manually tested on problematic workloads. Author: Pete Robbins Author: Herman van Hovell Closes #13723 from hvanhovell/SPARK-15822-2. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5ada6061 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5ada6061 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5ada6061 Branch: refs/heads/master Commit: 5ada606144c7bf38a797764619d7d1ff677802b3 Parents: 513a03e Author: Pete Robbins Authored: Thu Jun 16 22:27:32 2016 -0700 Committer: Davies Liu Committed: Thu Jun 16 22:27:32 2016 -0700 -- .../expressions/codegen/CodeGenerator.scala | 16 .../sql/execution/joins/SortMergeJoinExec.scala | 8 +--- 2 files changed, 17 insertions(+), 7 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5ada6061/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala index ff97cd3..6392ff4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala @@ -130,6 +130,22 @@ class CodegenContext { mutableStates += ((javaType, variableName, initCode)) } + /** + * Add buffer variable which stores data coming from an [[InternalRow]]. This methods guarantees + * that the variable is safely stored, which is important for (potentially) byte array backed + * data types like: UTF8String, ArrayData, MapData & InternalRow. + */ + def addBufferedState(dataType: DataType, variableName: String, initCode: String): ExprCode = { +val value = freshName(variableName) +addMutableState(javaType(dataType), value, "") +val code = dataType match { + case StringType => s"$value = $initCode.clone();" + case _: StructType | _: ArrayType | _: MapType => s"$value = $initCode.copy();" + case _ => s"$value = $initCode;" +} +ExprCode(code, "false", value) + } + def declareMutableStates(): String = { // It's possible that we add same mutable state twice, e.g. the `mergeExpressions` in // `TypedAggregateExpression`, we should call `distinct` here to remove the duplicated ones. 
http://git-wip-us.apache.org/repos/asf/spark/blob/5ada6061/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala index 32f0bc5..fac6b8d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala @@ -336,13 +336,7 @@ case class SortMergeJoinExec( private def copyKeys(ctx: CodegenContext, vars: Seq[ExprCode]): Seq[ExprCode] = { vars.zipWithIndex.map { case (ev, i) => - val value = ctx.freshName("value") - ctx.addMutableState(ctx.javaType(leftKeys(i).dataType), value, "") - val code = -s""" - |$value = ${ev.value}; - """.stripMargin - ExprCode(code, "false", value) + ctx.addBufferedState(leftKeys(i).dataType, "value"
spark git commit: [SPARK-15908][R] Add varargs-type dropDuplicates() function in SparkR
Repository: spark Updated Branches: refs/heads/branch-2.0 2127f99f2 -> f530331e6 [SPARK-15908][R] Add varargs-type dropDuplicates() function in SparkR ## What changes were proposed in this pull request? This PR adds varargs-type `dropDuplicates` function to SparkR for API parity. Refer to https://issues.apache.org/jira/browse/SPARK-15807, too. ## How was this patch tested? Pass the Jenkins tests with new testcases. Author: Dongjoon Hyun Closes #13684 from dongjoon-hyun/SPARK-15908. (cherry picked from commit 513a03e41e27d9c5f70911faccc5d3aecd8bdde9) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f530331e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f530331e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f530331e Branch: refs/heads/branch-2.0 Commit: f530331e6f8160f3fb2613722fae01ea589f0e99 Parents: 2127f99 Author: Dongjoon Hyun Authored: Thu Jun 16 20:35:17 2016 -0700 Committer: Shivaram Venkataraman Committed: Thu Jun 16 20:35:25 2016 -0700 -- R/pkg/R/DataFrame.R | 25 +++-- R/pkg/R/generics.R| 7 ++- R/pkg/inst/tests/testthat/test_sparkSQL.R | 8 3 files changed, 29 insertions(+), 11 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f530331e/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index d72cbbd..c710bff 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -1936,10 +1936,11 @@ setMethod("where", #' the subset of columns. #' #' @param x A SparkDataFrame. -#' @param colnames A character vector of column names. +#' @param ... A character vector of column names or string column names. +#'If the first argument contains a character vector, the followings are ignored. #' @return A SparkDataFrame with duplicate rows removed. #' @family SparkDataFrame functions -#' @rdname dropduplicates +#' @rdname dropDuplicates #' @name dropDuplicates #' @export #' @examples @@ -1949,14 +1950,26 @@ setMethod("where", #' path <- "path/to/file.json" #' df <- read.json(path) #' dropDuplicates(df) +#' dropDuplicates(df, "col1", "col2") #' dropDuplicates(df, c("col1", "col2")) #' } setMethod("dropDuplicates", signature(x = "SparkDataFrame"), - function(x, colNames = columns(x)) { -stopifnot(class(colNames) == "character") - -sdf <- callJMethod(x@sdf, "dropDuplicates", as.list(colNames)) + function(x, ...) { +cols <- list(...) +if (length(cols) == 0) { + sdf <- callJMethod(x@sdf, "dropDuplicates", as.list(columns(x))) +} else { + if (!all(sapply(cols, function(c) { is.character(c) }))) { +stop("all columns names should be characters") + } + col <- cols[[1]] + if (length(col) > 1) { +sdf <- callJMethod(x@sdf, "dropDuplicates", as.list(col)) + } else { +sdf <- callJMethod(x@sdf, "dropDuplicates", cols) + } +} dataFrame(sdf) }) http://git-wip-us.apache.org/repos/asf/spark/blob/f530331e/R/pkg/R/generics.R -- diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 40a96d8..8164e77 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -466,12 +466,9 @@ setGeneric("describe", function(x, col, ...) { standardGeneric("describe") }) #' @export setGeneric("drop", function(x, ...) { standardGeneric("drop") }) -#' @rdname dropduplicates +#' @rdname dropDuplicates #' @export -setGeneric("dropDuplicates", - function(x, colNames = columns(x)) { - standardGeneric("dropDuplicates") - }) +setGeneric("dropDuplicates", function(x, ...) 
{ standardGeneric("dropDuplicates") }) #' @rdname nafunctions #' @export http://git-wip-us.apache.org/repos/asf/spark/blob/f530331e/R/pkg/inst/tests/testthat/test_sparkSQL.R -- diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index c11930a..11d6936 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -796,6 +796,14 @@ test_that("distinct(), unique() and dropDuplicates() on DataFrames", { result[order(result$key, result$value1, result$value2), ], expected) + result <- collect(dropDuplicates(df, "key", "value1")) + expected <- rbind.data.frame( +c(1, 1, 1), c(1, 2, 1), c(2, 1, 2), c(2, 2, 2)) + names(expected) <- c("key", "value1", "value2"
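For reference, a sketch of the Scala `Dataset` API that the new SparkR varargs form lines up with; the R wrapper ultimately delegates to `Dataset.dropDuplicates` via `callJMethod`. The data frame below is placeholder data, not the fixture from the R test.

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().getOrCreate()
import spark.implicits._

val df = Seq((1, 1, 1), (1, 1, 1), (1, 2, 1), (2, 1, 2), (2, 2, 2))
  .toDF("key", "value1", "value2")

df.dropDuplicates()                      // deduplicate on all columns
df.dropDuplicates(Seq("key", "value1"))  // explicit column list
df.dropDuplicates("key", "value1")       // varargs form (SPARK-15807), mirrored by the new R API
```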
spark git commit: [SPARK-15908][R] Add varargs-type dropDuplicates() function in SparkR
Repository: spark Updated Branches: refs/heads/master 5fd20b66f -> 513a03e41 [SPARK-15908][R] Add varargs-type dropDuplicates() function in SparkR ## What changes were proposed in this pull request? This PR adds varargs-type `dropDuplicates` function to SparkR for API parity. Refer to https://issues.apache.org/jira/browse/SPARK-15807, too. ## How was this patch tested? Pass the Jenkins tests with new testcases. Author: Dongjoon Hyun Closes #13684 from dongjoon-hyun/SPARK-15908. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/513a03e4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/513a03e4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/513a03e4 Branch: refs/heads/master Commit: 513a03e41e27d9c5f70911faccc5d3aecd8bdde9 Parents: 5fd20b6 Author: Dongjoon Hyun Authored: Thu Jun 16 20:35:17 2016 -0700 Committer: Shivaram Venkataraman Committed: Thu Jun 16 20:35:17 2016 -0700 -- R/pkg/R/DataFrame.R | 25 +++-- R/pkg/R/generics.R| 7 ++- R/pkg/inst/tests/testthat/test_sparkSQL.R | 8 3 files changed, 29 insertions(+), 11 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/513a03e4/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index d72cbbd..c710bff 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -1936,10 +1936,11 @@ setMethod("where", #' the subset of columns. #' #' @param x A SparkDataFrame. -#' @param colnames A character vector of column names. +#' @param ... A character vector of column names or string column names. +#'If the first argument contains a character vector, the followings are ignored. #' @return A SparkDataFrame with duplicate rows removed. #' @family SparkDataFrame functions -#' @rdname dropduplicates +#' @rdname dropDuplicates #' @name dropDuplicates #' @export #' @examples @@ -1949,14 +1950,26 @@ setMethod("where", #' path <- "path/to/file.json" #' df <- read.json(path) #' dropDuplicates(df) +#' dropDuplicates(df, "col1", "col2") #' dropDuplicates(df, c("col1", "col2")) #' } setMethod("dropDuplicates", signature(x = "SparkDataFrame"), - function(x, colNames = columns(x)) { -stopifnot(class(colNames) == "character") - -sdf <- callJMethod(x@sdf, "dropDuplicates", as.list(colNames)) + function(x, ...) { +cols <- list(...) +if (length(cols) == 0) { + sdf <- callJMethod(x@sdf, "dropDuplicates", as.list(columns(x))) +} else { + if (!all(sapply(cols, function(c) { is.character(c) }))) { +stop("all columns names should be characters") + } + col <- cols[[1]] + if (length(col) > 1) { +sdf <- callJMethod(x@sdf, "dropDuplicates", as.list(col)) + } else { +sdf <- callJMethod(x@sdf, "dropDuplicates", cols) + } +} dataFrame(sdf) }) http://git-wip-us.apache.org/repos/asf/spark/blob/513a03e4/R/pkg/R/generics.R -- diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 40a96d8..8164e77 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -466,12 +466,9 @@ setGeneric("describe", function(x, col, ...) { standardGeneric("describe") }) #' @export setGeneric("drop", function(x, ...) { standardGeneric("drop") }) -#' @rdname dropduplicates +#' @rdname dropDuplicates #' @export -setGeneric("dropDuplicates", - function(x, colNames = columns(x)) { - standardGeneric("dropDuplicates") - }) +setGeneric("dropDuplicates", function(x, ...) 
{ standardGeneric("dropDuplicates") }) #' @rdname nafunctions #' @export http://git-wip-us.apache.org/repos/asf/spark/blob/513a03e4/R/pkg/inst/tests/testthat/test_sparkSQL.R -- diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index c11930a..11d6936 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -796,6 +796,14 @@ test_that("distinct(), unique() and dropDuplicates() on DataFrames", { result[order(result$key, result$value1, result$value2), ], expected) + result <- collect(dropDuplicates(df, "key", "value1")) + expected <- rbind.data.frame( +c(1, 1, 1), c(1, 2, 1), c(2, 1, 2), c(2, 2, 2)) + names(expected) <- c("key", "value1", "value2") + expect_equivalent( +result[order(result$key, result$value1, result$value2), ], +expected) + result <
spark git commit: [SPARK-15490][R][DOC] SparkR 2.0 QA: New R APIs and API docs for non-MLib changes
Repository: spark Updated Branches: refs/heads/branch-2.0 feaba979b -> 2127f99f2 [SPARK-15490][R][DOC] SparkR 2.0 QA: New R APIs and API docs for non-MLib changes ## What changes were proposed in this pull request? R Docs changes include typos, format, layout. ## How was this patch tested? Test locally. Author: Kai Jiang Closes #13394 from vectorijk/spark-15490. (cherry picked from commit 5fd20b66ffe18c05cf257af7f30d32464d2fe8e7) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2127f99f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2127f99f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2127f99f Branch: refs/heads/branch-2.0 Commit: 2127f99f2c8cf6d3f85e6408ce47b82e0c3cad4d Parents: feaba97 Author: Kai Jiang Authored: Thu Jun 16 19:39:33 2016 -0700 Committer: Joseph K. Bradley Committed: Thu Jun 16 19:39:43 2016 -0700 -- R/pkg/R/DataFrame.R | 91 ++- R/pkg/R/RDD.R| 14 R/pkg/R/WindowSpec.R | 7 ++-- R/pkg/R/broadcast.R | 8 +++-- R/pkg/R/column.R | 6 ++-- R/pkg/R/context.R| 41 ++--- R/pkg/R/functions.R | 2 +- R/pkg/R/group.R | 6 ++-- R/pkg/R/mllib.R | 34 -- R/pkg/R/utils.R | 2 ++ 10 files changed, 123 insertions(+), 88 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2127f99f/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 9a9b3f7..d72cbbd 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -23,9 +23,11 @@ NULL setOldClass("jobj") setOldClass("structType") -#' @title S4 class that represents a SparkDataFrame -#' @description DataFrames can be created using functions like \link{createDataFrame}, -#' \link{read.json}, \link{table} etc. +#' S4 class that represents a SparkDataFrame +#' +#' DataFrames can be created using functions like \link{createDataFrame}, +#' \link{read.json}, \link{table} etc. +#' #' @family SparkDataFrame functions #' @rdname SparkDataFrame #' @docType class @@ -629,8 +631,6 @@ setMethod("repartition", #' #' @param x A SparkDataFrame #' @return A StringRRDD of JSON objects -#' @family SparkDataFrame functions -#' @rdname tojson #' @noRd #' @examples #'\dontrun{ @@ -648,7 +648,7 @@ setMethod("toJSON", RDD(jrdd, serializedMode = "string") }) -#' write.json +#' Save the contents of SparkDataFrame as a JSON file #' #' Save the contents of a SparkDataFrame as a JSON file (one object per line). Files written out #' with this method can be read back in as a SparkDataFrame using read.json(). @@ -675,7 +675,7 @@ setMethod("write.json", invisible(callJMethod(write, "json", path)) }) -#' write.parquet +#' Save the contents of SparkDataFrame as a Parquet file, preserving the schema. #' #' Save the contents of a SparkDataFrame as a Parquet file, preserving the schema. Files written out #' with this method can be read back in as a SparkDataFrame using read.parquet(). @@ -713,9 +713,9 @@ setMethod("saveAsParquetFile", write.parquet(x, path) }) -#' write.text +#' Save the content of SparkDataFrame in a text file at the specified path. #' -#' Saves the content of the SparkDataFrame in a text file at the specified path. +#' Save the content of the SparkDataFrame in a text file at the specified path. #' The SparkDataFrame must have only one column of string type with the name "value". #' Each row becomes a new line in the output file. 
#' @@ -820,8 +820,6 @@ setMethod("sample_frac", sample(x, withReplacement, fraction, seed) }) -#' nrow -#' #' Returns the number of rows in a SparkDataFrame #' #' @param x A SparkDataFrame @@ -874,6 +872,8 @@ setMethod("ncol", length(columns(x)) }) +#' Returns the dimensions of SparkDataFrame +#' #' Returns the dimensions (number of rows and columns) of a SparkDataFrame #' @param x a SparkDataFrame #' @@ -2012,8 +2012,9 @@ setMethod("join", dataFrame(sdf) }) +#' Merges two data frames +#' #' @name merge -#' @title Merges two data frames #' @param x the first data frame to be joined #' @param y the second data frame to be joined #' @param by a character vector specifying the join columns. If by is not @@ -2127,7 +2128,6 @@ setMethod("merge", joinRes }) -#' #' Creates a list of columns by replacing the intersected ones with aliases. #' The name of the alias column is formed by concatanating the original column name and a suffix. #' @@ -2182,8 +2182,9 @@ setMethod("unionAll", dataFrame(unioned)
spark git commit: [SPARK-15490][R][DOC] SparkR 2.0 QA: New R APIs and API docs for non-MLib changes
Repository: spark Updated Branches: refs/heads/master 63470afc9 -> 5fd20b66f [SPARK-15490][R][DOC] SparkR 2.0 QA: New R APIs and API docs for non-MLib changes ## What changes were proposed in this pull request? R Docs changes include typos, format, layout. ## How was this patch tested? Test locally. Author: Kai Jiang Closes #13394 from vectorijk/spark-15490. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5fd20b66 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5fd20b66 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5fd20b66 Branch: refs/heads/master Commit: 5fd20b66ffe18c05cf257af7f30d32464d2fe8e7 Parents: 63470af Author: Kai Jiang Authored: Thu Jun 16 19:39:33 2016 -0700 Committer: Joseph K. Bradley Committed: Thu Jun 16 19:39:33 2016 -0700 -- R/pkg/R/DataFrame.R | 91 ++- R/pkg/R/RDD.R| 14 R/pkg/R/WindowSpec.R | 7 ++-- R/pkg/R/broadcast.R | 8 +++-- R/pkg/R/column.R | 6 ++-- R/pkg/R/context.R| 41 ++--- R/pkg/R/functions.R | 2 +- R/pkg/R/group.R | 6 ++-- R/pkg/R/mllib.R | 34 -- R/pkg/R/utils.R | 2 ++ 10 files changed, 123 insertions(+), 88 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5fd20b66/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 9a9b3f7..d72cbbd 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -23,9 +23,11 @@ NULL setOldClass("jobj") setOldClass("structType") -#' @title S4 class that represents a SparkDataFrame -#' @description DataFrames can be created using functions like \link{createDataFrame}, -#' \link{read.json}, \link{table} etc. +#' S4 class that represents a SparkDataFrame +#' +#' DataFrames can be created using functions like \link{createDataFrame}, +#' \link{read.json}, \link{table} etc. +#' #' @family SparkDataFrame functions #' @rdname SparkDataFrame #' @docType class @@ -629,8 +631,6 @@ setMethod("repartition", #' #' @param x A SparkDataFrame #' @return A StringRRDD of JSON objects -#' @family SparkDataFrame functions -#' @rdname tojson #' @noRd #' @examples #'\dontrun{ @@ -648,7 +648,7 @@ setMethod("toJSON", RDD(jrdd, serializedMode = "string") }) -#' write.json +#' Save the contents of SparkDataFrame as a JSON file #' #' Save the contents of a SparkDataFrame as a JSON file (one object per line). Files written out #' with this method can be read back in as a SparkDataFrame using read.json(). @@ -675,7 +675,7 @@ setMethod("write.json", invisible(callJMethod(write, "json", path)) }) -#' write.parquet +#' Save the contents of SparkDataFrame as a Parquet file, preserving the schema. #' #' Save the contents of a SparkDataFrame as a Parquet file, preserving the schema. Files written out #' with this method can be read back in as a SparkDataFrame using read.parquet(). @@ -713,9 +713,9 @@ setMethod("saveAsParquetFile", write.parquet(x, path) }) -#' write.text +#' Save the content of SparkDataFrame in a text file at the specified path. #' -#' Saves the content of the SparkDataFrame in a text file at the specified path. +#' Save the content of the SparkDataFrame in a text file at the specified path. #' The SparkDataFrame must have only one column of string type with the name "value". #' Each row becomes a new line in the output file. 
#' @@ -820,8 +820,6 @@ setMethod("sample_frac", sample(x, withReplacement, fraction, seed) }) -#' nrow -#' #' Returns the number of rows in a SparkDataFrame #' #' @param x A SparkDataFrame @@ -874,6 +872,8 @@ setMethod("ncol", length(columns(x)) }) +#' Returns the dimensions of SparkDataFrame +#' #' Returns the dimensions (number of rows and columns) of a SparkDataFrame #' @param x a SparkDataFrame #' @@ -2012,8 +2012,9 @@ setMethod("join", dataFrame(sdf) }) +#' Merges two data frames +#' #' @name merge -#' @title Merges two data frames #' @param x the first data frame to be joined #' @param y the second data frame to be joined #' @param by a character vector specifying the join columns. If by is not @@ -2127,7 +2128,6 @@ setMethod("merge", joinRes }) -#' #' Creates a list of columns by replacing the intersected ones with aliases. #' The name of the alias column is formed by concatanating the original column name and a suffix. #' @@ -2182,8 +2182,9 @@ setMethod("unionAll", dataFrame(unioned) }) -#' @title Union two or more SparkDataFrames -#' @description Returns a new SparkDataFrame containing r
spark git commit: [SPARK-15782][YARN] Fix spark.jars and spark.yarn.dist.jars handling
Repository: spark Updated Branches: refs/heads/master f1bf0d2f3 -> 63470afc9 [SPARK-15782][YARN] Fix spark.jars and spark.yarn.dist.jars handling When `--packages` is specified with spark-shell the classes from those packages cannot be found, which I think is due to some of the changes in SPARK-12343. Tested manually with both scala 2.10 and 2.11 repls. vanzin davies can you guys please review? Author: Marcelo Vanzin Author: Nezih Yigitbasi Closes #13709 from nezihyigitbasi/SPARK-15782. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/63470afc Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/63470afc Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/63470afc Branch: refs/heads/master Commit: 63470afc997fb9d6b6f8a911c25964743556c9cc Parents: f1bf0d2 Author: Nezih Yigitbasi Authored: Thu Jun 16 18:19:29 2016 -0700 Committer: Marcelo Vanzin Committed: Thu Jun 16 18:20:16 2016 -0700 -- .../scala/org/apache/spark/SparkContext.scala | 2 +- .../scala/org/apache/spark/util/Utils.scala | 25 +++ .../apache/spark/deploy/SparkSubmitSuite.scala | 12 .../org/apache/spark/repl/SparkILoop.scala | 32 .../main/scala/org/apache/spark/repl/Main.scala | 4 +-- 5 files changed, 59 insertions(+), 16 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/63470afc/core/src/main/scala/org/apache/spark/SparkContext.scala -- diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index d56946e..d870181 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -391,7 +391,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli _conf.set("spark.executor.id", SparkContext.DRIVER_IDENTIFIER) -_jars = _conf.getOption("spark.jars").map(_.split(",")).map(_.filter(_.nonEmpty)).toSeq.flatten +_jars = Utils.getUserJars(_conf) _files = _conf.getOption("spark.files").map(_.split(",")).map(_.filter(_.nonEmpty)) .toSeq.flatten http://git-wip-us.apache.org/repos/asf/spark/blob/63470afc/core/src/main/scala/org/apache/spark/util/Utils.scala -- diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index f9d0540..17d193b 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -2352,6 +2352,31 @@ private[spark] object Utils extends Logging { log.info(s"Started daemon with process name: ${Utils.getProcessName()}") SignalUtils.registerLogger(log) } + + /** + * Unions two comma-separated lists of files and filters out empty strings. + */ + def unionFileLists(leftList: Option[String], rightList: Option[String]): Set[String] = { +var allFiles = Set[String]() +leftList.foreach { value => allFiles ++= value.split(",") } +rightList.foreach { value => allFiles ++= value.split(",") } +allFiles.filter { _.nonEmpty } + } + + /** + * In YARN mode this method returns a union of the jar files pointed by "spark.jars" and the + * "spark.yarn.dist.jars" properties, while in other modes it returns the jar files pointed by + * only the "spark.jars" property. 
+ */ + def getUserJars(conf: SparkConf): Seq[String] = { +val sparkJars = conf.getOption("spark.jars") +if (conf.get("spark.master") == "yarn") { + val yarnJars = conf.getOption("spark.yarn.dist.jars") + unionFileLists(sparkJars, yarnJars).toSeq +} else { + sparkJars.map(_.split(",")).map(_.filter(_.nonEmpty)).toSeq.flatten +} + } } /** http://git-wip-us.apache.org/repos/asf/spark/blob/63470afc/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala index 2718976..0b02059 100644 --- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala @@ -570,6 +570,18 @@ class SparkSubmitSuite appArgs.executorMemory should be ("2.3g") } } + + test("comma separated list of files are unioned correctly") { +val left = Option("/tmp/a.jar,/tmp/b.jar") +val right = Option("/tmp/c.jar,/tmp/a.jar") +val emptyString = Option("") +Utils.unionFileLists(left, right) should be (Set("/tmp/a.jar", "/tmp/b.jar", "/tmp/c.jar")) +
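A standalone restatement of the new `Utils.unionFileLists` semantics, shown here for illustration; the real helper lives in the `private[spark]` `org.apache.spark.util.Utils` object, so it is re-declared below rather than imported.

```scala
// Both comma-separated lists are split, unioned, and empty entries are dropped.
def unionFileLists(leftList: Option[String], rightList: Option[String]): Set[String] = {
  var allFiles = Set[String]()
  leftList.foreach { value => allFiles ++= value.split(",") }
  rightList.foreach { value => allFiles ++= value.split(",") }
  allFiles.filter(_.nonEmpty)
}

// Mirrors the new SparkSubmitSuite test above.
assert(unionFileLists(Some("/tmp/a.jar,/tmp/b.jar"), Some("/tmp/c.jar,/tmp/a.jar"))
  == Set("/tmp/a.jar", "/tmp/b.jar", "/tmp/c.jar"))
```

In YARN mode `getUserJars` now returns this union of `spark.jars` and `spark.yarn.dist.jars`; in every other mode only `spark.jars` is used.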
spark git commit: [SPARK-15782][YARN] Fix spark.jars and spark.yarn.dist.jars handling
Repository: spark Updated Branches: refs/heads/branch-2.0 68e7a25cc -> feaba979b [SPARK-15782][YARN] Fix spark.jars and spark.yarn.dist.jars handling When `--packages` is specified with spark-shell the classes from those packages cannot be found, which I think is due to some of the changes in SPARK-12343. Tested manually with both scala 2.10 and 2.11 repls. vanzin davies can you guys please review? Author: Marcelo Vanzin Author: Nezih Yigitbasi Closes #13709 from nezihyigitbasi/SPARK-15782. (cherry picked from commit 63470afc997fb9d6b6f8a911c25964743556c9cc) Signed-off-by: Marcelo Vanzin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/feaba979 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/feaba979 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/feaba979 Branch: refs/heads/branch-2.0 Commit: feaba979b30b27f661ae44ae3f12eabc3a6e55b3 Parents: 68e7a25 Author: Nezih Yigitbasi Authored: Thu Jun 16 18:19:29 2016 -0700 Committer: Marcelo Vanzin Committed: Thu Jun 16 18:20:42 2016 -0700 -- .../scala/org/apache/spark/SparkContext.scala | 2 +- .../scala/org/apache/spark/util/Utils.scala | 25 +++ .../apache/spark/deploy/SparkSubmitSuite.scala | 12 .../org/apache/spark/repl/SparkILoop.scala | 32 .../main/scala/org/apache/spark/repl/Main.scala | 4 +-- 5 files changed, 59 insertions(+), 16 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/feaba979/core/src/main/scala/org/apache/spark/SparkContext.scala -- diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index d56946e..d870181 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -391,7 +391,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli _conf.set("spark.executor.id", SparkContext.DRIVER_IDENTIFIER) -_jars = _conf.getOption("spark.jars").map(_.split(",")).map(_.filter(_.nonEmpty)).toSeq.flatten +_jars = Utils.getUserJars(_conf) _files = _conf.getOption("spark.files").map(_.split(",")).map(_.filter(_.nonEmpty)) .toSeq.flatten http://git-wip-us.apache.org/repos/asf/spark/blob/feaba979/core/src/main/scala/org/apache/spark/util/Utils.scala -- diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index f9d0540..17d193b 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -2352,6 +2352,31 @@ private[spark] object Utils extends Logging { log.info(s"Started daemon with process name: ${Utils.getProcessName()}") SignalUtils.registerLogger(log) } + + /** + * Unions two comma-separated lists of files and filters out empty strings. + */ + def unionFileLists(leftList: Option[String], rightList: Option[String]): Set[String] = { +var allFiles = Set[String]() +leftList.foreach { value => allFiles ++= value.split(",") } +rightList.foreach { value => allFiles ++= value.split(",") } +allFiles.filter { _.nonEmpty } + } + + /** + * In YARN mode this method returns a union of the jar files pointed by "spark.jars" and the + * "spark.yarn.dist.jars" properties, while in other modes it returns the jar files pointed by + * only the "spark.jars" property. 
+ */ + def getUserJars(conf: SparkConf): Seq[String] = { +val sparkJars = conf.getOption("spark.jars") +if (conf.get("spark.master") == "yarn") { + val yarnJars = conf.getOption("spark.yarn.dist.jars") + unionFileLists(sparkJars, yarnJars).toSeq +} else { + sparkJars.map(_.split(",")).map(_.filter(_.nonEmpty)).toSeq.flatten +} + } } /** http://git-wip-us.apache.org/repos/asf/spark/blob/feaba979/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala index 2718976..0b02059 100644 --- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala @@ -570,6 +570,18 @@ class SparkSubmitSuite appArgs.executorMemory should be ("2.3g") } } + + test("comma separated list of files are unioned correctly") { +val left = Option("/tmp/a.jar,/tmp/b.jar") +val right = Option("/tmp/c.jar,/tmp/a.jar") +val emptyString = Opt
spark git commit: [SPARK-15966][DOC] Add closing tag to fix rendering issue for Spark monitoring
Repository: spark Updated Branches: refs/heads/master 9040d83bc -> f1bf0d2f3 [SPARK-15966][DOC] Add closing tag to fix rendering issue for Spark monitoring ## What changes were proposed in this pull request? Adds the missing closing tag for spark.ui.view.acls.groups ## How was this patch tested? I built the docs locally and verified the change in a browser. (If this patch involves UI changes, please attach a screenshot; otherwise, remove this) **Before:** ![image](https://cloud.githubusercontent.com/assets/7732317/16135005/49fc0724-33e6-11e6-9390-98711593fa5b.png) **After:** ![image](https://cloud.githubusercontent.com/assets/7732317/16135021/62b5c4a8-33e6-11e6-8118-b22fda5c66eb.png) Author: Dhruve Ashar Closes #13719 from dhruve/doc/SPARK-15966. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f1bf0d2f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f1bf0d2f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f1bf0d2f Branch: refs/heads/master Commit: f1bf0d2f3a61d81686f36763e83d3be89c98435f Parents: 9040d83 Author: Dhruve Ashar Authored: Thu Jun 16 16:44:54 2016 -0700 Committer: Andrew Or Committed: Thu Jun 16 17:46:19 2016 -0700 -- docs/monitoring.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f1bf0d2f/docs/monitoring.md -- diff --git a/docs/monitoring.md b/docs/monitoring.md index 78a3470..fa6c899 100644 --- a/docs/monitoring.md +++ b/docs/monitoring.md @@ -157,7 +157,7 @@ The history server can be configured as follows: If enabled, access control checks are made regardless of what the individual application had set for spark.ui.acls.enable when the application was run. The application owner will always have authorization to view their own application and any users specified via - spark.ui.view.acls and groups specified via spark.ui.view.acls.groups + spark.ui.view.acls and groups specified via spark.ui.view.acls.groups when the application was run will also have authorization to view that application. If disabled, no access control checks are made.
spark git commit: [SPARK-15608][ML][EXAMPLES][DOC] add examples and documents of ml.isotonic regression
Repository: spark Updated Branches: refs/heads/branch-2.0 b3678eb7e -> 68e7a25cc [SPARK-15608][ML][EXAMPLES][DOC] add examples and documents of ml.isotonic regression ## What changes were proposed in this pull request? add ml doc for ml isotonic regression add scala example for ml isotonic regression add java example for ml isotonic regression add python example for ml isotonic regression modify scala example for mllib isotonic regression modify java example for mllib isotonic regression modify python example for mllib isotonic regression add data/mllib/sample_isotonic_regression_libsvm_data.txt delete data/mllib/sample_isotonic_regression_data.txt ## How was this patch tested? N/A Author: WeichenXu Closes #13381 from WeichenXu123/add_isotonic_regression_doc. (cherry picked from commit 9040d83bc2cdce06dab0e1bdee4f796da9a9a55c) Signed-off-by: Yanbo Liang Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/68e7a25c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/68e7a25c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/68e7a25c Branch: refs/heads/branch-2.0 Commit: 68e7a25cc06cbfe357be8d224c117abaa7ba94f4 Parents: b3678eb Author: WeichenXu Authored: Thu Jun 16 17:35:40 2016 -0700 Committer: Yanbo Liang Committed: Thu Jun 16 17:35:51 2016 -0700 -- data/mllib/sample_isotonic_regression_data.txt | 100 --- .../sample_isotonic_regression_libsvm_data.txt | 100 +++ docs/ml-classification-regression.md| 70 + .../ml/JavaIsotonicRegressionExample.java | 62 .../mllib/JavaIsotonicRegressionExample.java| 19 ++-- .../python/ml/isotonic_regression_example.py| 54 ++ .../python/mllib/isotonic_regression_example.py | 11 +- .../examples/ml/IsotonicRegressionExample.scala | 62 .../mllib/IsotonicRegressionExample.scala | 9 +- 9 files changed, 373 insertions(+), 114 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/68e7a25c/data/mllib/sample_isotonic_regression_data.txt -- diff --git a/data/mllib/sample_isotonic_regression_data.txt b/data/mllib/sample_isotonic_regression_data.txt deleted file mode 100644 index d257b50..000 --- a/data/mllib/sample_isotonic_regression_data.txt +++ /dev/null @@ -1,100 +0,0 @@ -0.24579296,0.01 -0.28505864,0.02 -0.31208567,0.03 -0.35900051,0.04 -0.35747068,0.05 -0.16675166,0.06 -0.17491076,0.07 -0.04181540,0.08 -0.04793473,0.09 -0.03926568,0.10 -0.12952575,0.11 -0.,0.12 -0.01376849,0.13 -0.13105558,0.14 -0.08873024,0.15 -0.12595614,0.16 -0.15247323,0.17 -0.25956145,0.18 -0.20040796,0.19 -0.19581846,0.20 -0.15757267,0.21 -0.13717491,0.22 -0.19020908,0.23 -0.19581846,0.24 -0.20091790,0.25 -0.16879143,0.26 -0.18510964,0.27 -0.20040796,0.28 -0.29576747,0.29 -0.43396226,0.30 -0.53391127,0.31 -0.52116267,0.32 -0.48546660,0.33 -0.49209587,0.34 -0.54156043,0.35 -0.59765426,0.36 -0.56144824,0.37 -0.58592555,0.38 -0.52983172,0.39 -0.50178480,0.40 -0.52626211,0.41 -0.58286588,0.42 -0.64660887,0.43 -0.68077511,0.44 -0.74298827,0.45 -0.64864865,0.46 -0.67261601,0.47 -0.65782764,0.48 -0.69811321,0.49 -0.63029067,0.50 -0.61601224,0.51 -0.63233044,0.52 -0.65323814,0.53 -0.65323814,0.54 -0.67363590,0.55 -0.67006629,0.56 -0.51555329,0.57 -0.50892402,0.58 -0.33299337,0.59 -0.36206017,0.60 -0.43090260,0.61 -0.45996940,0.62 -0.56348802,0.63 -0.54920959,0.64 -0.48393677,0.65 -0.48495665,0.66 -0.46965834,0.67 -0.45181030,0.68 -0.45843957,0.69 -0.47118817,0.70 -0.51555329,0.71 -0.58031617,0.72 -0.55481897,0.73 -0.56297807,0.74 -0.56603774,0.75 -0.57929628,0.76 -0.64762876,0.77 -0.66241713,0.78 -0.69301377,0.79 
-0.65119837,0.80 -0.68332483,0.81 -0.66598674,0.82 -0.73890872,0.83 -0.73992861,0.84 -0.84242733,0.85 -0.91330954,0.86 -0.88016318,0.87 -0.90719021,0.88 -0.93115757,0.89 -0.93115757,0.90 -0.91942886,0.91 -0.92911780,0.92 -0.95665477,0.93 -0.95002550,0.94 -0.96940337,0.95 -1.,0.96 -0.89801122,0.97 -0.90311066,0.98 -0.90362060,0.99 -0.83477817,1.0 \ No newline at end of file http://git-wip-us.apache.org/repos/asf/spark/blob/68e7a25c/data/mllib/sample_isotonic_regression_libsvm_data.txt -- diff --git a/data/mllib/sample_isotonic_regression_libsvm_data.txt b/data/mllib/sample_isotonic_regression_libsvm_data.txt new file mode 100644 index 000..f39fe02 --- /dev/null +++ b/data/mllib/sample_isotonic_regression_libsvm_data.txt @@ -0,0 +1,100 @@ +0.24579296 1:0.01 +0.28505864 1:0.02 +0.31208567 1:0.03 +0.35900051 1:0.04 +0.35747068 1:0.05 +0.16675166 1:0.06 +0.17491076 1:0.07 +0.04181540 1:0.08 +0.04793473 1:0.09 +0.03926568 1:0.10 +0.12952575 1:0.11 +0. 1:0.12 +0.01376849 1:0.13
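A condensed Scala sketch in the spirit of the new `ml.IsotonicRegressionExample` added by this commit (not the example file itself), run against the libsvm-format data file that replaces the old comma-separated one.

```scala
import org.apache.spark.ml.regression.IsotonicRegression
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("IsotonicRegressionSketch").getOrCreate()

// Each line of the new file is a label followed by a single 1-based feature.
val dataset = spark.read.format("libsvm")
  .load("data/mllib/sample_isotonic_regression_libsvm_data.txt")

val model = new IsotonicRegression().fit(dataset)
println(s"Boundaries in increasing order: ${model.boundaries}")
println(s"Predictions associated with the boundaries: ${model.predictions}")

// Predict labels for the training data and show the result.
model.transform(dataset).show()

spark.stop()
```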
spark git commit: [SPARK-15608][ML][EXAMPLES][DOC] add examples and documents of ml.isotonic regression
Repository: spark Updated Branches: refs/heads/master d9c6628c4 -> 9040d83bc [SPARK-15608][ML][EXAMPLES][DOC] add examples and documents of ml.isotonic regression ## What changes were proposed in this pull request? add ml doc for ml isotonic regression add scala example for ml isotonic regression add java example for ml isotonic regression add python example for ml isotonic regression modify scala example for mllib isotonic regression modify java example for mllib isotonic regression modify python example for mllib isotonic regression add data/mllib/sample_isotonic_regression_libsvm_data.txt delete data/mllib/sample_isotonic_regression_data.txt ## How was this patch tested? N/A Author: WeichenXu Closes #13381 from WeichenXu123/add_isotonic_regression_doc. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9040d83b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9040d83b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9040d83b Branch: refs/heads/master Commit: 9040d83bc2cdce06dab0e1bdee4f796da9a9a55c Parents: d9c6628 Author: WeichenXu Authored: Thu Jun 16 17:35:40 2016 -0700 Committer: Yanbo Liang Committed: Thu Jun 16 17:35:40 2016 -0700 -- data/mllib/sample_isotonic_regression_data.txt | 100 --- .../sample_isotonic_regression_libsvm_data.txt | 100 +++ docs/ml-classification-regression.md| 70 + .../ml/JavaIsotonicRegressionExample.java | 62 .../mllib/JavaIsotonicRegressionExample.java| 19 ++-- .../python/ml/isotonic_regression_example.py| 54 ++ .../python/mllib/isotonic_regression_example.py | 11 +- .../examples/ml/IsotonicRegressionExample.scala | 62 .../mllib/IsotonicRegressionExample.scala | 9 +- 9 files changed, 373 insertions(+), 114 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9040d83b/data/mllib/sample_isotonic_regression_data.txt -- diff --git a/data/mllib/sample_isotonic_regression_data.txt b/data/mllib/sample_isotonic_regression_data.txt deleted file mode 100644 index d257b50..000 --- a/data/mllib/sample_isotonic_regression_data.txt +++ /dev/null @@ -1,100 +0,0 @@ -0.24579296,0.01 -0.28505864,0.02 -0.31208567,0.03 -0.35900051,0.04 -0.35747068,0.05 -0.16675166,0.06 -0.17491076,0.07 -0.04181540,0.08 -0.04793473,0.09 -0.03926568,0.10 -0.12952575,0.11 -0.,0.12 -0.01376849,0.13 -0.13105558,0.14 -0.08873024,0.15 -0.12595614,0.16 -0.15247323,0.17 -0.25956145,0.18 -0.20040796,0.19 -0.19581846,0.20 -0.15757267,0.21 -0.13717491,0.22 -0.19020908,0.23 -0.19581846,0.24 -0.20091790,0.25 -0.16879143,0.26 -0.18510964,0.27 -0.20040796,0.28 -0.29576747,0.29 -0.43396226,0.30 -0.53391127,0.31 -0.52116267,0.32 -0.48546660,0.33 -0.49209587,0.34 -0.54156043,0.35 -0.59765426,0.36 -0.56144824,0.37 -0.58592555,0.38 -0.52983172,0.39 -0.50178480,0.40 -0.52626211,0.41 -0.58286588,0.42 -0.64660887,0.43 -0.68077511,0.44 -0.74298827,0.45 -0.64864865,0.46 -0.67261601,0.47 -0.65782764,0.48 -0.69811321,0.49 -0.63029067,0.50 -0.61601224,0.51 -0.63233044,0.52 -0.65323814,0.53 -0.65323814,0.54 -0.67363590,0.55 -0.67006629,0.56 -0.51555329,0.57 -0.50892402,0.58 -0.33299337,0.59 -0.36206017,0.60 -0.43090260,0.61 -0.45996940,0.62 -0.56348802,0.63 -0.54920959,0.64 -0.48393677,0.65 -0.48495665,0.66 -0.46965834,0.67 -0.45181030,0.68 -0.45843957,0.69 -0.47118817,0.70 -0.51555329,0.71 -0.58031617,0.72 -0.55481897,0.73 -0.56297807,0.74 -0.56603774,0.75 -0.57929628,0.76 -0.64762876,0.77 -0.66241713,0.78 -0.69301377,0.79 -0.65119837,0.80 -0.68332483,0.81 -0.66598674,0.82 -0.73890872,0.83 -0.73992861,0.84 -0.84242733,0.85 
-0.91330954,0.86 -0.88016318,0.87 -0.90719021,0.88 -0.93115757,0.89 -0.93115757,0.90 -0.91942886,0.91 -0.92911780,0.92 -0.95665477,0.93 -0.95002550,0.94 -0.96940337,0.95 -1.,0.96 -0.89801122,0.97 -0.90311066,0.98 -0.90362060,0.99 -0.83477817,1.0 \ No newline at end of file http://git-wip-us.apache.org/repos/asf/spark/blob/9040d83b/data/mllib/sample_isotonic_regression_libsvm_data.txt -- diff --git a/data/mllib/sample_isotonic_regression_libsvm_data.txt b/data/mllib/sample_isotonic_regression_libsvm_data.txt new file mode 100644 index 000..f39fe02 --- /dev/null +++ b/data/mllib/sample_isotonic_regression_libsvm_data.txt @@ -0,0 +1,100 @@ +0.24579296 1:0.01 +0.28505864 1:0.02 +0.31208567 1:0.03 +0.35900051 1:0.04 +0.35747068 1:0.05 +0.16675166 1:0.06 +0.17491076 1:0.07 +0.04181540 1:0.08 +0.04793473 1:0.09 +0.03926568 1:0.10 +0.12952575 1:0.11 +0. 1:0.12 +0.01376849 1:0.13 +0.13105558 1:0.14 +0.08873024 1:0.15 +0.12595614 1:0.16 +0.15247323 1:0.17 +0.25956145 1:0.18 +0.20040796
spark git commit: [SPARK-15991] SparkContext.hadoopConfiguration should be always the base of hadoop conf created by SessionState
Repository: spark Updated Branches: refs/heads/branch-2.0 8f7138859 -> b3678eb7e [SPARK-15991] SparkContext.hadoopConfiguration should be always the base of hadoop conf created by SessionState ## What changes were proposed in this pull request? Before this patch, after a SparkSession has been created, hadoop conf set directly to SparkContext.hadoopConfiguration will not affect the hadoop conf created by SessionState. This patch makes the change to always use SparkContext.hadoopConfiguration as the base. This patch also changes the behavior of hive-site.xml support added in https://github.com/apache/spark/pull/12689/. With this patch, we will load hive-site.xml to SparkContext.hadoopConfiguration. ## How was this patch tested? New test in SparkSessionBuilderSuite. Author: Yin Huai Closes #13711 from yhuai/SPARK-15991. (cherry picked from commit d9c6628c47de547dc537310e3c775c7f3e0e4a12) Signed-off-by: Shixiong Zhu Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b3678eb7 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b3678eb7 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b3678eb7 Branch: refs/heads/branch-2.0 Commit: b3678eb7e4ac6bb08ba8579867944ba42da99b81 Parents: 8f71388 Author: Yin Huai Authored: Thu Jun 16 17:06:24 2016 -0700 Committer: Shixiong Zhu Committed: Thu Jun 16 17:06:30 2016 -0700 -- .../spark/sql/internal/SessionState.scala | 2 +- .../apache/spark/sql/internal/SharedState.scala | 14 -- .../org/apache/spark/sql/SQLQuerySuite.scala| 4 .../spark/sql/SparkSessionBuilderSuite.scala| 20 .../apache/spark/sql/hive/HiveSharedState.scala | 5 +++-- 5 files changed, 28 insertions(+), 17 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b3678eb7/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala index 59efa81..dc95123 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala @@ -49,7 +49,7 @@ private[sql] class SessionState(sparkSession: SparkSession) { lazy val conf: SQLConf = new SQLConf def newHadoopConf(): Configuration = { -val hadoopConf = new Configuration(sparkSession.sharedState.hadoopConf) +val hadoopConf = new Configuration(sparkSession.sparkContext.hadoopConfiguration) conf.getAllConfs.foreach { case (k, v) => if (v ne null) hadoopConf.set(k, v) } hadoopConf } http://git-wip-us.apache.org/repos/asf/spark/blob/b3678eb7/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala index bc349b4..6c43fe3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala @@ -43,23 +43,17 @@ private[sql] class SharedState(val sparkContext: SparkContext) extends Logging { */ val listener: SQLListener = createListenerAndUI(sparkContext) - /** - * The base hadoop configuration which is shared among all spark sessions. It is based on the - * default hadoop configuration of Spark, with custom configurations inside `hive-site.xml`. 
- */ - val hadoopConf: Configuration = { -val conf = new Configuration(sparkContext.hadoopConfiguration) + { val configFile = Utils.getContextOrSparkClassLoader.getResource("hive-site.xml") if (configFile != null) { - conf.addResource(configFile) + sparkContext.hadoopConfiguration.addResource(configFile) } -conf } /** * A catalog that interacts with external systems. */ - lazy val externalCatalog: ExternalCatalog = new InMemoryCatalog(hadoopConf) + lazy val externalCatalog: ExternalCatalog = new InMemoryCatalog(sparkContext.hadoopConfiguration) /** * A classloader used to load all user-added jar. @@ -71,7 +65,7 @@ private[sql] class SharedState(val sparkContext: SparkContext) extends Logging { // Set the Hive metastore warehouse path to the one we use val tempConf = new SQLConf sparkContext.conf.getAll.foreach { case (k, v) => tempConf.setConfString(k, v) } -val hiveWarehouseDir = hadoopConf.get("hive.metastore.warehouse.dir") +val hiveWarehouseDir = sparkContext.hadoopConfiguration.get("hive.metastore.warehouse.dir") if (h
spark git commit: [SPARK-15991] SparkContext.hadoopConfiguration should be always the base of hadoop conf created by SessionState
Repository: spark Updated Branches: refs/heads/master 62d2fa5e9 -> d9c6628c4 [SPARK-15991] SparkContext.hadoopConfiguration should be always the base of hadoop conf created by SessionState ## What changes were proposed in this pull request? Before this patch, after a SparkSession has been created, hadoop conf set directly to SparkContext.hadoopConfiguration will not affect the hadoop conf created by SessionState. This patch makes the change to always use SparkContext.hadoopConfiguration as the base. This patch also changes the behavior of hive-site.xml support added in https://github.com/apache/spark/pull/12689/. With this patch, we will load hive-site.xml to SparkContext.hadoopConfiguration. ## How was this patch tested? New test in SparkSessionBuilderSuite. Author: Yin Huai Closes #13711 from yhuai/SPARK-15991. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d9c6628c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d9c6628c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d9c6628c Branch: refs/heads/master Commit: d9c6628c47de547dc537310e3c775c7f3e0e4a12 Parents: 62d2fa5 Author: Yin Huai Authored: Thu Jun 16 17:06:24 2016 -0700 Committer: Shixiong Zhu Committed: Thu Jun 16 17:06:24 2016 -0700 -- .../spark/sql/internal/SessionState.scala | 2 +- .../apache/spark/sql/internal/SharedState.scala | 14 -- .../org/apache/spark/sql/SQLQuerySuite.scala| 4 .../spark/sql/SparkSessionBuilderSuite.scala| 20 .../apache/spark/sql/hive/HiveSharedState.scala | 5 +++-- 5 files changed, 28 insertions(+), 17 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d9c6628c/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala index 59efa81..dc95123 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala @@ -49,7 +49,7 @@ private[sql] class SessionState(sparkSession: SparkSession) { lazy val conf: SQLConf = new SQLConf def newHadoopConf(): Configuration = { -val hadoopConf = new Configuration(sparkSession.sharedState.hadoopConf) +val hadoopConf = new Configuration(sparkSession.sparkContext.hadoopConfiguration) conf.getAllConfs.foreach { case (k, v) => if (v ne null) hadoopConf.set(k, v) } hadoopConf } http://git-wip-us.apache.org/repos/asf/spark/blob/d9c6628c/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala index bc349b4..6c43fe3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala @@ -43,23 +43,17 @@ private[sql] class SharedState(val sparkContext: SparkContext) extends Logging { */ val listener: SQLListener = createListenerAndUI(sparkContext) - /** - * The base hadoop configuration which is shared among all spark sessions. It is based on the - * default hadoop configuration of Spark, with custom configurations inside `hive-site.xml`. 
- */ - val hadoopConf: Configuration = { -val conf = new Configuration(sparkContext.hadoopConfiguration) + { val configFile = Utils.getContextOrSparkClassLoader.getResource("hive-site.xml") if (configFile != null) { - conf.addResource(configFile) + sparkContext.hadoopConfiguration.addResource(configFile) } -conf } /** * A catalog that interacts with external systems. */ - lazy val externalCatalog: ExternalCatalog = new InMemoryCatalog(hadoopConf) + lazy val externalCatalog: ExternalCatalog = new InMemoryCatalog(sparkContext.hadoopConfiguration) /** * A classloader used to load all user-added jar. @@ -71,7 +65,7 @@ private[sql] class SharedState(val sparkContext: SparkContext) extends Logging { // Set the Hive metastore warehouse path to the one we use val tempConf = new SQLConf sparkContext.conf.getAll.foreach { case (k, v) => tempConf.setConfString(k, v) } -val hiveWarehouseDir = hadoopConf.get("hive.metastore.warehouse.dir") +val hiveWarehouseDir = sparkContext.hadoopConfiguration.get("hive.metastore.warehouse.dir") if (hiveWarehouseDir != null && !tempConf.contains(SQLConf.WAREHOUSE_PATH.key)) { // If hive.metastore.w
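A rough sketch of the behavior this patch guarantees, in the spirit of the new SparkSessionBuilderSuite test (this is not the actual test code): it assumes a local SparkSession, and because `sessionState` is package-private in 2.0, code like this has to live under the `org.apache.spark.sql` package, as the real test does.

```
// Sketch only: after this patch, a value set on SparkContext.hadoopConfiguration
// after the session exists is visible in the Hadoop conf built by SessionState,
// because that conf is now derived from sparkContext.hadoopConfiguration rather
// than a snapshot kept in SharedState. "some.hadoop.key" is a hypothetical key.
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local").appName("spark-15991-sketch").getOrCreate()
spark.sparkContext.hadoopConfiguration.set("some.hadoop.key", "some value")
val sessionHadoopConf = spark.sessionState.newHadoopConf()
assert(sessionHadoopConf.get("some.hadoop.key") == "some value")
spark.stop()
```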
spark git commit: [SPARK-15966][DOC] Add closing tag to fix rendering issue for Spark monitoring
Repository: spark Updated Branches: refs/heads/branch-2.0 2280ad8a3 -> 8f7138859 [SPARK-15966][DOC] Add closing tag to fix rendering issue for Spark monitoring ## What changes were proposed in this pull request? Adds the missing closing tag for spark.ui.view.acls.groups ## How was this patch tested? I built the docs locally and verified the changed in browser. (If this patch involves UI changes, please attach a screenshot; otherwise, remove this) **Before:** ![image](https://cloud.githubusercontent.com/assets/7732317/16135005/49fc0724-33e6-11e6-9390-98711593fa5b.png) **After:** ![image](https://cloud.githubusercontent.com/assets/7732317/16135021/62b5c4a8-33e6-11e6-8118-b22fda5c66eb.png) Author: Dhruve Ashar Closes #13719 from dhruve/doc/SPARK-15966. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8f713885 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8f713885 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8f713885 Branch: refs/heads/branch-2.0 Commit: 8f713885963a410571370faa6b147cd0ada3832b Parents: 2280ad8 Author: Dhruve Ashar Authored: Thu Jun 16 16:44:54 2016 -0700 Committer: Reynold Xin Committed: Thu Jun 16 16:44:54 2016 -0700 -- docs/monitoring.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8f713885/docs/monitoring.md -- diff --git a/docs/monitoring.md b/docs/monitoring.md index 78a3470..fa6c899 100644 --- a/docs/monitoring.md +++ b/docs/monitoring.md @@ -157,7 +157,7 @@ The history server can be configured as follows: If enabled, access control checks are made regardless of what the individual application had set for spark.ui.acls.enable when the application was run. The application owner will always have authorization to view their own application and any users specified via - spark.ui.view.acls and groups specified via spark.ui.view.acls.groups + spark.ui.view.acls and groups specified via spark.ui.view.acls.groups when the application was run will also have authorization to view that application. If disabled, no access control checks are made. - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
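For reference, the two properties described in the fixed table row are set on the application's configuration at submit time; a hedged sketch with placeholder user and group names:

```
// Illustrative values only: grant UI view access to these users and groups
// for the application launched with this conf.
import org.apache.spark.SparkConf

val conf = new SparkConf()
  .setAppName("acl-demo")
  .set("spark.ui.view.acls", "alice,bob")
  .set("spark.ui.view.acls.groups", "data-eng,analytics")
```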
[1/2] spark git commit: Preparing Spark release v1.6.2-rc1
Repository: spark Updated Branches: refs/heads/branch-1.6 b8f380f79 -> 4621fe94b Preparing Spark release v1.6.2-rc1 Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4168d9c9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4168d9c9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4168d9c9 Branch: refs/heads/branch-1.6 Commit: 4168d9c94a9564f6b3e62f5d669acde13a7c7cf6 Parents: b8f380f Author: Patrick Wendell Authored: Thu Jun 16 16:40:19 2016 -0700 Committer: Patrick Wendell Committed: Thu Jun 16 16:40:19 2016 -0700 -- assembly/pom.xml| 2 +- bagel/pom.xml | 2 +- core/pom.xml| 2 +- docker-integration-tests/pom.xml| 2 +- examples/pom.xml| 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka-assembly/pom.xml | 2 +- external/kafka/pom.xml | 2 +- external/mqtt-assembly/pom.xml | 2 +- external/mqtt/pom.xml | 2 +- external/twitter/pom.xml| 2 +- external/zeromq/pom.xml | 2 +- extras/java8-tests/pom.xml | 2 +- extras/kinesis-asl-assembly/pom.xml | 2 +- extras/kinesis-asl/pom.xml | 2 +- extras/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- launcher/pom.xml| 2 +- mllib/pom.xml | 2 +- network/common/pom.xml | 2 +- network/shuffle/pom.xml | 2 +- network/yarn/pom.xml| 2 +- pom.xml | 2 +- repl/pom.xml| 2 +- sql/catalyst/pom.xml| 2 +- sql/core/pom.xml| 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml| 2 +- streaming/pom.xml | 2 +- tags/pom.xml| 2 +- tools/pom.xml | 2 +- unsafe/pom.xml | 2 +- yarn/pom.xml| 2 +- 35 files changed, 35 insertions(+), 35 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4168d9c9/assembly/pom.xml -- diff --git a/assembly/pom.xml b/assembly/pom.xml index 6ec2ca4..438e6ed 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 -1.6.3-SNAPSHOT +1.6.2 ../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/4168d9c9/bagel/pom.xml -- diff --git a/bagel/pom.xml b/bagel/pom.xml index 2d778c5..85be37f 100644 --- a/bagel/pom.xml +++ b/bagel/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 -1.6.3-SNAPSHOT +1.6.2 ../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/4168d9c9/core/pom.xml -- diff --git a/core/pom.xml b/core/pom.xml index a8d7863..15e60a3 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 -1.6.3-SNAPSHOT +1.6.2 ../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/4168d9c9/docker-integration-tests/pom.xml -- diff --git a/docker-integration-tests/pom.xml b/docker-integration-tests/pom.xml index a06e59c..0bc749f 100644 --- a/docker-integration-tests/pom.xml +++ b/docker-integration-tests/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.10 -1.6.3-SNAPSHOT +1.6.2 ../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/4168d9c9/examples/pom.xml -- diff --git a/examples/pom.xml b/examples/pom.xml index 8e9e02e..f771a36 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 -1.6.3-SNAPSHOT +1.6.2 ../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/4168d9c9/external/flume-assembly/pom.xml -- diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index 52c8a91..1ef7e7f 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 -1.6.3-SNAPSHOT +1.6.2 ../../pom.xml 
http://git-wip-us.apache.org/repos/asf/spark/blob/4168d9c9/external/flume-sink/pom.xml -- diff --gi
[2/2] spark git commit: Preparing development version 1.6.3-SNAPSHOT
Preparing development version 1.6.3-SNAPSHOT Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4621fe94 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4621fe94 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4621fe94 Branch: refs/heads/branch-1.6 Commit: 4621fe94b0bdcbb0634e41db926d1d9a98e5014e Parents: 4168d9c Author: Patrick Wendell Authored: Thu Jun 16 16:40:26 2016 -0700 Committer: Patrick Wendell Committed: Thu Jun 16 16:40:26 2016 -0700 -- assembly/pom.xml| 2 +- bagel/pom.xml | 2 +- core/pom.xml| 2 +- docker-integration-tests/pom.xml| 2 +- examples/pom.xml| 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka-assembly/pom.xml | 2 +- external/kafka/pom.xml | 2 +- external/mqtt-assembly/pom.xml | 2 +- external/mqtt/pom.xml | 2 +- external/twitter/pom.xml| 2 +- external/zeromq/pom.xml | 2 +- extras/java8-tests/pom.xml | 2 +- extras/kinesis-asl-assembly/pom.xml | 2 +- extras/kinesis-asl/pom.xml | 2 +- extras/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- launcher/pom.xml| 2 +- mllib/pom.xml | 2 +- network/common/pom.xml | 2 +- network/shuffle/pom.xml | 2 +- network/yarn/pom.xml| 2 +- pom.xml | 2 +- repl/pom.xml| 2 +- sql/catalyst/pom.xml| 2 +- sql/core/pom.xml| 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml| 2 +- streaming/pom.xml | 2 +- tags/pom.xml| 2 +- tools/pom.xml | 2 +- unsafe/pom.xml | 2 +- yarn/pom.xml| 2 +- 35 files changed, 35 insertions(+), 35 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4621fe94/assembly/pom.xml -- diff --git a/assembly/pom.xml b/assembly/pom.xml index 438e6ed..6ec2ca4 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 -1.6.2 +1.6.3-SNAPSHOT ../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/4621fe94/bagel/pom.xml -- diff --git a/bagel/pom.xml b/bagel/pom.xml index 85be37f..2d778c5 100644 --- a/bagel/pom.xml +++ b/bagel/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 -1.6.2 +1.6.3-SNAPSHOT ../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/4621fe94/core/pom.xml -- diff --git a/core/pom.xml b/core/pom.xml index 15e60a3..a8d7863 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 -1.6.2 +1.6.3-SNAPSHOT ../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/4621fe94/docker-integration-tests/pom.xml -- diff --git a/docker-integration-tests/pom.xml b/docker-integration-tests/pom.xml index 0bc749f..a06e59c 100644 --- a/docker-integration-tests/pom.xml +++ b/docker-integration-tests/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.10 -1.6.2 +1.6.3-SNAPSHOT ../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/4621fe94/examples/pom.xml -- diff --git a/examples/pom.xml b/examples/pom.xml index f771a36..8e9e02e 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 -1.6.2 +1.6.3-SNAPSHOT ../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/4621fe94/external/flume-assembly/pom.xml -- diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index 1ef7e7f..52c8a91 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 -1.6.2 +1.6.3-SNAPSHOT ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/4621fe94/external/flume-sink/pom.xml -- diff --git 
a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index c1cb64b
[spark] Git Push Summary
Repository: spark Updated Tags: refs/tags/v1.6.2-rc1 [created] 4168d9c94
[spark] Git Push Summary
Repository: spark Updated Tags: refs/tags/v1.6.2 [deleted] f16649304
[2/2] spark git commit: Preparing development version 1.6.3-SNAPSHOT
Preparing development version 1.6.3-SNAPSHOT Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b8f380f7 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b8f380f7 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b8f380f7 Branch: refs/heads/branch-1.6 Commit: b8f380f79aa46754d308cd9729fa97a76fdb951e Parents: f166493 Author: Patrick Wendell Authored: Thu Jun 16 16:35:51 2016 -0700 Committer: Patrick Wendell Committed: Thu Jun 16 16:35:51 2016 -0700 -- assembly/pom.xml| 2 +- bagel/pom.xml | 2 +- core/pom.xml| 2 +- docker-integration-tests/pom.xml| 2 +- examples/pom.xml| 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka-assembly/pom.xml | 2 +- external/kafka/pom.xml | 2 +- external/mqtt-assembly/pom.xml | 2 +- external/mqtt/pom.xml | 2 +- external/twitter/pom.xml| 2 +- external/zeromq/pom.xml | 2 +- extras/java8-tests/pom.xml | 2 +- extras/kinesis-asl-assembly/pom.xml | 2 +- extras/kinesis-asl/pom.xml | 2 +- extras/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- launcher/pom.xml| 2 +- mllib/pom.xml | 2 +- network/common/pom.xml | 2 +- network/shuffle/pom.xml | 2 +- network/yarn/pom.xml| 2 +- pom.xml | 2 +- repl/pom.xml| 2 +- sql/catalyst/pom.xml| 2 +- sql/core/pom.xml| 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml| 2 +- streaming/pom.xml | 2 +- tags/pom.xml| 2 +- tools/pom.xml | 2 +- unsafe/pom.xml | 2 +- yarn/pom.xml| 2 +- 35 files changed, 35 insertions(+), 35 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b8f380f7/assembly/pom.xml -- diff --git a/assembly/pom.xml b/assembly/pom.xml index 438e6ed..6ec2ca4 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 -1.6.2 +1.6.3-SNAPSHOT ../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/b8f380f7/bagel/pom.xml -- diff --git a/bagel/pom.xml b/bagel/pom.xml index 85be37f..2d778c5 100644 --- a/bagel/pom.xml +++ b/bagel/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 -1.6.2 +1.6.3-SNAPSHOT ../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/b8f380f7/core/pom.xml -- diff --git a/core/pom.xml b/core/pom.xml index 15e60a3..a8d7863 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 -1.6.2 +1.6.3-SNAPSHOT ../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/b8f380f7/docker-integration-tests/pom.xml -- diff --git a/docker-integration-tests/pom.xml b/docker-integration-tests/pom.xml index 0bc749f..a06e59c 100644 --- a/docker-integration-tests/pom.xml +++ b/docker-integration-tests/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.10 -1.6.2 +1.6.3-SNAPSHOT ../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/b8f380f7/examples/pom.xml -- diff --git a/examples/pom.xml b/examples/pom.xml index f771a36..8e9e02e 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 -1.6.2 +1.6.3-SNAPSHOT ../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/b8f380f7/external/flume-assembly/pom.xml -- diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index 1ef7e7f..52c8a91 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 -1.6.2 +1.6.3-SNAPSHOT ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/b8f380f7/external/flume-sink/pom.xml -- diff --git 
a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index c1cb64b
[spark] Git Push Summary
Repository: spark Updated Tags: refs/tags/v1.6.2 [created] f16649304
[1/2] spark git commit: Preparing Spark release v1.6.2
Repository: spark Updated Branches: refs/heads/branch-1.6 a4485c3b5 -> b8f380f79 Preparing Spark release v1.6.2 Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f1664930 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f1664930 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f1664930 Branch: refs/heads/branch-1.6 Commit: f166493046f33b511d0b1f05e83d913741497648 Parents: a4485c3 Author: Patrick Wendell Authored: Thu Jun 16 16:35:44 2016 -0700 Committer: Patrick Wendell Committed: Thu Jun 16 16:35:44 2016 -0700 -- assembly/pom.xml| 2 +- bagel/pom.xml | 2 +- core/pom.xml| 2 +- docker-integration-tests/pom.xml| 2 +- examples/pom.xml| 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka-assembly/pom.xml | 2 +- external/kafka/pom.xml | 2 +- external/mqtt-assembly/pom.xml | 2 +- external/mqtt/pom.xml | 2 +- external/twitter/pom.xml| 2 +- external/zeromq/pom.xml | 2 +- extras/java8-tests/pom.xml | 2 +- extras/kinesis-asl-assembly/pom.xml | 2 +- extras/kinesis-asl/pom.xml | 2 +- extras/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- launcher/pom.xml| 2 +- mllib/pom.xml | 2 +- network/common/pom.xml | 2 +- network/shuffle/pom.xml | 2 +- network/yarn/pom.xml| 2 +- pom.xml | 2 +- repl/pom.xml| 2 +- sql/catalyst/pom.xml| 2 +- sql/core/pom.xml| 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml| 2 +- streaming/pom.xml | 2 +- tags/pom.xml| 2 +- tools/pom.xml | 2 +- unsafe/pom.xml | 2 +- yarn/pom.xml| 2 +- 35 files changed, 35 insertions(+), 35 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f1664930/assembly/pom.xml -- diff --git a/assembly/pom.xml b/assembly/pom.xml index ff345226..438e6ed 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 -1.6.2-SNAPSHOT +1.6.2 ../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/f1664930/bagel/pom.xml -- diff --git a/bagel/pom.xml b/bagel/pom.xml index e31cf2f..85be37f 100644 --- a/bagel/pom.xml +++ b/bagel/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 -1.6.2-SNAPSHOT +1.6.2 ../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/f1664930/core/pom.xml -- diff --git a/core/pom.xml b/core/pom.xml index 4f909ba..15e60a3 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 -1.6.2-SNAPSHOT +1.6.2 ../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/f1664930/docker-integration-tests/pom.xml -- diff --git a/docker-integration-tests/pom.xml b/docker-integration-tests/pom.xml index 0cce7ff..0bc749f 100644 --- a/docker-integration-tests/pom.xml +++ b/docker-integration-tests/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.10 -1.6.2-SNAPSHOT +1.6.2 ../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/f1664930/examples/pom.xml -- diff --git a/examples/pom.xml b/examples/pom.xml index ed97dc2..f771a36 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 -1.6.2-SNAPSHOT +1.6.2 ../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/f1664930/external/flume-assembly/pom.xml -- diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index 0d6b563..1ef7e7f 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 -1.6.2-SNAPSHOT +1.6.2 ../../pom.xml 
http://git-wip-us.apache.org/repos/asf/spark/blob/f1664930/external/flume-sink/pom.xml -- diff --git a
spark git commit: Update branch-1.6 for 1.6.2 release.
Repository: spark Updated Branches: refs/heads/branch-1.6 0a8ada506 -> a4485c3b5 Update branch-1.6 for 1.6.2 release. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a4485c3b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a4485c3b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a4485c3b Branch: refs/heads/branch-1.6 Commit: a4485c3b561dbcd43bcb203e8ad139901ed581f0 Parents: 0a8ada5 Author: Reynold Xin Authored: Thu Jun 16 16:30:18 2016 -0700 Committer: Reynold Xin Committed: Thu Jun 16 16:30:18 2016 -0700 -- core/src/main/scala/org/apache/spark/package.scala | 2 +- docs/_config.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a4485c3b/core/src/main/scala/org/apache/spark/package.scala -- diff --git a/core/src/main/scala/org/apache/spark/package.scala b/core/src/main/scala/org/apache/spark/package.scala index 4e2269f..1499d14 100644 --- a/core/src/main/scala/org/apache/spark/package.scala +++ b/core/src/main/scala/org/apache/spark/package.scala @@ -43,5 +43,5 @@ package org.apache package object spark { // For package docs only - val SPARK_VERSION = "1.6.1" + val SPARK_VERSION = "1.6.2" } http://git-wip-us.apache.org/repos/asf/spark/blob/a4485c3b/docs/_config.yml -- diff --git a/docs/_config.yml b/docs/_config.yml index 9334516..c2ecb59 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -14,8 +14,8 @@ include: # These allow the documentation to be updated with newer releases # of Spark, Scala, and Mesos. -SPARK_VERSION: 1.6.1 -SPARK_VERSION_SHORT: 1.6.1 +SPARK_VERSION: 1.6.2 +SPARK_VERSION_SHORT: 1.6.2 SCALA_BINARY_VERSION: "2.10" SCALA_VERSION: "2.10.5" MESOS_VERSION: 0.21.0 - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
svn commit: r1748776 [2/2] - in /spark: news/_posts/ site/ site/graphx/ site/mllib/ site/news/ site/releases/ site/screencasts/ site/sql/ site/streaming/
Modified: spark/site/news/spark-tips-from-quantifind.html URL: http://svn.apache.org/viewvc/spark/site/news/spark-tips-from-quantifind.html?rev=1748776&r1=1748775&r2=1748776&view=diff == --- spark/site/news/spark-tips-from-quantifind.html (original) +++ spark/site/news/spark-tips-from-quantifind.html Thu Jun 16 22:14:05 2016 @@ -150,6 +150,9 @@ Latest News + Call for Presentations for Spark Summit EU is Open + (Jun 16, 2016) + Preview release of Spark 2.0 (May 26, 2016) @@ -159,9 +162,6 @@ Spark 1.6.1 released (Mar 09, 2016) - Submission is open for Spark Summit San Francisco - (Feb 11, 2016) - Archive Modified: spark/site/news/spark-user-survey-and-powered-by-page.html URL: http://svn.apache.org/viewvc/spark/site/news/spark-user-survey-and-powered-by-page.html?rev=1748776&r1=1748775&r2=1748776&view=diff == --- spark/site/news/spark-user-survey-and-powered-by-page.html (original) +++ spark/site/news/spark-user-survey-and-powered-by-page.html Thu Jun 16 22:14:05 2016 @@ -150,6 +150,9 @@ Latest News + Call for Presentations for Spark Summit EU is Open + (Jun 16, 2016) + Preview release of Spark 2.0 (May 26, 2016) @@ -159,9 +162,6 @@ Spark 1.6.1 released (Mar 09, 2016) - Submission is open for Spark Summit San Francisco - (Feb 11, 2016) - Archive Modified: spark/site/news/spark-version-0-6-0-released.html URL: http://svn.apache.org/viewvc/spark/site/news/spark-version-0-6-0-released.html?rev=1748776&r1=1748775&r2=1748776&view=diff == --- spark/site/news/spark-version-0-6-0-released.html (original) +++ spark/site/news/spark-version-0-6-0-released.html Thu Jun 16 22:14:05 2016 @@ -150,6 +150,9 @@ Latest News + Call for Presentations for Spark Summit EU is Open + (Jun 16, 2016) + Preview release of Spark 2.0 (May 26, 2016) @@ -159,9 +162,6 @@ Spark 1.6.1 released (Mar 09, 2016) - Submission is open for Spark Summit San Francisco - (Feb 11, 2016) - Archive Modified: spark/site/news/spark-wins-daytona-gray-sort-100tb-benchmark.html URL: http://svn.apache.org/viewvc/spark/site/news/spark-wins-daytona-gray-sort-100tb-benchmark.html?rev=1748776&r1=1748775&r2=1748776&view=diff == --- spark/site/news/spark-wins-daytona-gray-sort-100tb-benchmark.html (original) +++ spark/site/news/spark-wins-daytona-gray-sort-100tb-benchmark.html Thu Jun 16 22:14:05 2016 @@ -150,6 +150,9 @@ Latest News + Call for Presentations for Spark Summit EU is Open + (Jun 16, 2016) + Preview release of Spark 2.0 (May 26, 2016) @@ -159,9 +162,6 @@ Spark 1.6.1 released (Mar 09, 2016) - Submission is open for Spark Summit San Francisco - (Feb 11, 2016) - Archive Modified: spark/site/news/strata-exercises-now-available-online.html URL: http://svn.apache.org/viewvc/spark/site/news/strata-exercises-now-available-online.html?rev=1748776&r1=1748775&r2=1748776&view=diff == --- spark/site/news/strata-exercises-now-available-online.html (original) +++ spark/site/news/strata-exercises-now-available-online.html Thu Jun 16 22:14:05 2016 @@ -150,6 +150,9 @@ Latest News + Call for Presentations for Spark Summit EU is Open + (Jun 16, 2016) + Preview release of Spark 2.0 (May 26, 2016) @@ -159,9 +162,6 @@ Spark 1.6.1 released (Mar 09, 2016) - Submission is open for Spark Summit San Francisco - (Feb 11, 2016) - Archive Modified: spark/site/news/submit-talks-to-spark-summit-2014.html URL: http://svn.apache.org/viewvc/spark/site/news/submit-talks-to-spark-summit-2014.html?rev=1748776&r1=1748775&r2=1748776&view=diff == --- spark/site/news/submit-talks-to-spark-summit-2014.html (original) +++ spark/site/news/submit-talks-to-spark-summit-2014.html Thu 
Jun 16 22:14:05 2016 @@ -150,6 +150,9 @@ Latest News + Call for Presentations for Spark Summit EU is Open +
svn commit: r1748776 [1/2] - in /spark: news/_posts/ site/ site/graphx/ site/mllib/ site/news/ site/releases/ site/screencasts/ site/sql/ site/streaming/
Author: yhuai Date: Thu Jun 16 22:14:05 2016 New Revision: 1748776 URL: http://svn.apache.org/viewvc?rev=1748776&view=rev Log: Add a new news for CFP of Spark Summit 2016 EU Added: spark/news/_posts/2016-06-16-submit-talks-to-spark-summit-eu-2016.md spark/site/news/submit-talks-to-spark-summit-eu-2016.html Modified: spark/site/community.html spark/site/documentation.html spark/site/downloads.html spark/site/examples.html spark/site/faq.html spark/site/graphx/index.html spark/site/index.html spark/site/mailing-lists.html spark/site/mllib/index.html spark/site/news/amp-camp-2013-registration-ope.html spark/site/news/announcing-the-first-spark-summit.html spark/site/news/fourth-spark-screencast-published.html spark/site/news/index.html spark/site/news/nsdi-paper.html spark/site/news/one-month-to-spark-summit-2015.html spark/site/news/proposals-open-for-spark-summit-east.html spark/site/news/registration-open-for-spark-summit-east.html spark/site/news/run-spark-and-shark-on-amazon-emr.html spark/site/news/spark-0-6-1-and-0-5-2-released.html spark/site/news/spark-0-6-2-released.html spark/site/news/spark-0-7-0-released.html spark/site/news/spark-0-7-2-released.html spark/site/news/spark-0-7-3-released.html spark/site/news/spark-0-8-0-released.html spark/site/news/spark-0-8-1-released.html spark/site/news/spark-0-9-0-released.html spark/site/news/spark-0-9-1-released.html spark/site/news/spark-0-9-2-released.html spark/site/news/spark-1-0-0-released.html spark/site/news/spark-1-0-1-released.html spark/site/news/spark-1-0-2-released.html spark/site/news/spark-1-1-0-released.html spark/site/news/spark-1-1-1-released.html spark/site/news/spark-1-2-0-released.html spark/site/news/spark-1-2-1-released.html spark/site/news/spark-1-2-2-released.html spark/site/news/spark-1-3-0-released.html spark/site/news/spark-1-4-0-released.html spark/site/news/spark-1-4-1-released.html spark/site/news/spark-1-5-0-released.html spark/site/news/spark-1-5-1-released.html spark/site/news/spark-1-5-2-released.html spark/site/news/spark-1-6-0-released.html spark/site/news/spark-1-6-1-released.html spark/site/news/spark-2.0.0-preview.html spark/site/news/spark-accepted-into-apache-incubator.html spark/site/news/spark-and-shark-in-the-news.html spark/site/news/spark-becomes-tlp.html spark/site/news/spark-featured-in-wired.html spark/site/news/spark-mailing-lists-moving-to-apache.html spark/site/news/spark-meetups.html spark/site/news/spark-screencasts-published.html spark/site/news/spark-summit-2013-is-a-wrap.html spark/site/news/spark-summit-2014-videos-posted.html spark/site/news/spark-summit-2015-videos-posted.html spark/site/news/spark-summit-agenda-posted.html spark/site/news/spark-summit-east-2015-videos-posted.html spark/site/news/spark-summit-east-2016-cfp-closing.html spark/site/news/spark-summit-east-agenda-posted.html spark/site/news/spark-summit-europe-agenda-posted.html spark/site/news/spark-summit-europe.html spark/site/news/spark-summit-june-2016-agenda-posted.html spark/site/news/spark-tips-from-quantifind.html spark/site/news/spark-user-survey-and-powered-by-page.html spark/site/news/spark-version-0-6-0-released.html spark/site/news/spark-wins-daytona-gray-sort-100tb-benchmark.html spark/site/news/strata-exercises-now-available-online.html spark/site/news/submit-talks-to-spark-summit-2014.html spark/site/news/submit-talks-to-spark-summit-2016.html spark/site/news/submit-talks-to-spark-summit-east-2016.html spark/site/news/two-weeks-to-spark-summit-2014.html 
spark/site/news/video-from-first-spark-development-meetup.html spark/site/releases/spark-release-0-3.html spark/site/releases/spark-release-0-5-0.html spark/site/releases/spark-release-0-5-1.html spark/site/releases/spark-release-0-5-2.html spark/site/releases/spark-release-0-6-0.html spark/site/releases/spark-release-0-6-1.html spark/site/releases/spark-release-0-6-2.html spark/site/releases/spark-release-0-7-0.html spark/site/releases/spark-release-0-7-2.html spark/site/releases/spark-release-0-7-3.html spark/site/releases/spark-release-0-8-0.html spark/site/releases/spark-release-0-8-1.html spark/site/releases/spark-release-0-9-0.html spark/site/releases/spark-release-0-9-1.html spark/site/releases/spark-release-0-9-2.html spark/site/releases/spark-release-1-0-0.html spark/site/releases/spark-release-1-0-1.html spark/site/releases/spark-release-1-0-2.html spark/site/releases/spark-release-1-1-0.html spark/site/releases/spark-release-1-1-1.html spark/site/releases/spark-release-1-2-0.html spark/site/releases/spark-release-1-2-1.html spark/site/releases/spark-release-1-2-2.html spar
spark git commit: [SPARK-15749][SQL] make the error message more meaningful
Repository: spark Updated Branches: refs/heads/master e849285df -> 62d2fa5e9 [SPARK-15749][SQL] make the error message more meaningful ## What changes were proposed in this pull request? For table test1 (C1 varchar (10), C2 varchar (10)), when I insert a row using ``` sqlContext.sql("insert into test1 values ('abc', 'def', 1)") ``` I got error message ``` Exception in thread "main" java.lang.RuntimeException: RelationC1#0,C2#1 JDBCRelation(test1) requires that the query in the SELECT clause of the INSERT INTO/OVERWRITE statement generates the same number of columns as its schema. ``` The error message is a little confusing. In my simple insert statement, it doesn't have a SELECT clause. I will change the error message to a more general one ``` Exception in thread "main" java.lang.RuntimeException: RelationC1#0,C2#1 JDBCRelation(test1) requires that the data to be inserted have the same number of columns as the target table. ``` ## How was this patch tested? I tested the patch using my simple unit test, but it's a very trivial change and I don't think I need to check in any test. Author: Huaxin Gao Closes #13492 from huaxingao/spark-15749. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/62d2fa5e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/62d2fa5e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/62d2fa5e Branch: refs/heads/master Commit: 62d2fa5e996d428caaea005041b17ec115473762 Parents: e849285 Author: Huaxin Gao Authored: Thu Jun 16 14:37:10 2016 -0700 Committer: Andrew Or Committed: Thu Jun 16 14:37:10 2016 -0700 -- .../org/apache/spark/sql/execution/datasources/rules.scala | 5 +++-- .../test/scala/org/apache/spark/sql/sources/InsertSuite.scala | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/62d2fa5e/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala index 7ac62fb..543389e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala @@ -78,8 +78,9 @@ private[sql] object PreInsertCastAndRename extends Rule[LogicalPlan] { // schema of the relation. 
if (l.output.size != child.output.size) { sys.error( -s"$l requires that the query in the SELECT clause of the INSERT INTO/OVERWRITE " + - s"statement generates the same number of columns as its schema.") +s"$l requires that the data to be inserted have the same number of columns as the " + + s"target table: target table has ${l.output.size} column(s) but " + + s"the inserted data has ${child.output.size} column(s).") } castAndRenameChildOutput(i, l.output, child) } http://git-wip-us.apache.org/repos/asf/spark/blob/62d2fa5e/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala index bade41b..d717955 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala @@ -95,7 +95,7 @@ class InsertSuite extends DataSourceTest with SharedSQLContext { """.stripMargin) }.getMessage assert( - message.contains("generates the same number of columns as its schema"), + message.contains("requires that the data to be inserted have the same number of columns"), "SELECT clause generating a different number of columns should not be not allowed." ) } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-15749][SQL] make the error message more meaningful
Repository: spark Updated Branches: refs/heads/branch-2.0 27e274c3e -> 2280ad8a3 [SPARK-15749][SQL] make the error message more meaningful ## What changes were proposed in this pull request? For table test1 (C1 varchar (10), C2 varchar (10)), when I insert a row using ``` sqlContext.sql("insert into test1 values ('abc', 'def', 1)") ``` I got error message ``` Exception in thread "main" java.lang.RuntimeException: RelationC1#0,C2#1 JDBCRelation(test1) requires that the query in the SELECT clause of the INSERT INTO/OVERWRITE statement generates the same number of columns as its schema. ``` The error message is a little confusing. In my simple insert statement, it doesn't have a SELECT clause. I will change the error message to a more general one ``` Exception in thread "main" java.lang.RuntimeException: RelationC1#0,C2#1 JDBCRelation(test1) requires that the data to be inserted have the same number of columns as the target table. ``` ## How was this patch tested? I tested the patch using my simple unit test, but it's a very trivial change and I don't think I need to check in any test. Author: Huaxin Gao Closes #13492 from huaxingao/spark-15749. (cherry picked from commit 62d2fa5e996d428caaea005041b17ec115473762) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2280ad8a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2280ad8a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2280ad8a Branch: refs/heads/branch-2.0 Commit: 2280ad8a3ddfff0b7cc10de6eadb2cc93423bbcf Parents: 27e274c Author: Huaxin Gao Authored: Thu Jun 16 14:37:10 2016 -0700 Committer: Andrew Or Committed: Thu Jun 16 14:37:19 2016 -0700 -- .../org/apache/spark/sql/execution/datasources/rules.scala | 5 +++-- .../test/scala/org/apache/spark/sql/sources/InsertSuite.scala | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2280ad8a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala index 7ac62fb..543389e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala @@ -78,8 +78,9 @@ private[sql] object PreInsertCastAndRename extends Rule[LogicalPlan] { // schema of the relation. 
if (l.output.size != child.output.size) { sys.error( -s"$l requires that the query in the SELECT clause of the INSERT INTO/OVERWRITE " + - s"statement generates the same number of columns as its schema.") +s"$l requires that the data to be inserted have the same number of columns as the " + + s"target table: target table has ${l.output.size} column(s) but " + + s"the inserted data has ${child.output.size} column(s).") } castAndRenameChildOutput(i, l.output, child) } http://git-wip-us.apache.org/repos/asf/spark/blob/2280ad8a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala index bade41b..d717955 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala @@ -95,7 +95,7 @@ class InsertSuite extends DataSourceTest with SharedSQLContext { """.stripMargin) }.getMessage assert( - message.contains("generates the same number of columns as its schema"), + message.contains("requires that the data to be inserted have the same number of columns"), "SELECT clause generating a different number of columns should not be not allowed." ) } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
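The reworded check boils down to comparing the arity of the target relation's output with the arity of the incoming data. A standalone sketch of that logic (not Spark's actual PreInsertCastAndRename rule; the names are illustrative):

```
// Minimal illustration: fail when the inserted row has a different number of
// columns than the target table, using wording in the style of the new message.
def checkInsertArity(targetColumns: Seq[String], insertedValues: Seq[Any]): Unit = {
  if (targetColumns.size != insertedValues.size) {
    sys.error(
      s"target table has ${targetColumns.size} column(s) but " +
        s"the inserted data has ${insertedValues.size} column(s).")
  }
}

checkInsertArity(Seq("C1", "C2"), Seq("abc", "def", 1))  // throws RuntimeException
```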
spark git commit: [SPARK-15868][WEB UI] Executors table in Executors tab should sort Executor IDs in numerical order
Repository: spark Updated Branches: refs/heads/branch-2.0 fb0fab63c -> 27e274c3e [SPARK-15868][WEB UI] Executors table in Executors tab should sort Executor IDs in numerical order ## What changes were proposed in this pull request? Currently the Executors table sorts by id using a string sort (since that's what it is stored as). Since the id is a number (other than the driver) we should be sorting numerically. I have changed both the initial sort on page load as well as the table sort to sort on id numerically, treating non-numeric strings (like the driver) as "-1" ## How was this patch tested? Manually tested and dev/run-tests ![pageload](https://cloud.githubusercontent.com/assets/13952758/16027882/d32edd0a-318e-11e6-9faf-fc972b7c36ab.png) ![sorted](https://cloud.githubusercontent.com/assets/13952758/16027883/d34541c6-318e-11e6-9ed7-6bfc0cd4152e.png) Author: Alex Bozarth Closes #13654 from ajbozarth/spark15868. (cherry picked from commit e849285df03b1233d5f647f1b6c5a6dad0665855) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/27e274c3 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/27e274c3 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/27e274c3 Branch: refs/heads/branch-2.0 Commit: 27e274c3e8cad29fc684a1611cef19d60acdfbc0 Parents: fb0fab6 Author: Alex Bozarth Authored: Thu Jun 16 14:29:11 2016 -0700 Committer: Andrew Or Committed: Thu Jun 16 14:29:21 2016 -0700 -- .../main/scala/org/apache/spark/ui/exec/ExecutorsPage.scala | 9 +++-- 1 file changed, 7 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/27e274c3/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsPage.scala -- diff --git a/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsPage.scala b/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsPage.scala index 791dbe5..67deb7b 100644 --- a/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsPage.scala @@ -20,6 +20,7 @@ package org.apache.spark.ui.exec import java.net.URLEncoder import javax.servlet.http.HttpServletRequest +import scala.util.Try import scala.xml.Node import org.apache.spark.status.api.v1.ExecutorSummary @@ -53,6 +54,9 @@ private[ui] class ExecutorsPage( // When GCTimePercent is edited change ToolTips.TASK_TIME to match private val GCTimePercent = 0.1 + // a safe String to Int for sorting ids (converts non-numeric Strings to -1) + private def idStrToInt(str: String) : Int = Try(str.toInt).getOrElse(-1) + def render(request: HttpServletRequest): Seq[Node] = { val (activeExecutorInfo, deadExecutorInfo) = listener.synchronized { // The follow codes should be protected by `listener` to make sure no executors will be @@ -69,13 +73,14 @@ private[ui] class ExecutorsPage( } val execInfo = activeExecutorInfo ++ deadExecutorInfo +implicit val idOrder = Ordering[Int].on((s: String) => idStrToInt(s)).reverse val execInfoSorted = execInfo.sortBy(_.id) val logsExist = execInfo.filter(_.executorLogs.nonEmpty).nonEmpty val execTable = { - Executor ID + Executor ID Address Status RDD Blocks @@ -136,7 +141,7 @@ private[ui] class ExecutorsPage( } - {info.id} + {info.id} {info.hostPort} {executorStatus} - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-15868][WEB UI] Executors table in Executors tab should sort Executor IDs in numerical order
Repository: spark Updated Branches: refs/heads/master 2d27eb1e7 -> e849285df [SPARK-15868][WEB UI] Executors table in Executors tab should sort Executor IDs in numerical order ## What changes were proposed in this pull request? Currently the Executors table sorts by id using a string sort (since that's what it is stored as). Since the id is a number (other than the driver) we should be sorting numerically. I have changed both the initial sort on page load as well as the table sort to sort on id numerically, treating non-numeric strings (like the driver) as "-1" ## How was this patch tested? Manually tested and dev/run-tests ![pageload](https://cloud.githubusercontent.com/assets/13952758/16027882/d32edd0a-318e-11e6-9faf-fc972b7c36ab.png) ![sorted](https://cloud.githubusercontent.com/assets/13952758/16027883/d34541c6-318e-11e6-9ed7-6bfc0cd4152e.png) Author: Alex Bozarth Closes #13654 from ajbozarth/spark15868. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e849285d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e849285d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e849285d Branch: refs/heads/master Commit: e849285df03b1233d5f647f1b6c5a6dad0665855 Parents: 2d27eb1 Author: Alex Bozarth Authored: Thu Jun 16 14:29:11 2016 -0700 Committer: Andrew Or Committed: Thu Jun 16 14:29:11 2016 -0700 -- .../main/scala/org/apache/spark/ui/exec/ExecutorsPage.scala | 9 +++-- 1 file changed, 7 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e849285d/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsPage.scala -- diff --git a/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsPage.scala b/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsPage.scala index 791dbe5..67deb7b 100644 --- a/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsPage.scala @@ -20,6 +20,7 @@ package org.apache.spark.ui.exec import java.net.URLEncoder import javax.servlet.http.HttpServletRequest +import scala.util.Try import scala.xml.Node import org.apache.spark.status.api.v1.ExecutorSummary @@ -53,6 +54,9 @@ private[ui] class ExecutorsPage( // When GCTimePercent is edited change ToolTips.TASK_TIME to match private val GCTimePercent = 0.1 + // a safe String to Int for sorting ids (converts non-numeric Strings to -1) + private def idStrToInt(str: String) : Int = Try(str.toInt).getOrElse(-1) + def render(request: HttpServletRequest): Seq[Node] = { val (activeExecutorInfo, deadExecutorInfo) = listener.synchronized { // The follow codes should be protected by `listener` to make sure no executors will be @@ -69,13 +73,14 @@ private[ui] class ExecutorsPage( } val execInfo = activeExecutorInfo ++ deadExecutorInfo +implicit val idOrder = Ordering[Int].on((s: String) => idStrToInt(s)).reverse val execInfoSorted = execInfo.sortBy(_.id) val logsExist = execInfo.filter(_.executorLogs.nonEmpty).nonEmpty val execTable = { - Executor ID + Executor ID Address Status RDD Blocks @@ -136,7 +141,7 @@ private[ui] class ExecutorsPage( } - {info.id} + {info.id} {info.hostPort} {executorStatus} - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
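The heart of the change is a numeric-aware comparison of executor IDs; a minimal sketch of the idea (the patch itself additionally reverses the ordering used for the initial table sort):

```
// Treat non-numeric IDs such as "driver" as -1 so they compare below real IDs,
// then sort by the numeric value instead of lexicographically.
import scala.util.Try

def idStrToInt(str: String): Int = Try(str.toInt).getOrElse(-1)

val ids = Seq("10", "2", "driver", "1")
println(ids.sorted)             // lexicographic: List(1, 10, 2, driver)
println(ids.sortBy(idStrToInt)) // numeric-aware: List(driver, 1, 2, 10)
```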
spark git commit: [MINOR][DOCS][SQL] Fix some comments about types(TypeCoercion, Partition) and exceptions.
Repository: spark Updated Branches: refs/heads/branch-2.0 7d8cddfb4 -> fb0fab63c [MINOR][DOCS][SQL] Fix some comments about types(TypeCoercion,Partition) and exceptions. ## What changes were proposed in this pull request? This PR contains a few changes on code comments. - `HiveTypeCoercion` is renamed into `TypeCoercion`. - `NoSuchDatabaseException` is only used for the absence of database. - For partition type inference, only `DoubleType` is considered. ## How was this patch tested? N/A Author: Dongjoon Hyun Closes #13674 from dongjoon-hyun/minor_doc_types. (cherry picked from commit 2d27eb1e753daefbd311136fc7de1a3e8fb9dc63) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fb0fab63 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fb0fab63 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fb0fab63 Branch: refs/heads/branch-2.0 Commit: fb0fab63cb005d9efc624aeb0ac85476a9ddc4f4 Parents: 7d8cddf Author: Dongjoon Hyun Authored: Thu Jun 16 14:27:09 2016 -0700 Committer: Andrew Or Committed: Thu Jun 16 14:27:17 2016 -0700 -- .../org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala| 4 ++-- .../org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala | 2 +- .../src/main/scala/org/apache/spark/sql/types/Decimal.scala | 2 +- .../spark/sql/execution/datasources/PartitioningUtils.scala | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/fb0fab63/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala index 16df628..baec6d1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala @@ -73,7 +73,7 @@ object TypeCoercion { DoubleType) /** - * Case 1 type widening (see the classdoc comment above for HiveTypeCoercion). + * Case 1 type widening (see the classdoc comment above for TypeCoercion). * * Find the tightest common type of two types that might be used in a binary expression. * This handles all numeric types except fixed-precision decimals interacting with each other or @@ -132,7 +132,7 @@ object TypeCoercion { } /** - * Case 2 type widening (see the classdoc comment above for HiveTypeCoercion). + * Case 2 type widening (see the classdoc comment above for TypeCoercion). * * i.e. the main difference with [[findTightestCommonTypeOfTwo]] is that here we allow some * loss of precision when widening decimal and double. http://git-wip-us.apache.org/repos/asf/spark/blob/fb0fab63/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala index 81974b2..6714846 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala @@ -27,7 +27,7 @@ import org.apache.spark.sql.catalyst.analysis.NoSuchDatabaseException * can be accessed in multiple threads. This is an external catalog because it is expected to * interact with external systems. 
* - * Implementations should throw [[NoSuchDatabaseException]] when table or database don't exist. + * Implementations should throw [[NoSuchDatabaseException]] when databases don't exist. */ abstract class ExternalCatalog { import CatalogTypes.TablePartitionSpec http://git-wip-us.apache.org/repos/asf/spark/blob/fb0fab63/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala index 52e0210..cc8175c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala @@ -322,7 +322,7 @@ final class Decimal extends Ordered[Decimal] with Serializable { } } - // HiveTypeCoercion will take care of the precision, scale of result + // TypeCoercion will take care of the precision, scale of result def * (that: Decima
spark git commit: [MINOR][DOCS][SQL] Fix some comments about types(TypeCoercion, Partition) and exceptions.
Repository: spark Updated Branches: refs/heads/master 796429d71 -> 2d27eb1e7 [MINOR][DOCS][SQL] Fix some comments about types(TypeCoercion,Partition) and exceptions. ## What changes were proposed in this pull request? This PR contains a few changes on code comments. - `HiveTypeCoercion` is renamed into `TypeCoercion`. - `NoSuchDatabaseException` is only used for the absence of database. - For partition type inference, only `DoubleType` is considered. ## How was this patch tested? N/A Author: Dongjoon Hyun Closes #13674 from dongjoon-hyun/minor_doc_types. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2d27eb1e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2d27eb1e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2d27eb1e Branch: refs/heads/master Commit: 2d27eb1e753daefbd311136fc7de1a3e8fb9dc63 Parents: 796429d Author: Dongjoon Hyun Authored: Thu Jun 16 14:27:09 2016 -0700 Committer: Andrew Or Committed: Thu Jun 16 14:27:09 2016 -0700 -- .../org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala| 4 ++-- .../org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala | 2 +- .../src/main/scala/org/apache/spark/sql/types/Decimal.scala | 2 +- .../spark/sql/execution/datasources/PartitioningUtils.scala | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2d27eb1e/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala index 16df628..baec6d1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala @@ -73,7 +73,7 @@ object TypeCoercion { DoubleType) /** - * Case 1 type widening (see the classdoc comment above for HiveTypeCoercion). + * Case 1 type widening (see the classdoc comment above for TypeCoercion). * * Find the tightest common type of two types that might be used in a binary expression. * This handles all numeric types except fixed-precision decimals interacting with each other or @@ -132,7 +132,7 @@ object TypeCoercion { } /** - * Case 2 type widening (see the classdoc comment above for HiveTypeCoercion). + * Case 2 type widening (see the classdoc comment above for TypeCoercion). * * i.e. the main difference with [[findTightestCommonTypeOfTwo]] is that here we allow some * loss of precision when widening decimal and double. http://git-wip-us.apache.org/repos/asf/spark/blob/2d27eb1e/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala index 81974b2..6714846 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala @@ -27,7 +27,7 @@ import org.apache.spark.sql.catalyst.analysis.NoSuchDatabaseException * can be accessed in multiple threads. This is an external catalog because it is expected to * interact with external systems. * - * Implementations should throw [[NoSuchDatabaseException]] when table or database don't exist. 
+ * Implementations should throw [[NoSuchDatabaseException]] when databases don't exist. */ abstract class ExternalCatalog { import CatalogTypes.TablePartitionSpec http://git-wip-us.apache.org/repos/asf/spark/blob/2d27eb1e/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala index 52e0210..cc8175c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala @@ -322,7 +322,7 @@ final class Decimal extends Ordered[Decimal] with Serializable { } } - // HiveTypeCoercion will take care of the precision, scale of result + // TypeCoercion will take care of the precision, scale of result def * (that: Decimal): Decimal = Decimal(toJavaBigDecimal.multiply(that.toJavaBigDecimal, MATH_CONTEXT)) http://git
spark git commit: [SPARK-15998][SQL] Verification of SQLConf HIVE_METASTORE_PARTITION_PRUNING
Repository: spark Updated Branches: refs/heads/branch-2.0 1230516d9 -> 7d8cddfb4 [SPARK-15998][SQL] Verification of SQLConf HIVE_METASTORE_PARTITION_PRUNING What changes were proposed in this pull request? `HIVE_METASTORE_PARTITION_PRUNING` is a public `SQLConf`. When `true`, some predicates will be pushed down into the Hive metastore so that unmatching partitions can be eliminated earlier. The current default value is `false`. For performance improvement, users might turn this parameter on. So far, the code base does not have such a test case to verify whether this `SQLConf` properly works. This PR is to improve the test case coverage for avoiding future regression. How was this patch tested? N/A Author: gatorsmile Closes #13716 from gatorsmile/addTestMetastorePartitionPruning. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7d8cddfb Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7d8cddfb Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7d8cddfb Branch: refs/heads/branch-2.0 Commit: 7d8cddfb495d406b9f2fb5216edd14dea442ec73 Parents: 1230516 Author: gatorsmile Authored: Thu Jun 16 14:23:17 2016 -0700 Committer: Andrew Or Committed: Thu Jun 16 14:26:46 2016 -0700 -- .../sql/hive/execution/HiveTableScanSuite.scala | 60 +++- 1 file changed, 57 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7d8cddfb/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala index 60f8be5..76d3f3d 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala @@ -18,13 +18,14 @@ package org.apache.spark.sql.hive.execution import org.apache.spark.sql.Row -import org.apache.spark.sql.functions._ -import org.apache.spark.sql.hive.test.TestHive +import org.apache.spark.sql.hive.test.{TestHive, TestHiveSingleton} import org.apache.spark.sql.hive.test.TestHive._ import org.apache.spark.sql.hive.test.TestHive.implicits._ +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.util.Utils -class HiveTableScanSuite extends HiveComparisonTest { +class HiveTableScanSuite extends HiveComparisonTest with SQLTestUtils with TestHiveSingleton { createQueryTest("partition_based_table_scan_with_different_serde", """ @@ -89,4 +90,57 @@ class HiveTableScanSuite extends HiveComparisonTest { assert(sql("select CaseSensitiveColName from spark_4959_2").head() === Row("hi")) assert(sql("select casesensitivecolname from spark_4959_2").head() === Row("hi")) } + + private def checkNumScannedPartitions(stmt: String, expectedNumParts: Int): Unit = { +val plan = sql(stmt).queryExecution.sparkPlan +val numPartitions = plan.collectFirst { + case p: HiveTableScanExec => +p.relation.getHiveQlPartitions(p.partitionPruningPred).length +}.getOrElse(0) +assert(numPartitions == expectedNumParts) + } + + test("Verify SQLConf HIVE_METASTORE_PARTITION_PRUNING") { +val view = "src" +withTempTable(view) { + spark.range(1, 5).createOrReplaceTempView(view) + val table = "table_with_partition" + withTable(table) { +sql( + s""" + |CREATE TABLE $table(id string) + |PARTITIONED BY (p1 string,p2 string,p3 string,p4 string,p5 string) + 
""".stripMargin) +sql( + s""" + |FROM $view v + |INSERT INTO TABLE $table + |PARTITION (p1='a',p2='b',p3='c',p4='d',p5='e') + |SELECT v.id + |INSERT INTO TABLE $table + |PARTITION (p1='a',p2='c',p3='c',p4='d',p5='e') + |SELECT v.id + """.stripMargin) + +Seq("true", "false").foreach { hivePruning => + withSQLConf(SQLConf.HIVE_METASTORE_PARTITION_PRUNING.key -> hivePruning) { +// If the pruning predicate is used, getHiveQlPartitions should only return the +// qualified partition; Otherwise, it return all the partitions. +val expectedNumPartitions = if (hivePruning == "true") 1 else 2 +checkNumScannedPartitions( + stmt = s"SELECT id, p2 FROM $table WHERE p2 <= 'b'", expectedNumPartitions) + } +} + +Seq("true", "false").foreach { hivePruning => + withSQLConf(SQLConf.HIVE_METASTORE_PARTITION_PRUNING.key -> hivePruning) { +
spark git commit: [SPARK-15998][SQL] Verification of SQLConf HIVE_METASTORE_PARTITION_PRUNING
Repository: spark Updated Branches: refs/heads/master 7a89f2adb -> 796429d71 [SPARK-15998][SQL] Verification of SQLConf HIVE_METASTORE_PARTITION_PRUNING What changes were proposed in this pull request? `HIVE_METASTORE_PARTITION_PRUNING` is a public `SQLConf`. When `true`, some predicates will be pushed down into the Hive metastore so that unmatching partitions can be eliminated earlier. The current default value is `false`. For performance improvement, users might turn this parameter on. So far, the code base does not have such a test case to verify whether this `SQLConf` properly works. This PR is to improve the test case coverage for avoiding future regression. How was this patch tested? N/A Author: gatorsmile Closes #13716 from gatorsmile/addTestMetastorePartitionPruning. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/796429d7 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/796429d7 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/796429d7 Branch: refs/heads/master Commit: 796429d7117e2544207bd9d67bda8b603cb1a535 Parents: 7a89f2a Author: gatorsmile Authored: Thu Jun 16 14:23:17 2016 -0700 Committer: Andrew Or Committed: Thu Jun 16 14:23:17 2016 -0700 -- .../sql/hive/execution/HiveTableScanSuite.scala | 60 +++- 1 file changed, 57 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/796429d7/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala index 60f8be5..76d3f3d 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala @@ -18,13 +18,14 @@ package org.apache.spark.sql.hive.execution import org.apache.spark.sql.Row -import org.apache.spark.sql.functions._ -import org.apache.spark.sql.hive.test.TestHive +import org.apache.spark.sql.hive.test.{TestHive, TestHiveSingleton} import org.apache.spark.sql.hive.test.TestHive._ import org.apache.spark.sql.hive.test.TestHive.implicits._ +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.util.Utils -class HiveTableScanSuite extends HiveComparisonTest { +class HiveTableScanSuite extends HiveComparisonTest with SQLTestUtils with TestHiveSingleton { createQueryTest("partition_based_table_scan_with_different_serde", """ @@ -89,4 +90,57 @@ class HiveTableScanSuite extends HiveComparisonTest { assert(sql("select CaseSensitiveColName from spark_4959_2").head() === Row("hi")) assert(sql("select casesensitivecolname from spark_4959_2").head() === Row("hi")) } + + private def checkNumScannedPartitions(stmt: String, expectedNumParts: Int): Unit = { +val plan = sql(stmt).queryExecution.sparkPlan +val numPartitions = plan.collectFirst { + case p: HiveTableScanExec => +p.relation.getHiveQlPartitions(p.partitionPruningPred).length +}.getOrElse(0) +assert(numPartitions == expectedNumParts) + } + + test("Verify SQLConf HIVE_METASTORE_PARTITION_PRUNING") { +val view = "src" +withTempTable(view) { + spark.range(1, 5).createOrReplaceTempView(view) + val table = "table_with_partition" + withTable(table) { +sql( + s""" + |CREATE TABLE $table(id string) + |PARTITIONED BY (p1 string,p2 string,p3 string,p4 string,p5 string) + """.stripMargin) +sql( 
+ s""" + |FROM $view v + |INSERT INTO TABLE $table + |PARTITION (p1='a',p2='b',p3='c',p4='d',p5='e') + |SELECT v.id + |INSERT INTO TABLE $table + |PARTITION (p1='a',p2='c',p3='c',p4='d',p5='e') + |SELECT v.id + """.stripMargin) + +Seq("true", "false").foreach { hivePruning => + withSQLConf(SQLConf.HIVE_METASTORE_PARTITION_PRUNING.key -> hivePruning) { +// If the pruning predicate is used, getHiveQlPartitions should only return the +// qualified partition; Otherwise, it return all the partitions. +val expectedNumPartitions = if (hivePruning == "true") 1 else 2 +checkNumScannedPartitions( + stmt = s"SELECT id, p2 FROM $table WHERE p2 <= 'b'", expectedNumPartitions) + } +} + +Seq("true", "false").foreach { hivePruning => + withSQLConf(SQLConf.HIVE_METASTORE_PARTITION_PRUNING.key -> hivePruning) { +
spark git commit: [SQL] Minor HashAggregateExec string output fixes
Repository: spark Updated Branches: refs/heads/branch-2.0 938988757 -> 1230516d9 [SQL] Minor HashAggregateExec string output fixes ## What changes were proposed in this pull request? This PR fixes some minor `.toString` format issues for `HashAggregateExec`. Before: ``` *HashAggregate(key=[a#234L,b#235L], functions=[count(1),max(c#236L)], output=[a#234L,b#235L,count(c)#247L,max(c)#248L]) ``` After: ``` *HashAggregate(keys=[a#234L, b#235L], functions=[count(1), max(c#236L)], output=[a#234L, b#235L, count(c)#247L, max(c)#248L]) ``` ## How was this patch tested? Manually tested. Author: Cheng Lian Closes #13710 from liancheng/minor-agg-string-fix. (cherry picked from commit 7a89f2adbbc82a23f06638806ffc8596a7efe7f3) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1230516d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1230516d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1230516d Branch: refs/heads/branch-2.0 Commit: 1230516d9314f55183bfa542eb7cdfac9d8dfec5 Parents: 9389887 Author: Cheng Lian Authored: Thu Jun 16 14:20:44 2016 -0700 Committer: Andrew Or Committed: Thu Jun 16 14:20:52 2016 -0700 -- .../spark/sql/execution/aggregate/HashAggregateExec.scala | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1230516d/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala index caeeba1..54d7340 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala @@ -774,13 +774,13 @@ case class HashAggregateExec( testFallbackStartsAt match { case None => -val keyString = Utils.truncatedString(groupingExpressions, "[", ",", "]") -val functionString = Utils.truncatedString(allAggregateExpressions, "[", ",", "]") -val outputString = Utils.truncatedString(output, "[", ",", "]") +val keyString = Utils.truncatedString(groupingExpressions, "[", ", ", "]") +val functionString = Utils.truncatedString(allAggregateExpressions, "[", ", ", "]") +val outputString = Utils.truncatedString(output, "[", ", ", "]") if (verbose) { - s"HashAggregate(key=$keyString, functions=$functionString, output=$outputString)" + s"HashAggregate(keys=$keyString, functions=$functionString, output=$outputString)" } else { - s"HashAggregate(key=$keyString, functions=$functionString)" + s"HashAggregate(keys=$keyString, functions=$functionString)" } case Some(fallbackStartsAt) => s"HashAggregateWithControlledFallback $groupingExpressions " + - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SQL] Minor HashAggregateExec string output fixes
Repository: spark Updated Branches: refs/heads/master acef843f6 -> 7a89f2adb [SQL] Minor HashAggregateExec string output fixes ## What changes were proposed in this pull request? This PR fixes some minor `.toString` format issues for `HashAggregateExec`. Before: ``` *HashAggregate(key=[a#234L,b#235L], functions=[count(1),max(c#236L)], output=[a#234L,b#235L,count(c)#247L,max(c)#248L]) ``` After: ``` *HashAggregate(keys=[a#234L, b#235L], functions=[count(1), max(c#236L)], output=[a#234L, b#235L, count(c)#247L, max(c)#248L]) ``` ## How was this patch tested? Manually tested. Author: Cheng Lian Closes #13710 from liancheng/minor-agg-string-fix. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7a89f2ad Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7a89f2ad Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7a89f2ad Branch: refs/heads/master Commit: 7a89f2adbbc82a23f06638806ffc8596a7efe7f3 Parents: acef843 Author: Cheng Lian Authored: Thu Jun 16 14:20:44 2016 -0700 Committer: Andrew Or Committed: Thu Jun 16 14:20:44 2016 -0700 -- .../spark/sql/execution/aggregate/HashAggregateExec.scala | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7a89f2ad/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala index caeeba1..54d7340 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala @@ -774,13 +774,13 @@ case class HashAggregateExec( testFallbackStartsAt match { case None => -val keyString = Utils.truncatedString(groupingExpressions, "[", ",", "]") -val functionString = Utils.truncatedString(allAggregateExpressions, "[", ",", "]") -val outputString = Utils.truncatedString(output, "[", ",", "]") +val keyString = Utils.truncatedString(groupingExpressions, "[", ", ", "]") +val functionString = Utils.truncatedString(allAggregateExpressions, "[", ", ", "]") +val outputString = Utils.truncatedString(output, "[", ", ", "]") if (verbose) { - s"HashAggregate(key=$keyString, functions=$functionString, output=$outputString)" + s"HashAggregate(keys=$keyString, functions=$functionString, output=$outputString)" } else { - s"HashAggregate(key=$keyString, functions=$functionString)" + s"HashAggregate(keys=$keyString, functions=$functionString)" } case Some(fallbackStartsAt) => s"HashAggregateWithControlledFallback $groupingExpressions " + - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-15975] Fix improper Popen retcode code handling in dev/run-tests
Repository: spark Updated Branches: refs/heads/branch-1.6 cffc0800b -> 0a8ada506 [SPARK-15975] Fix improper Popen retcode code handling in dev/run-tests In the `dev/run-tests.py` script we check a `Popen.retcode` for success using `retcode > 0`, but this is subtlety wrong because Popen's return code will be negative if the child process was terminated by a signal: https://docs.python.org/2/library/subprocess.html#subprocess.Popen.returncode In order to properly handle signals, we should change this to check `retcode != 0` instead. Author: Josh Rosen Closes #13692 from JoshRosen/dev-run-tests-return-code-handling. (cherry picked from commit acef843f67e770f0a2709fb3fbd1a53c200b2bc5) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0a8ada50 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0a8ada50 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0a8ada50 Branch: refs/heads/branch-1.6 Commit: 0a8ada5064bec22116363f93ed476352776b49e4 Parents: cffc080 Author: Josh Rosen Authored: Thu Jun 16 14:18:58 2016 -0700 Committer: Andrew Or Committed: Thu Jun 16 14:19:19 2016 -0700 -- dev/run-tests.py | 2 +- dev/sparktestsupport/shellutils.py | 5 - 2 files changed, 5 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0a8ada50/dev/run-tests.py -- diff --git a/dev/run-tests.py b/dev/run-tests.py index 4a18d1a..e42e073 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -284,7 +284,7 @@ def exec_sbt(sbt_args=()): print(line, end='') retcode = sbt_proc.wait() -if retcode > 0: +if retcode != 0: exit_from_command_with_retcode(sbt_cmd, retcode) http://git-wip-us.apache.org/repos/asf/spark/blob/0a8ada50/dev/sparktestsupport/shellutils.py -- diff --git a/dev/sparktestsupport/shellutils.py b/dev/sparktestsupport/shellutils.py index d280e79..05af871 100644 --- a/dev/sparktestsupport/shellutils.py +++ b/dev/sparktestsupport/shellutils.py @@ -53,7 +53,10 @@ else: def exit_from_command_with_retcode(cmd, retcode): -print("[error] running", ' '.join(cmd), "; received return code", retcode) +if retcode < 0: +print("[error] running", ' '.join(cmd), "; process was terminated by signal", -retcode) +else: +print("[error] running", ' '.join(cmd), "; received return code", retcode) sys.exit(int(os.environ.get("CURRENT_BLOCK", 255))) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-15975] Fix improper Popen retcode code handling in dev/run-tests
Repository: spark Updated Branches: refs/heads/branch-2.0 d9dd46edd -> 938988757 [SPARK-15975] Fix improper Popen retcode code handling in dev/run-tests In the `dev/run-tests.py` script we check a `Popen.retcode` for success using `retcode > 0`, but this is subtlety wrong because Popen's return code will be negative if the child process was terminated by a signal: https://docs.python.org/2/library/subprocess.html#subprocess.Popen.returncode In order to properly handle signals, we should change this to check `retcode != 0` instead. Author: Josh Rosen Closes #13692 from JoshRosen/dev-run-tests-return-code-handling. (cherry picked from commit acef843f67e770f0a2709fb3fbd1a53c200b2bc5) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/93898875 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/93898875 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/93898875 Branch: refs/heads/branch-2.0 Commit: 9389887571705e03d18e695301f0cb0aa5bd9e21 Parents: d9dd46e Author: Josh Rosen Authored: Thu Jun 16 14:18:58 2016 -0700 Committer: Andrew Or Committed: Thu Jun 16 14:19:08 2016 -0700 -- dev/run-tests.py | 2 +- dev/sparktestsupport/shellutils.py | 5 - 2 files changed, 5 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/93898875/dev/run-tests.py -- diff --git a/dev/run-tests.py b/dev/run-tests.py index dcf1be9..930d7f8 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -294,7 +294,7 @@ def exec_sbt(sbt_args=()): print(line, end='') retcode = sbt_proc.wait() -if retcode > 0: +if retcode != 0: exit_from_command_with_retcode(sbt_cmd, retcode) http://git-wip-us.apache.org/repos/asf/spark/blob/93898875/dev/sparktestsupport/shellutils.py -- diff --git a/dev/sparktestsupport/shellutils.py b/dev/sparktestsupport/shellutils.py index d280e79..05af871 100644 --- a/dev/sparktestsupport/shellutils.py +++ b/dev/sparktestsupport/shellutils.py @@ -53,7 +53,10 @@ else: def exit_from_command_with_retcode(cmd, retcode): -print("[error] running", ' '.join(cmd), "; received return code", retcode) +if retcode < 0: +print("[error] running", ' '.join(cmd), "; process was terminated by signal", -retcode) +else: +print("[error] running", ' '.join(cmd), "; received return code", retcode) sys.exit(int(os.environ.get("CURRENT_BLOCK", 255))) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-15975] Fix improper Popen retcode code handling in dev/run-tests
Repository: spark Updated Branches: refs/heads/branch-1.5 6043fa8df -> 1891e04a6 [SPARK-15975] Fix improper Popen retcode code handling in dev/run-tests In the `dev/run-tests.py` script we check a `Popen.retcode` for success using `retcode > 0`, but this is subtlety wrong because Popen's return code will be negative if the child process was terminated by a signal: https://docs.python.org/2/library/subprocess.html#subprocess.Popen.returncode In order to properly handle signals, we should change this to check `retcode != 0` instead. Author: Josh Rosen Closes #13692 from JoshRosen/dev-run-tests-return-code-handling. (cherry picked from commit acef843f67e770f0a2709fb3fbd1a53c200b2bc5) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1891e04a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1891e04a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1891e04a Branch: refs/heads/branch-1.5 Commit: 1891e04a6441606f9bb14cf39f06a7d39cce456b Parents: 6043fa8 Author: Josh Rosen Authored: Thu Jun 16 14:18:58 2016 -0700 Committer: Andrew Or Committed: Thu Jun 16 14:19:32 2016 -0700 -- dev/run-tests.py | 2 +- dev/sparktestsupport/shellutils.py | 5 - 2 files changed, 5 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1891e04a/dev/run-tests.py -- diff --git a/dev/run-tests.py b/dev/run-tests.py index 623b93c..bc54968 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -255,7 +255,7 @@ def exec_sbt(sbt_args=()): print(line, end='') retcode = sbt_proc.wait() -if retcode > 0: +if retcode != 0: exit_from_command_with_retcode(sbt_cmd, retcode) http://git-wip-us.apache.org/repos/asf/spark/blob/1891e04a/dev/sparktestsupport/shellutils.py -- diff --git a/dev/sparktestsupport/shellutils.py b/dev/sparktestsupport/shellutils.py index 12bd0bf..af483a9 100644 --- a/dev/sparktestsupport/shellutils.py +++ b/dev/sparktestsupport/shellutils.py @@ -23,7 +23,10 @@ import sys def exit_from_command_with_retcode(cmd, retcode): -print("[error] running", ' '.join(cmd), "; received return code", retcode) +if retcode < 0: +print("[error] running", ' '.join(cmd), "; process was terminated by signal", -retcode) +else: +print("[error] running", ' '.join(cmd), "; received return code", retcode) sys.exit(int(os.environ.get("CURRENT_BLOCK", 255))) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-15975] Fix improper Popen retcode code handling in dev/run-tests
Repository: spark Updated Branches: refs/heads/master bbad4cb48 -> acef843f6 [SPARK-15975] Fix improper Popen retcode code handling in dev/run-tests In the `dev/run-tests.py` script we check a `Popen.retcode` for success using `retcode > 0`, but this is subtlety wrong because Popen's return code will be negative if the child process was terminated by a signal: https://docs.python.org/2/library/subprocess.html#subprocess.Popen.returncode In order to properly handle signals, we should change this to check `retcode != 0` instead. Author: Josh Rosen Closes #13692 from JoshRosen/dev-run-tests-return-code-handling. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/acef843f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/acef843f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/acef843f Branch: refs/heads/master Commit: acef843f67e770f0a2709fb3fbd1a53c200b2bc5 Parents: bbad4cb Author: Josh Rosen Authored: Thu Jun 16 14:18:58 2016 -0700 Committer: Andrew Or Committed: Thu Jun 16 14:18:58 2016 -0700 -- dev/run-tests.py | 2 +- dev/sparktestsupport/shellutils.py | 5 - 2 files changed, 5 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/acef843f/dev/run-tests.py -- diff --git a/dev/run-tests.py b/dev/run-tests.py index dcf1be9..930d7f8 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -294,7 +294,7 @@ def exec_sbt(sbt_args=()): print(line, end='') retcode = sbt_proc.wait() -if retcode > 0: +if retcode != 0: exit_from_command_with_retcode(sbt_cmd, retcode) http://git-wip-us.apache.org/repos/asf/spark/blob/acef843f/dev/sparktestsupport/shellutils.py -- diff --git a/dev/sparktestsupport/shellutils.py b/dev/sparktestsupport/shellutils.py index d280e79..05af871 100644 --- a/dev/sparktestsupport/shellutils.py +++ b/dev/sparktestsupport/shellutils.py @@ -53,7 +53,10 @@ else: def exit_from_command_with_retcode(cmd, retcode): -print("[error] running", ' '.join(cmd), "; received return code", retcode) +if retcode < 0: +print("[error] running", ' '.join(cmd), "; process was terminated by signal", -retcode) +else: +print("[error] running", ' '.join(cmd), "; received return code", retcode) sys.exit(int(os.environ.get("CURRENT_BLOCK", 255))) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
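As a quick, self-contained illustration of why the check had to change (my own sketch, not part of the patch; POSIX-only behaviour), a child process killed by a signal reports a negative `returncode`, which `retcode > 0` silently treats as success:

```python
# Illustrative sketch (POSIX only): a child terminated by a signal yields a
# negative return code, so `retcode > 0` misses the failure while `retcode != 0`
# catches it.
import signal
import subprocess
import sys

child = subprocess.Popen(
    [sys.executable, "-c", "import os, signal; os.kill(os.getpid(), signal.SIGKILL)"])
retcode = child.wait()

print(retcode)                      # -9: the negated signal number (SIGKILL)
assert retcode == -signal.SIGKILL   # holds on Linux/macOS
assert not (retcode > 0)            # the old check would not flag this failure
assert retcode != 0                 # the corrected check does
```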
spark git commit: [SPARK-15978][SQL] improve 'show tables' command related codes
Repository: spark Updated Branches: refs/heads/branch-2.0 095ddb4c9 -> d9dd46edd [SPARK-15978][SQL] improve 'show tables' command related codes ## What changes were proposed in this pull request? I've found some minor issues in "show tables" command: 1. In the `SessionCatalog.scala`, `listTables(db: String)` method will call `listTables(formatDatabaseName(db), "*")` to list all the tables for certain db, but in the method `listTables(db: String, pattern: String)`, this db name is formatted once more. So I think we should remove `formatDatabaseName()` in the caller. 2. I suggest to add sort to listTables(db: String) in InMemoryCatalog.scala, just like listDatabases(). ## How was this patch tested? The existing test cases should cover it. Author: bomeng Closes #13695 from bomeng/SPARK-15978. (cherry picked from commit bbad4cb48df2ac3ed7edb4c02db79540bd4085d8) Signed-off-by: Andrew Or Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d9dd46ed Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d9dd46ed Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d9dd46ed Branch: refs/heads/branch-2.0 Commit: d9dd46edd3635ed79134a1521403c4478a34d3b3 Parents: 095ddb4 Author: bomeng Authored: Thu Jun 16 14:18:02 2016 -0700 Committer: Andrew Or Committed: Thu Jun 16 14:18:12 2016 -0700 -- .../org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala| 2 +- .../org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d9dd46ed/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala index 14da30a..fb3e1b3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala @@ -286,7 +286,7 @@ class InMemoryCatalog(hadoopConfig: Configuration = new Configuration) extends E override def listTables(db: String): Seq[String] = synchronized { requireDbExists(db) -catalog(db).tables.keySet.toSeq +catalog(db).tables.keySet.toSeq.sorted } override def listTables(db: String, pattern: String): Seq[String] = synchronized { http://git-wip-us.apache.org/repos/asf/spark/blob/d9dd46ed/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala index 1ec1bb1..7ab10d1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala @@ -445,7 +445,7 @@ class SessionCatalog( /** * List all tables in the specified database, including temporary tables. */ - def listTables(db: String): Seq[TableIdentifier] = listTables(formatDatabaseName(db), "*") + def listTables(db: String): Seq[TableIdentifier] = listTables(db, "*") /** * List all matching tables in the specified database, including temporary tables. - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-15978][SQL] improve 'show tables' command related codes
Repository: spark Updated Branches: refs/heads/master 457126e42 -> bbad4cb48 [SPARK-15978][SQL] improve 'show tables' command related codes ## What changes were proposed in this pull request? I've found some minor issues in "show tables" command: 1. In the `SessionCatalog.scala`, `listTables(db: String)` method will call `listTables(formatDatabaseName(db), "*")` to list all the tables for certain db, but in the method `listTables(db: String, pattern: String)`, this db name is formatted once more. So I think we should remove `formatDatabaseName()` in the caller. 2. I suggest to add sort to listTables(db: String) in InMemoryCatalog.scala, just like listDatabases(). ## How was this patch tested? The existing test cases should cover it. Author: bomeng Closes #13695 from bomeng/SPARK-15978. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bbad4cb4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bbad4cb4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bbad4cb4 Branch: refs/heads/master Commit: bbad4cb48df2ac3ed7edb4c02db79540bd4085d8 Parents: 457126e Author: bomeng Authored: Thu Jun 16 14:18:02 2016 -0700 Committer: Andrew Or Committed: Thu Jun 16 14:18:02 2016 -0700 -- .../org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala| 2 +- .../org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/bbad4cb4/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala index 14da30a..fb3e1b3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala @@ -286,7 +286,7 @@ class InMemoryCatalog(hadoopConfig: Configuration = new Configuration) extends E override def listTables(db: String): Seq[String] = synchronized { requireDbExists(db) -catalog(db).tables.keySet.toSeq +catalog(db).tables.keySet.toSeq.sorted } override def listTables(db: String, pattern: String): Seq[String] = synchronized { http://git-wip-us.apache.org/repos/asf/spark/blob/bbad4cb4/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala index 1ec1bb1..7ab10d1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala @@ -445,7 +445,7 @@ class SessionCatalog( /** * List all tables in the specified database, including temporary tables. */ - def listTables(db: String): Seq[TableIdentifier] = listTables(formatDatabaseName(db), "*") + def listTables(db: String): Seq[TableIdentifier] = listTables(db, "*") /** * List all matching tables in the specified database, including temporary tables. - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-15796][CORE] Reduce spark.memory.fraction default to avoid overrunning old gen in JVM default config
Repository: spark Updated Branches: refs/heads/branch-2.0 579268426 -> 095ddb4c9 [SPARK-15796][CORE] Reduce spark.memory.fraction default to avoid overrunning old gen in JVM default config ## What changes were proposed in this pull request? Reduce `spark.memory.fraction` default to 0.6 in order to make it fit within default JVM old generation size (2/3 heap). See JIRA discussion. This means a full cache doesn't spill into the new gen. CC andrewor14 ## How was this patch tested? Jenkins tests. Author: Sean Owen Closes #13618 from srowen/SPARK-15796. (cherry picked from commit 457126e420e66228cc68def4bc3d87e7a282069a) Signed-off-by: Sean Owen Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/095ddb4c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/095ddb4c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/095ddb4c Branch: refs/heads/branch-2.0 Commit: 095ddb4c9e7ab9193c15c69eb057a9bb2dbdaed1 Parents: 5792684 Author: Sean Owen Authored: Thu Jun 16 23:04:10 2016 +0200 Committer: Sean Owen Committed: Thu Jun 16 23:04:19 2016 +0200 -- .../spark/memory/UnifiedMemoryManager.scala | 8 .../scala/org/apache/spark/DistributedSuite.scala | 2 +- docs/configuration.md | 7 --- docs/tuning.md| 18 +- 4 files changed, 26 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/095ddb4c/core/src/main/scala/org/apache/spark/memory/UnifiedMemoryManager.scala -- diff --git a/core/src/main/scala/org/apache/spark/memory/UnifiedMemoryManager.scala b/core/src/main/scala/org/apache/spark/memory/UnifiedMemoryManager.scala index ae747c1..c7b36be 100644 --- a/core/src/main/scala/org/apache/spark/memory/UnifiedMemoryManager.scala +++ b/core/src/main/scala/org/apache/spark/memory/UnifiedMemoryManager.scala @@ -25,9 +25,9 @@ import org.apache.spark.storage.BlockId * either side can borrow memory from the other. * * The region shared between execution and storage is a fraction of (the total heap space - 300MB) - * configurable through `spark.memory.fraction` (default 0.75). The position of the boundary + * configurable through `spark.memory.fraction` (default 0.6). The position of the boundary * within this space is further determined by `spark.memory.storageFraction` (default 0.5). - * This means the size of the storage region is 0.75 * 0.5 = 0.375 of the heap space by default. + * This means the size of the storage region is 0.6 * 0.5 = 0.3 of the heap space by default. * * Storage can borrow as much execution memory as is free until execution reclaims its space. * When this happens, cached blocks will be evicted from memory until sufficient borrowed @@ -187,7 +187,7 @@ object UnifiedMemoryManager { // Set aside a fixed amount of memory for non-storage, non-execution purposes. // This serves a function similar to `spark.memory.fraction`, but guarantees that we reserve // sufficient memory for the system even for small heaps. E.g. if we have a 1GB JVM, then - // the memory used for execution and storage will be (1024 - 300) * 0.75 = 543MB by default. + // the memory used for execution and storage will be (1024 - 300) * 0.6 = 434MB by default. 
private val RESERVED_SYSTEM_MEMORY_BYTES = 300 * 1024 * 1024 def apply(conf: SparkConf, numCores: Int): UnifiedMemoryManager = { @@ -223,7 +223,7 @@ object UnifiedMemoryManager { } } val usableMemory = systemMemory - reservedMemory -val memoryFraction = conf.getDouble("spark.memory.fraction", 0.75) +val memoryFraction = conf.getDouble("spark.memory.fraction", 0.6) (usableMemory * memoryFraction).toLong } } http://git-wip-us.apache.org/repos/asf/spark/blob/095ddb4c/core/src/test/scala/org/apache/spark/DistributedSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/DistributedSuite.scala b/core/src/test/scala/org/apache/spark/DistributedSuite.scala index 6e69fc4..0515e6e 100644 --- a/core/src/test/scala/org/apache/spark/DistributedSuite.scala +++ b/core/src/test/scala/org/apache/spark/DistributedSuite.scala @@ -223,7 +223,7 @@ class DistributedSuite extends SparkFunSuite with Matchers with LocalSparkContex test("compute when only some partitions fit in memory") { val size = 1 -val numPartitions = 10 +val numPartitions = 20 val conf = new SparkConf() .set("spark.storage.unrollMemoryThreshold", "1024") .set("spark.testing.memory", size.toString) http://git-wip-us.apache.org/repos/asf/spark/blob/095ddb4c/docs/configuration.md --
spark git commit: [SPARK-15796][CORE] Reduce spark.memory.fraction default to avoid overrunning old gen in JVM default config
Repository: spark Updated Branches: refs/heads/master 36110a830 -> 457126e42 [SPARK-15796][CORE] Reduce spark.memory.fraction default to avoid overrunning old gen in JVM default config ## What changes were proposed in this pull request? Reduce `spark.memory.fraction` default to 0.6 in order to make it fit within default JVM old generation size (2/3 heap). See JIRA discussion. This means a full cache doesn't spill into the new gen. CC andrewor14 ## How was this patch tested? Jenkins tests. Author: Sean Owen Closes #13618 from srowen/SPARK-15796. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/457126e4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/457126e4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/457126e4 Branch: refs/heads/master Commit: 457126e420e66228cc68def4bc3d87e7a282069a Parents: 36110a8 Author: Sean Owen Authored: Thu Jun 16 23:04:10 2016 +0200 Committer: Sean Owen Committed: Thu Jun 16 23:04:10 2016 +0200 -- .../spark/memory/UnifiedMemoryManager.scala | 8 .../scala/org/apache/spark/DistributedSuite.scala | 2 +- docs/configuration.md | 7 --- docs/tuning.md| 18 +- 4 files changed, 26 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/457126e4/core/src/main/scala/org/apache/spark/memory/UnifiedMemoryManager.scala -- diff --git a/core/src/main/scala/org/apache/spark/memory/UnifiedMemoryManager.scala b/core/src/main/scala/org/apache/spark/memory/UnifiedMemoryManager.scala index ae747c1..c7b36be 100644 --- a/core/src/main/scala/org/apache/spark/memory/UnifiedMemoryManager.scala +++ b/core/src/main/scala/org/apache/spark/memory/UnifiedMemoryManager.scala @@ -25,9 +25,9 @@ import org.apache.spark.storage.BlockId * either side can borrow memory from the other. * * The region shared between execution and storage is a fraction of (the total heap space - 300MB) - * configurable through `spark.memory.fraction` (default 0.75). The position of the boundary + * configurable through `spark.memory.fraction` (default 0.6). The position of the boundary * within this space is further determined by `spark.memory.storageFraction` (default 0.5). - * This means the size of the storage region is 0.75 * 0.5 = 0.375 of the heap space by default. + * This means the size of the storage region is 0.6 * 0.5 = 0.3 of the heap space by default. * * Storage can borrow as much execution memory as is free until execution reclaims its space. * When this happens, cached blocks will be evicted from memory until sufficient borrowed @@ -187,7 +187,7 @@ object UnifiedMemoryManager { // Set aside a fixed amount of memory for non-storage, non-execution purposes. // This serves a function similar to `spark.memory.fraction`, but guarantees that we reserve // sufficient memory for the system even for small heaps. E.g. if we have a 1GB JVM, then - // the memory used for execution and storage will be (1024 - 300) * 0.75 = 543MB by default. + // the memory used for execution and storage will be (1024 - 300) * 0.6 = 434MB by default. 
private val RESERVED_SYSTEM_MEMORY_BYTES = 300 * 1024 * 1024 def apply(conf: SparkConf, numCores: Int): UnifiedMemoryManager = { @@ -223,7 +223,7 @@ object UnifiedMemoryManager { } } val usableMemory = systemMemory - reservedMemory -val memoryFraction = conf.getDouble("spark.memory.fraction", 0.75) +val memoryFraction = conf.getDouble("spark.memory.fraction", 0.6) (usableMemory * memoryFraction).toLong } } http://git-wip-us.apache.org/repos/asf/spark/blob/457126e4/core/src/test/scala/org/apache/spark/DistributedSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/DistributedSuite.scala b/core/src/test/scala/org/apache/spark/DistributedSuite.scala index 6e69fc4..0515e6e 100644 --- a/core/src/test/scala/org/apache/spark/DistributedSuite.scala +++ b/core/src/test/scala/org/apache/spark/DistributedSuite.scala @@ -223,7 +223,7 @@ class DistributedSuite extends SparkFunSuite with Matchers with LocalSparkContex test("compute when only some partitions fit in memory") { val size = 1 -val numPartitions = 10 +val numPartitions = 20 val conf = new SparkConf() .set("spark.storage.unrollMemoryThreshold", "1024") .set("spark.testing.memory", size.toString) http://git-wip-us.apache.org/repos/asf/spark/blob/457126e4/docs/configuration.md -- diff --git a/docs/configuration.md b/docs/configuration.md index 32c3a92..f
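To make the arithmetic in the updated comment concrete, here is a small illustrative Python calculation of the unified memory region for a 1 GB heap (my own sketch, not from the patch), together with an assumed way a user could restore the previous fraction explicitly:

```python
# Illustrative arithmetic for the UnifiedMemoryManager comment above (1 GB heap).
RESERVED_SYSTEM_MEMORY_MB = 300
heap_mb = 1024
usable_mb = heap_mb - RESERVED_SYSTEM_MEMORY_MB   # 724 MB left after the reservation

print(usable_mb * 0.75)   # 543.0 -> the "543MB" of the old 0.75 default
print(usable_mb * 0.6)    # 434.4 -> the "434MB" of the new 0.6 default

# The new default keeps the fraction (0.6) at or below the JVM's default
# old-generation share of the heap (roughly 2/3), per the JIRA discussion.

# Assumed usage pattern (not required by the change): users who prefer the old
# sizing can still set the fraction themselves.
from pyspark import SparkConf
conf = SparkConf().set("spark.memory.fraction", "0.75")
```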
spark git commit: [SPARK-15922][MLLIB] `toIndexedRowMatrix` should consider the case `cols < offset+colsPerBlock`
Repository: spark Updated Branches: refs/heads/branch-2.0 5b003c9bc -> 579268426 [SPARK-15922][MLLIB] `toIndexedRowMatrix` should consider the case `cols < offset+colsPerBlock` ## What changes were proposed in this pull request? SPARK-15922 reports the following scenario throwing an exception due to the mismatched vector sizes. This PR handles the exceptional case, `cols < (offset + colsPerBlock)`. **Before** ```scala scala> import org.apache.spark.mllib.linalg.distributed._ scala> import org.apache.spark.mllib.linalg._ scala> val rows = IndexedRow(0L, new DenseVector(Array(1,2,3))) :: IndexedRow(1L, new DenseVector(Array(1,2,3))):: IndexedRow(2L, new DenseVector(Array(1,2,3))):: Nil scala> val rdd = sc.parallelize(rows) scala> val matrix = new IndexedRowMatrix(rdd, 3, 3) scala> val bmat = matrix.toBlockMatrix scala> val imat = bmat.toIndexedRowMatrix scala> imat.rows.collect ... // java.lang.IllegalArgumentException: requirement failed: Vectors must be the same length! ``` **After** ```scala ... scala> imat.rows.collect res0: Array[org.apache.spark.mllib.linalg.distributed.IndexedRow] = Array(IndexedRow(0,[1.0,2.0,3.0]), IndexedRow(1,[1.0,2.0,3.0]), IndexedRow(2,[1.0,2.0,3.0])) ``` ## How was this patch tested? Pass the Jenkins tests (including the above case) Author: Dongjoon Hyun Closes #13643 from dongjoon-hyun/SPARK-15922. (cherry picked from commit 36110a8306608186696c536028d2776e022d305a) Signed-off-by: Sean Owen Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/57926842 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/57926842 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/57926842 Branch: refs/heads/branch-2.0 Commit: 5792684268b273562e694855eb671c21c4044280 Parents: 5b003c9 Author: Dongjoon Hyun Authored: Thu Jun 16 23:02:46 2016 +0200 Committer: Sean Owen Committed: Thu Jun 16 23:03:00 2016 +0200 -- .../org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala | 2 +- .../spark/mllib/linalg/distributed/BlockMatrixSuite.scala | 5 + 2 files changed, 6 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/57926842/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala index 7a24617..639295c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala @@ -288,7 +288,7 @@ class BlockMatrix @Since("1.3.0") ( vectors.foreach { case (blockColIdx: Int, vec: BV[Double]) => val offset = colsPerBlock * blockColIdx -wholeVector(offset until offset + colsPerBlock) := vec +wholeVector(offset until Math.min(cols, offset + colsPerBlock)) := vec } new IndexedRow(rowIdx, Vectors.fromBreeze(wholeVector)) } http://git-wip-us.apache.org/repos/asf/spark/blob/57926842/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala index e5a2cbb..61266f3 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala @@ -135,6 +135,11 @@ 
class BlockMatrixSuite extends SparkFunSuite with MLlibTestSparkContext { assert(rowMat.numCols() === n) assert(rowMat.toBreeze() === gridBasedMat.toBreeze()) +// SPARK-15922: BlockMatrix to IndexedRowMatrix throws an error" +val bmat = rowMat.toBlockMatrix +val imat = bmat.toIndexedRowMatrix +imat.rows.collect + val rows = 1 val cols = 10 - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-15922][MLLIB] `toIndexedRowMatrix` should consider the case `cols < offset+colsPerBlock`
Repository: spark Updated Branches: refs/heads/master f9bf15d9b -> 36110a830 [SPARK-15922][MLLIB] `toIndexedRowMatrix` should consider the case `cols < offset+colsPerBlock` ## What changes were proposed in this pull request? SPARK-15922 reports the following scenario throwing an exception due to the mismatched vector sizes. This PR handles the exceptional case, `cols < (offset + colsPerBlock)`. **Before** ```scala scala> import org.apache.spark.mllib.linalg.distributed._ scala> import org.apache.spark.mllib.linalg._ scala> val rows = IndexedRow(0L, new DenseVector(Array(1,2,3))) :: IndexedRow(1L, new DenseVector(Array(1,2,3))):: IndexedRow(2L, new DenseVector(Array(1,2,3))):: Nil scala> val rdd = sc.parallelize(rows) scala> val matrix = new IndexedRowMatrix(rdd, 3, 3) scala> val bmat = matrix.toBlockMatrix scala> val imat = bmat.toIndexedRowMatrix scala> imat.rows.collect ... // java.lang.IllegalArgumentException: requirement failed: Vectors must be the same length! ``` **After** ```scala ... scala> imat.rows.collect res0: Array[org.apache.spark.mllib.linalg.distributed.IndexedRow] = Array(IndexedRow(0,[1.0,2.0,3.0]), IndexedRow(1,[1.0,2.0,3.0]), IndexedRow(2,[1.0,2.0,3.0])) ``` ## How was this patch tested? Pass the Jenkins tests (including the above case) Author: Dongjoon Hyun Closes #13643 from dongjoon-hyun/SPARK-15922. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/36110a83 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/36110a83 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/36110a83 Branch: refs/heads/master Commit: 36110a8306608186696c536028d2776e022d305a Parents: f9bf15d Author: Dongjoon Hyun Authored: Thu Jun 16 23:02:46 2016 +0200 Committer: Sean Owen Committed: Thu Jun 16 23:02:46 2016 +0200 -- .../org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala | 2 +- .../spark/mllib/linalg/distributed/BlockMatrixSuite.scala | 5 + 2 files changed, 6 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/36110a83/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala index 7a24617..639295c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala @@ -288,7 +288,7 @@ class BlockMatrix @Since("1.3.0") ( vectors.foreach { case (blockColIdx: Int, vec: BV[Double]) => val offset = colsPerBlock * blockColIdx -wholeVector(offset until offset + colsPerBlock) := vec +wholeVector(offset until Math.min(cols, offset + colsPerBlock)) := vec } new IndexedRow(rowIdx, Vectors.fromBreeze(wholeVector)) } http://git-wip-us.apache.org/repos/asf/spark/blob/36110a83/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala index e5a2cbb..61266f3 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala @@ -135,6 +135,11 @@ class BlockMatrixSuite extends SparkFunSuite with MLlibTestSparkContext { assert(rowMat.numCols() === 
n) assert(rowMat.toBreeze() === gridBasedMat.toBreeze()) +// SPARK-15922: BlockMatrix to IndexedRowMatrix throws an error" +val bmat = rowMat.toBlockMatrix +val imat = bmat.toIndexedRowMatrix +imat.rows.collect + val rows = 1 val cols = 10 - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-15977][SQL] Fix TRUNCATE TABLE for Spark specific datasource tables
Repository: spark Updated Branches: refs/heads/branch-2.0 e11c27918 -> 5b003c9bc [SPARK-15977][SQL] Fix TRUNCATE TABLE for Spark specific datasource tables ## What changes were proposed in this pull request? `TRUNCATE TABLE` is currently broken for Spark specific datasource tables (json, csv, ...). This PR correctly sets the location for these datasources which allows them to be truncated. ## How was this patch tested? Extended the datasources `TRUNCATE TABLE` tests in `DDLSuite`. Author: Herman van Hovell Closes #13697 from hvanhovell/SPARK-15977. (cherry picked from commit f9bf15d9bde4df2178f7a8f932c883bb77c46149) Signed-off-by: Herman van Hovell Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5b003c9b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5b003c9b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5b003c9b Branch: refs/heads/branch-2.0 Commit: 5b003c9bcf43709408ed8f68d17b249675f50fbc Parents: e11c279 Author: Herman van Hovell Authored: Thu Jun 16 13:47:36 2016 -0700 Committer: Herman van Hovell Committed: Thu Jun 16 13:47:55 2016 -0700 -- .../spark/sql/execution/command/tables.scala| 4 ++- .../spark/sql/execution/command/DDLSuite.scala | 28 +--- 2 files changed, 21 insertions(+), 11 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5b003c9b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala index 58bb5cd..3eb93a2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala @@ -348,7 +348,9 @@ case class TruncateTableCommand( s"for tables that are not partitioned: '$tableName'") } val locations = - if (isDatasourceTable || table.partitionColumnNames.isEmpty) { + if (isDatasourceTable) { +Seq(table.storage.serdeProperties.get("path")) + } else if (table.partitionColumnNames.isEmpty) { Seq(table.storage.locationUri) } else { catalog.listPartitions(tableName, partitionSpec).map(_.storage.locationUri) http://git-wip-us.apache.org/repos/asf/spark/blob/5b003c9b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala index e15fcf4..7eb2fff 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala @@ -1280,17 +1280,25 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { test("truncate table - datasource table") { import testImplicits._ val data = (1 to 10).map { i => (i, i) }.toDF("width", "length") -data.write.saveAsTable("rectangles") -spark.catalog.cacheTable("rectangles") -assume(spark.table("rectangles").collect().nonEmpty, "bad test; table was empty to begin with") -assume(spark.catalog.isCached("rectangles"), "bad test; table was not cached to begin with") -sql("TRUNCATE TABLE rectangles") -assert(spark.table("rectangles").collect().isEmpty) -assert(!spark.catalog.isCached("rectangles")) + +// Test both a Hive compatible and incompatible code path. 
+Seq("json", "parquet").foreach { format => + withTable("rectangles") { +data.write.format(format).saveAsTable("rectangles") +assume(spark.table("rectangles").collect().nonEmpty, + "bad test; table was empty to begin with") +sql("TRUNCATE TABLE rectangles") +assert(spark.table("rectangles").collect().isEmpty) + } +} + // truncating partitioned data source tables is not supported -data.write.partitionBy("length").saveAsTable("rectangles2") -assertUnsupported("TRUNCATE TABLE rectangles PARTITION (width=1)") -assertUnsupported("TRUNCATE TABLE rectangles2 PARTITION (width=1)") +withTable("rectangles", "rectangles2") { + data.write.saveAsTable("rectangles") + data.write.partitionBy("length").saveAsTable("rectangles2") + assertUnsupported("TRUNCATE TABLE rectangles PARTITION (width=1)") + assertUnsupported("TRUNCATE TABLE rectangles2 PARTITION (width=1)") +} } test("truncate table - external table, temporary table, view (not allowed)") { --
spark git commit: [SPARK-15977][SQL] Fix TRUNCATE TABLE for Spark specific datasource tables
Repository: spark Updated Branches: refs/heads/master 084dca770 -> f9bf15d9b [SPARK-15977][SQL] Fix TRUNCATE TABLE for Spark specific datasource tables ## What changes were proposed in this pull request? `TRUNCATE TABLE` is currently broken for Spark specific datasource tables (json, csv, ...). This PR correctly sets the location for these datasources which allows them to be truncated. ## How was this patch tested? Extended the datasources `TRUNCATE TABLE` tests in `DDLSuite`. Author: Herman van Hovell Closes #13697 from hvanhovell/SPARK-15977. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f9bf15d9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f9bf15d9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f9bf15d9 Branch: refs/heads/master Commit: f9bf15d9bde4df2178f7a8f932c883bb77c46149 Parents: 084dca7 Author: Herman van Hovell Authored: Thu Jun 16 13:47:36 2016 -0700 Committer: Herman van Hovell Committed: Thu Jun 16 13:47:36 2016 -0700 -- .../spark/sql/execution/command/tables.scala| 4 ++- .../spark/sql/execution/command/DDLSuite.scala | 28 +--- 2 files changed, 21 insertions(+), 11 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f9bf15d9/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala index 58bb5cd..3eb93a2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala @@ -348,7 +348,9 @@ case class TruncateTableCommand( s"for tables that are not partitioned: '$tableName'") } val locations = - if (isDatasourceTable || table.partitionColumnNames.isEmpty) { + if (isDatasourceTable) { +Seq(table.storage.serdeProperties.get("path")) + } else if (table.partitionColumnNames.isEmpty) { Seq(table.storage.locationUri) } else { catalog.listPartitions(tableName, partitionSpec).map(_.storage.locationUri) http://git-wip-us.apache.org/repos/asf/spark/blob/f9bf15d9/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala index e15fcf4..7eb2fff 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala @@ -1280,17 +1280,25 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { test("truncate table - datasource table") { import testImplicits._ val data = (1 to 10).map { i => (i, i) }.toDF("width", "length") -data.write.saveAsTable("rectangles") -spark.catalog.cacheTable("rectangles") -assume(spark.table("rectangles").collect().nonEmpty, "bad test; table was empty to begin with") -assume(spark.catalog.isCached("rectangles"), "bad test; table was not cached to begin with") -sql("TRUNCATE TABLE rectangles") -assert(spark.table("rectangles").collect().isEmpty) -assert(!spark.catalog.isCached("rectangles")) + +// Test both a Hive compatible and incompatible code path. 
+Seq("json", "parquet").foreach { format => + withTable("rectangles") { +data.write.format(format).saveAsTable("rectangles") +assume(spark.table("rectangles").collect().nonEmpty, + "bad test; table was empty to begin with") +sql("TRUNCATE TABLE rectangles") +assert(spark.table("rectangles").collect().isEmpty) + } +} + // truncating partitioned data source tables is not supported -data.write.partitionBy("length").saveAsTable("rectangles2") -assertUnsupported("TRUNCATE TABLE rectangles PARTITION (width=1)") -assertUnsupported("TRUNCATE TABLE rectangles2 PARTITION (width=1)") +withTable("rectangles", "rectangles2") { + data.write.saveAsTable("rectangles") + data.write.partitionBy("length").saveAsTable("rectangles2") + assertUnsupported("TRUNCATE TABLE rectangles PARTITION (width=1)") + assertUnsupported("TRUNCATE TABLE rectangles2 PARTITION (width=1)") +} } test("truncate table - external table, temporary table, view (not allowed)") { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional comm
spark git commit: [SPARK-15981][SQL][STREAMING] Fixed bug and added tests in DataStreamReader Python API
Repository: spark Updated Branches: refs/heads/branch-2.0 0a2291cd1 -> e11c27918 [SPARK-15981][SQL][STREAMING] Fixed bug and added tests in DataStreamReader Python API ## What changes were proposed in this pull request? - Fixed bug in Python API of DataStreamReader. Because a single path was being converted to a array before calling Java DataStreamReader method (which takes a string only), it gave the following error. ``` File "/Users/tdas/Projects/Spark/spark/python/pyspark/sql/readwriter.py", line 947, in pyspark.sql.readwriter.DataStreamReader.json Failed example: json_sdf = spark.readStream.json(os.path.join(tempfile.mkdtemp(), 'data'), schema = sdf_schema) Exception raised: Traceback (most recent call last): File "/System/Library/Frameworks/Python.framework/Versions/2.6/lib/python2.6/doctest.py", line 1253, in __run compileflags, 1) in test.globs File "", line 1, in json_sdf = spark.readStream.json(os.path.join(tempfile.mkdtemp(), 'data'), schema = sdf_schema) File "/Users/tdas/Projects/Spark/spark/python/pyspark/sql/readwriter.py", line 963, in json return self._df(self._jreader.json(path)) File "/Users/tdas/Projects/Spark/spark/python/lib/py4j-0.10.1-src.zip/py4j/java_gateway.py", line 933, in __call__ answer, self.gateway_client, self.target_id, self.name) File "/Users/tdas/Projects/Spark/spark/python/pyspark/sql/utils.py", line 63, in deco return f(*a, **kw) File "/Users/tdas/Projects/Spark/spark/python/lib/py4j-0.10.1-src.zip/py4j/protocol.py", line 316, in get_return_value format(target_id, ".", name, value)) Py4JError: An error occurred while calling o121.json. Trace: py4j.Py4JException: Method json([class java.util.ArrayList]) does not exist at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:318) at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:326) at py4j.Gateway.invoke(Gateway.java:272) at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:128) at py4j.commands.CallCommand.execute(CallCommand.java:79) at py4j.GatewayConnection.run(GatewayConnection.java:211) at java.lang.Thread.run(Thread.java:744) ``` - Reduced code duplication between DataStreamReader and DataFrameWriter - Added missing Python doctests ## How was this patch tested? New tests Author: Tathagata Das Closes #13703 from tdas/SPARK-15981. 
(cherry picked from commit 084dca770f5c26f906e7555707c7894cf05fb86b) Signed-off-by: Shixiong Zhu Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e11c2791 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e11c2791 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e11c2791 Branch: refs/heads/branch-2.0 Commit: e11c279188b34d410f6ecf17cb1773c95f24a19e Parents: 0a2291c Author: Tathagata Das Authored: Thu Jun 16 13:17:41 2016 -0700 Committer: Shixiong Zhu Committed: Thu Jun 16 13:17:50 2016 -0700 -- python/pyspark/sql/readwriter.py | 258 ++ 1 file changed, 136 insertions(+), 122 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e11c2791/python/pyspark/sql/readwriter.py -- diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index c982de6..72fd184 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -44,7 +44,82 @@ def to_str(value): return str(value) -class DataFrameReader(object): +class ReaderUtils(object): + +def _set_json_opts(self, schema, primitivesAsString, prefersDecimal, + allowComments, allowUnquotedFieldNames, allowSingleQuotes, + allowNumericLeadingZero, allowBackslashEscapingAnyCharacter, + mode, columnNameOfCorruptRecord): +""" +Set options based on the Json optional parameters +""" +if schema is not None: +self.schema(schema) +if primitivesAsString is not None: +self.option("primitivesAsString", primitivesAsString) +if prefersDecimal is not None: +self.option("prefersDecimal", prefersDecimal) +if allowComments is not None: +self.option("allowComments", allowComments) +if allowUnquotedFieldNames is not None: +self.option("allowUnquotedFieldNames", allowUnquotedFieldNames) +if allowSingleQuotes is not None: +self.option("allowSingleQuotes", allowSingleQuotes) +if allowNumericLeadingZero is not None: +self.option("allowNumericLeadingZero", allowNumericLeadingZero) +if allo
spark git commit: [SPARK-15981][SQL][STREAMING] Fixed bug and added tests in DataStreamReader Python API
Repository: spark Updated Branches: refs/heads/master a865f6e05 -> 084dca770 [SPARK-15981][SQL][STREAMING] Fixed bug and added tests in DataStreamReader Python API ## What changes were proposed in this pull request? - Fixed bug in Python API of DataStreamReader. Because a single path was being converted to a array before calling Java DataStreamReader method (which takes a string only), it gave the following error. ``` File "/Users/tdas/Projects/Spark/spark/python/pyspark/sql/readwriter.py", line 947, in pyspark.sql.readwriter.DataStreamReader.json Failed example: json_sdf = spark.readStream.json(os.path.join(tempfile.mkdtemp(), 'data'), schema = sdf_schema) Exception raised: Traceback (most recent call last): File "/System/Library/Frameworks/Python.framework/Versions/2.6/lib/python2.6/doctest.py", line 1253, in __run compileflags, 1) in test.globs File "", line 1, in json_sdf = spark.readStream.json(os.path.join(tempfile.mkdtemp(), 'data'), schema = sdf_schema) File "/Users/tdas/Projects/Spark/spark/python/pyspark/sql/readwriter.py", line 963, in json return self._df(self._jreader.json(path)) File "/Users/tdas/Projects/Spark/spark/python/lib/py4j-0.10.1-src.zip/py4j/java_gateway.py", line 933, in __call__ answer, self.gateway_client, self.target_id, self.name) File "/Users/tdas/Projects/Spark/spark/python/pyspark/sql/utils.py", line 63, in deco return f(*a, **kw) File "/Users/tdas/Projects/Spark/spark/python/lib/py4j-0.10.1-src.zip/py4j/protocol.py", line 316, in get_return_value format(target_id, ".", name, value)) Py4JError: An error occurred while calling o121.json. Trace: py4j.Py4JException: Method json([class java.util.ArrayList]) does not exist at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:318) at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:326) at py4j.Gateway.invoke(Gateway.java:272) at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:128) at py4j.commands.CallCommand.execute(CallCommand.java:79) at py4j.GatewayConnection.run(GatewayConnection.java:211) at java.lang.Thread.run(Thread.java:744) ``` - Reduced code duplication between DataStreamReader and DataFrameWriter - Added missing Python doctests ## How was this patch tested? New tests Author: Tathagata Das Closes #13703 from tdas/SPARK-15981. 
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/084dca77 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/084dca77 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/084dca77 Branch: refs/heads/master Commit: 084dca770f5c26f906e7555707c7894cf05fb86b Parents: a865f6e Author: Tathagata Das Authored: Thu Jun 16 13:17:41 2016 -0700 Committer: Shixiong Zhu Committed: Thu Jun 16 13:17:41 2016 -0700 -- python/pyspark/sql/readwriter.py | 258 ++ 1 file changed, 136 insertions(+), 122 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/084dca77/python/pyspark/sql/readwriter.py -- diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index c982de6..72fd184 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -44,7 +44,82 @@ def to_str(value): return str(value) -class DataFrameReader(object): +class ReaderUtils(object): + +def _set_json_opts(self, schema, primitivesAsString, prefersDecimal, + allowComments, allowUnquotedFieldNames, allowSingleQuotes, + allowNumericLeadingZero, allowBackslashEscapingAnyCharacter, + mode, columnNameOfCorruptRecord): +""" +Set options based on the Json optional parameters +""" +if schema is not None: +self.schema(schema) +if primitivesAsString is not None: +self.option("primitivesAsString", primitivesAsString) +if prefersDecimal is not None: +self.option("prefersDecimal", prefersDecimal) +if allowComments is not None: +self.option("allowComments", allowComments) +if allowUnquotedFieldNames is not None: +self.option("allowUnquotedFieldNames", allowUnquotedFieldNames) +if allowSingleQuotes is not None: +self.option("allowSingleQuotes", allowSingleQuotes) +if allowNumericLeadingZero is not None: +self.option("allowNumericLeadingZero", allowNumericLeadingZero) +if allowBackslashEscapingAnyCharacter is not None: +self.option("allowBackslashEscapingAnyCharacter",
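To make the fix concrete, the repaired read path can be exercised with a few lines of PySpark. This is a sketch only: it assumes Spark 2.0+, and the schema and temporary directory are placeholders standing in for real streaming input.

```python
import os
import tempfile

from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

spark = SparkSession.builder.getOrCreate()

# An explicit schema lets the stream be defined before any input files arrive.
sdf_schema = StructType([
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
])

input_dir = os.path.join(tempfile.mkdtemp(), "data")
os.makedirs(input_dir)

# Before this patch the single string path was wrapped in a Python list, so the
# JVM-side DataStreamReader.json(String) lookup failed with a Py4JError.
json_sdf = spark.readStream.json(input_dir, schema=sdf_schema)
assert json_sdf.isStreaming
json_sdf.printSchema()
```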
spark git commit: [SPARK-15996][R] Fix R examples by removing deprecated functions
Repository: spark Updated Branches: refs/heads/branch-2.0 c53eda03a -> 0a2291cd1 [SPARK-15996][R] Fix R examples by removing deprecated functions ## What changes were proposed in this pull request? Currently, R examples(`dataframe.R` and `data-manipulation.R`) fail like the following. We had better update them before releasing 2.0 RC. This PR updates them to use up-to-date APIs. ```bash $ bin/spark-submit examples/src/main/r/dataframe.R ... Warning message: 'createDataFrame(sqlContext...)' is deprecated. Use 'createDataFrame(data, schema = NULL, samplingRatio = 1.0)' instead. See help("Deprecated") ... Warning message: 'read.json(sqlContext...)' is deprecated. Use 'read.json(path)' instead. See help("Deprecated") ... Error: could not find function "registerTempTable" Execution halted ``` ## How was this patch tested? Manual. ``` curl -LO http://s3-us-west-2.amazonaws.com/sparkr-data/flights.csv bin/spark-submit examples/src/main/r/dataframe.R bin/spark-submit examples/src/main/r/data-manipulation.R flights.csv ``` Author: Dongjoon Hyun Closes #13714 from dongjoon-hyun/SPARK-15996. (cherry picked from commit a865f6e05297f6121bb2fde717860f9edeed263e) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0a2291cd Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0a2291cd Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0a2291cd Branch: refs/heads/branch-2.0 Commit: 0a2291cd15751018f1680e92aa8f63be4546e7a7 Parents: c53eda0 Author: Dongjoon Hyun Authored: Thu Jun 16 12:46:25 2016 -0700 Committer: Shivaram Venkataraman Committed: Thu Jun 16 12:46:32 2016 -0700 -- examples/src/main/r/data-manipulation.R | 8 examples/src/main/r/dataframe.R | 11 +++ 2 files changed, 11 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0a2291cd/examples/src/main/r/data-manipulation.R -- diff --git a/examples/src/main/r/data-manipulation.R b/examples/src/main/r/data-manipulation.R index 58a3013..badb98b 100644 --- a/examples/src/main/r/data-manipulation.R +++ b/examples/src/main/r/data-manipulation.R @@ -49,10 +49,10 @@ flights_df$date <- as.Date(flights_df$date) SFO_df <- flights_df[flights_df$dest == "SFO", ] # Convert the local data frame into a SparkDataFrame -SFO_DF <- createDataFrame(sqlContext, SFO_df) +SFO_DF <- createDataFrame(SFO_df) # Directly create a SparkDataFrame from the source data -flightsDF <- read.df(sqlContext, flightsCsvPath, source = "csv", header = "true") +flightsDF <- read.df(flightsCsvPath, source = "csv", header = "true") # Print the schema of this SparkDataFrame printSchema(flightsDF) @@ -75,8 +75,8 @@ destDF <- select(flightsDF, "dest", "cancelled") # Using SQL to select columns of data # First, register the flights SparkDataFrame as a table -registerTempTable(flightsDF, "flightsTable") -destDF <- sql(sqlContext, "SELECT dest, cancelled FROM flightsTable") +createOrReplaceTempView(flightsDF, "flightsTable") +destDF <- sql("SELECT dest, cancelled FROM flightsTable") # Use collect to create a local R data frame local_df <- collect(destDF) http://git-wip-us.apache.org/repos/asf/spark/blob/0a2291cd/examples/src/main/r/dataframe.R -- diff --git a/examples/src/main/r/dataframe.R b/examples/src/main/r/dataframe.R index 436bac6..0434705 100644 --- a/examples/src/main/r/dataframe.R +++ b/examples/src/main/r/dataframe.R @@ -25,7 +25,7 @@ sqlContext <- sparkRSQL.init(sc) localDF <- data.frame(name=c("John", "Smith", "Sarah"), age=c(19, 23, 18)) # 
Convert local data frame to a SparkDataFrame -df <- createDataFrame(sqlContext, localDF) +df <- createDataFrame(localDF) # Print its schema printSchema(df) @@ -35,14 +35,17 @@ printSchema(df) # Create a DataFrame from a JSON file path <- file.path(Sys.getenv("SPARK_HOME"), "examples/src/main/resources/people.json") -peopleDF <- read.json(sqlContext, path) +peopleDF <- read.json(path) printSchema(peopleDF) +# root +# |-- age: long (nullable = true) +# |-- name: string (nullable = true) # Register this DataFrame as a table. -registerTempTable(peopleDF, "people") +createOrReplaceTempView(peopleDF, "people") # SQL statements can be run by using the sql methods provided by sqlContext -teenagers <- sql(sqlContext, "SELECT name FROM people WHERE age >= 13 AND age <= 19") +teenagers <- sql("SELECT name FROM people WHERE age >= 13 AND age <= 19") # Call collect to get a local data.frame teenagersLocalDF <- collect(teenagers) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For addition
spark git commit: [SPARK-15996][R] Fix R examples by removing deprecated functions
Repository: spark Updated Branches: refs/heads/master 9ea0d5e32 -> a865f6e05 [SPARK-15996][R] Fix R examples by removing deprecated functions ## What changes were proposed in this pull request? Currently, R examples(`dataframe.R` and `data-manipulation.R`) fail like the following. We had better update them before releasing 2.0 RC. This PR updates them to use up-to-date APIs. ```bash $ bin/spark-submit examples/src/main/r/dataframe.R ... Warning message: 'createDataFrame(sqlContext...)' is deprecated. Use 'createDataFrame(data, schema = NULL, samplingRatio = 1.0)' instead. See help("Deprecated") ... Warning message: 'read.json(sqlContext...)' is deprecated. Use 'read.json(path)' instead. See help("Deprecated") ... Error: could not find function "registerTempTable" Execution halted ``` ## How was this patch tested? Manual. ``` curl -LO http://s3-us-west-2.amazonaws.com/sparkr-data/flights.csv bin/spark-submit examples/src/main/r/dataframe.R bin/spark-submit examples/src/main/r/data-manipulation.R flights.csv ``` Author: Dongjoon Hyun Closes #13714 from dongjoon-hyun/SPARK-15996. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a865f6e0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a865f6e0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a865f6e0 Branch: refs/heads/master Commit: a865f6e05297f6121bb2fde717860f9edeed263e Parents: 9ea0d5e Author: Dongjoon Hyun Authored: Thu Jun 16 12:46:25 2016 -0700 Committer: Shivaram Venkataraman Committed: Thu Jun 16 12:46:25 2016 -0700 -- examples/src/main/r/data-manipulation.R | 8 examples/src/main/r/dataframe.R | 11 +++ 2 files changed, 11 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a865f6e0/examples/src/main/r/data-manipulation.R -- diff --git a/examples/src/main/r/data-manipulation.R b/examples/src/main/r/data-manipulation.R index 58a3013..badb98b 100644 --- a/examples/src/main/r/data-manipulation.R +++ b/examples/src/main/r/data-manipulation.R @@ -49,10 +49,10 @@ flights_df$date <- as.Date(flights_df$date) SFO_df <- flights_df[flights_df$dest == "SFO", ] # Convert the local data frame into a SparkDataFrame -SFO_DF <- createDataFrame(sqlContext, SFO_df) +SFO_DF <- createDataFrame(SFO_df) # Directly create a SparkDataFrame from the source data -flightsDF <- read.df(sqlContext, flightsCsvPath, source = "csv", header = "true") +flightsDF <- read.df(flightsCsvPath, source = "csv", header = "true") # Print the schema of this SparkDataFrame printSchema(flightsDF) @@ -75,8 +75,8 @@ destDF <- select(flightsDF, "dest", "cancelled") # Using SQL to select columns of data # First, register the flights SparkDataFrame as a table -registerTempTable(flightsDF, "flightsTable") -destDF <- sql(sqlContext, "SELECT dest, cancelled FROM flightsTable") +createOrReplaceTempView(flightsDF, "flightsTable") +destDF <- sql("SELECT dest, cancelled FROM flightsTable") # Use collect to create a local R data frame local_df <- collect(destDF) http://git-wip-us.apache.org/repos/asf/spark/blob/a865f6e0/examples/src/main/r/dataframe.R -- diff --git a/examples/src/main/r/dataframe.R b/examples/src/main/r/dataframe.R index 436bac6..0434705 100644 --- a/examples/src/main/r/dataframe.R +++ b/examples/src/main/r/dataframe.R @@ -25,7 +25,7 @@ sqlContext <- sparkRSQL.init(sc) localDF <- data.frame(name=c("John", "Smith", "Sarah"), age=c(19, 23, 18)) # Convert local data frame to a SparkDataFrame -df <- createDataFrame(sqlContext, localDF) +df <- createDataFrame(localDF) 
# Print its schema printSchema(df) @@ -35,14 +35,17 @@ printSchema(df) # Create a DataFrame from a JSON file path <- file.path(Sys.getenv("SPARK_HOME"), "examples/src/main/resources/people.json") -peopleDF <- read.json(sqlContext, path) +peopleDF <- read.json(path) printSchema(peopleDF) +# root +# |-- age: long (nullable = true) +# |-- name: string (nullable = true) # Register this DataFrame as a table. -registerTempTable(peopleDF, "people") +createOrReplaceTempView(peopleDF, "people") # SQL statements can be run by using the sql methods provided by sqlContext -teenagers <- sql(sqlContext, "SELECT name FROM people WHERE age >= 13 AND age <= 19") +teenagers <- sql("SELECT name FROM people WHERE age >= 13 AND age <= 19") # Call collect to get a local data.frame teenagersLocalDF <- collect(teenagers) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
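The R changes mirror the general Spark 2.0 API migration (session-style entry point, `createOrReplaceTempView` in place of `registerTempTable`, no `sqlContext` argument). For comparison only, here is a hedged PySpark sketch of the same pattern; it assumes `SPARK_HOME` is set and points at a distribution that ships the bundled `people.json` example file.

```python
import os

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# 2.0 style: no sqlContext argument anywhere.
local_rows = [("John", 19), ("Smith", 23), ("Sarah", 18)]
df = spark.createDataFrame(local_rows, ["name", "age"])
df.printSchema()

path = os.path.join(os.environ["SPARK_HOME"], "examples/src/main/resources/people.json")
people_df = spark.read.json(path)

# registerTempTable() is deprecated in 2.0; createOrReplaceTempView() replaces it.
people_df.createOrReplaceTempView("people")
teenagers = spark.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19")
print(teenagers.collect())
```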
spark git commit: [SPARK-15983][SQL] Removes FileFormat.prepareRead
Repository: spark Updated Branches: refs/heads/branch-2.0 26359d27c -> c53eda03a [SPARK-15983][SQL] Removes FileFormat.prepareRead ## What changes were proposed in this pull request? Interface method `FileFormat.prepareRead()` was added in #12088 to handle a special case in the LibSVM data source. However, the semantics of this interface method isn't intuitive: it returns a modified version of the data source options map. Considering that the LibSVM case can be easily handled using schema metadata inside `inferSchema`, we can remove this interface method to keep the `FileFormat` interface clean. ## How was this patch tested? Existing tests. Author: Cheng Lian Closes #13698 from liancheng/remove-prepare-read. (cherry picked from commit 9ea0d5e326e08b914aa46f1eec8795688a61bf74) Signed-off-by: Wenchen Fan Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c53eda03 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c53eda03 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c53eda03 Branch: refs/heads/branch-2.0 Commit: c53eda03a282fb0569dd7e0dae3785999d022c8f Parents: 26359d2 Author: Cheng Lian Authored: Thu Jun 16 10:24:29 2016 -0700 Committer: Wenchen Fan Committed: Thu Jun 16 10:24:38 2016 -0700 -- .../spark/ml/source/libsvm/LibSVMRelation.scala | 33 ++-- .../sql/execution/datasources/DataSource.scala | 5 +-- .../datasources/fileSourceInterfaces.scala | 11 --- 3 files changed, 18 insertions(+), 31 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c53eda03/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala index 62e09d2..4988dd6 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala @@ -120,9 +120,12 @@ class LibSVMFileFormat extends TextBasedFileFormat with DataSourceRegister { override def toString: String = "LibSVM" private def verifySchema(dataSchema: StructType): Unit = { -if (dataSchema.size != 2 || - (!dataSchema(0).dataType.sameType(DataTypes.DoubleType) -|| !dataSchema(1).dataType.sameType(new VectorUDT( { +if ( + dataSchema.size != 2 || +!dataSchema(0).dataType.sameType(DataTypes.DoubleType) || +!dataSchema(1).dataType.sameType(new VectorUDT()) || +!(dataSchema(1).metadata.getLong("numFeatures").toInt > 0) +) { throw new IOException(s"Illegal schema for libsvm data, schema=$dataSchema") } } @@ -131,17 +134,8 @@ class LibSVMFileFormat extends TextBasedFileFormat with DataSourceRegister { sparkSession: SparkSession, options: Map[String, String], files: Seq[FileStatus]): Option[StructType] = { -Some( - StructType( -StructField("label", DoubleType, nullable = false) :: -StructField("features", new VectorUDT(), nullable = false) :: Nil)) - } - - override def prepareRead( - sparkSession: SparkSession, - options: Map[String, String], - files: Seq[FileStatus]): Map[String, String] = { -val numFeatures = options.get("numFeatures").filter(_.toInt > 0).getOrElse { +val numFeatures: Int = options.get("numFeatures").map(_.toInt).filter(_ > 0).getOrElse { + // Infers number of features if the user doesn't specify (a valid) one. 
val dataFiles = files.filterNot(_.getPath.getName startsWith "_") val path = if (dataFiles.length == 1) { dataFiles.head.getPath.toUri.toString @@ -156,7 +150,14 @@ class LibSVMFileFormat extends TextBasedFileFormat with DataSourceRegister { MLUtils.computeNumFeatures(parsed) } -new CaseInsensitiveMap(options + ("numFeatures" -> numFeatures.toString)) +val featuresMetadata = new MetadataBuilder() + .putLong("numFeatures", numFeatures) + .build() + +Some( + StructType( +StructField("label", DoubleType, nullable = false) :: +StructField("features", new VectorUDT(), nullable = false, featuresMetadata) :: Nil)) } override def prepareWrite( @@ -185,7 +186,7 @@ class LibSVMFileFormat extends TextBasedFileFormat with DataSourceRegister { options: Map[String, String], hadoopConf: Configuration): (PartitionedFile) => Iterator[InternalRow] = { verifySchema(dataSchema) -val numFeatures = options("numFeatures").toInt +val numFeatures = dataSchema("features").metadata.getLong("numFeatures").toInt assert(numFeatures > 0) val sparse = options.getOrElse("vec
spark git commit: [SPARK-15983][SQL] Removes FileFormat.prepareRead
Repository: spark Updated Branches: refs/heads/master 6451cf927 -> 9ea0d5e32 [SPARK-15983][SQL] Removes FileFormat.prepareRead ## What changes were proposed in this pull request? Interface method `FileFormat.prepareRead()` was added in #12088 to handle a special case in the LibSVM data source. However, the semantics of this interface method isn't intuitive: it returns a modified version of the data source options map. Considering that the LibSVM case can be easily handled using schema metadata inside `inferSchema`, we can remove this interface method to keep the `FileFormat` interface clean. ## How was this patch tested? Existing tests. Author: Cheng Lian Closes #13698 from liancheng/remove-prepare-read. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9ea0d5e3 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9ea0d5e3 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9ea0d5e3 Branch: refs/heads/master Commit: 9ea0d5e326e08b914aa46f1eec8795688a61bf74 Parents: 6451cf9 Author: Cheng Lian Authored: Thu Jun 16 10:24:29 2016 -0700 Committer: Wenchen Fan Committed: Thu Jun 16 10:24:29 2016 -0700 -- .../spark/ml/source/libsvm/LibSVMRelation.scala | 33 ++-- .../sql/execution/datasources/DataSource.scala | 5 +-- .../datasources/fileSourceInterfaces.scala | 11 --- 3 files changed, 18 insertions(+), 31 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9ea0d5e3/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala index 62e09d2..4988dd6 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala @@ -120,9 +120,12 @@ class LibSVMFileFormat extends TextBasedFileFormat with DataSourceRegister { override def toString: String = "LibSVM" private def verifySchema(dataSchema: StructType): Unit = { -if (dataSchema.size != 2 || - (!dataSchema(0).dataType.sameType(DataTypes.DoubleType) -|| !dataSchema(1).dataType.sameType(new VectorUDT( { +if ( + dataSchema.size != 2 || +!dataSchema(0).dataType.sameType(DataTypes.DoubleType) || +!dataSchema(1).dataType.sameType(new VectorUDT()) || +!(dataSchema(1).metadata.getLong("numFeatures").toInt > 0) +) { throw new IOException(s"Illegal schema for libsvm data, schema=$dataSchema") } } @@ -131,17 +134,8 @@ class LibSVMFileFormat extends TextBasedFileFormat with DataSourceRegister { sparkSession: SparkSession, options: Map[String, String], files: Seq[FileStatus]): Option[StructType] = { -Some( - StructType( -StructField("label", DoubleType, nullable = false) :: -StructField("features", new VectorUDT(), nullable = false) :: Nil)) - } - - override def prepareRead( - sparkSession: SparkSession, - options: Map[String, String], - files: Seq[FileStatus]): Map[String, String] = { -val numFeatures = options.get("numFeatures").filter(_.toInt > 0).getOrElse { +val numFeatures: Int = options.get("numFeatures").map(_.toInt).filter(_ > 0).getOrElse { + // Infers number of features if the user doesn't specify (a valid) one. 
val dataFiles = files.filterNot(_.getPath.getName startsWith "_") val path = if (dataFiles.length == 1) { dataFiles.head.getPath.toUri.toString @@ -156,7 +150,14 @@ class LibSVMFileFormat extends TextBasedFileFormat with DataSourceRegister { MLUtils.computeNumFeatures(parsed) } -new CaseInsensitiveMap(options + ("numFeatures" -> numFeatures.toString)) +val featuresMetadata = new MetadataBuilder() + .putLong("numFeatures", numFeatures) + .build() + +Some( + StructType( +StructField("label", DoubleType, nullable = false) :: +StructField("features", new VectorUDT(), nullable = false, featuresMetadata) :: Nil)) } override def prepareWrite( @@ -185,7 +186,7 @@ class LibSVMFileFormat extends TextBasedFileFormat with DataSourceRegister { options: Map[String, String], hadoopConf: Configuration): (PartitionedFile) => Iterator[InternalRow] = { verifySchema(dataSchema) -val numFeatures = options("numFeatures").toInt +val numFeatures = dataSchema("features").metadata.getLong("numFeatures").toInt assert(numFeatures > 0) val sparse = options.getOrElse("vectorType", "sparse") == "sparse" http://git-wip-us.apache.org/repos/asf/spark/blob/9ea0d5e3/sql/core/src/m
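From the caller's side the refactoring is transparent; what changes is only where the inferred feature count lives. A rough PySpark sketch of both code paths (letting the source infer `numFeatures` versus supplying it), assuming `SPARK_HOME` points at a distribution with the bundled `sample_libsvm_data.txt`:

```python
import os

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

sample = os.path.join(os.environ["SPARK_HOME"], "data/mllib/sample_libsvm_data.txt")

# Without the option, the source scans the data once to infer the feature count.
df = spark.read.format("libsvm").load(sample)

# Supplying a valid numFeatures up front skips that extra pass.
df_explicit = spark.read.format("libsvm").option("numFeatures", 692).load(sample)

# After this change the inferred count travels as metadata on the "features"
# column instead of being injected back into the options map.
features_field = next(f for f in df.schema.fields if f.name == "features")
print(features_field.metadata)   # expected to contain something like {'numFeatures': 692}

df.select("label", "features").show(3)
```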
spark git commit: [SPARK-15862][SQL] Better Error Message When Having Database Name in CACHE TABLE AS SELECT
Repository: spark Updated Branches: refs/heads/branch-2.0 52cb1ad38 -> 26359d27c [SPARK-15862][SQL] Better Error Message When Having Database Name in CACHE TABLE AS SELECT What changes were proposed in this pull request? ~~If the temp table already exists, we should not silently replace it when doing `CACHE TABLE AS SELECT`. This is inconsistent with the behavior of `CREAT VIEW` or `CREATE TABLE`. This PR is to fix this silent drop.~~ ~~Maybe, we also can introduce new syntax for replacing the existing one. For example, in Hive, to replace a view, the syntax should be like `ALTER VIEW AS SELECT` or `CREATE OR REPLACE VIEW AS SELECT`~~ The table name in `CACHE TABLE AS SELECT` should NOT contain database prefix like "database.table". Thus, this PR captures this in Parser and outputs a better error message, instead of reporting the view already exists. In addition, refactoring the `Parser` to generate table identifiers instead of returning the table name string. How was this patch tested? - Added a test case for caching and uncaching qualified table names - Fixed a few test cases that do not drop temp table at the end - Added the related test case for the issue resolved in this PR Author: gatorsmile Author: xiaoli Author: Xiao Li Closes #13572 from gatorsmile/cacheTableAsSelect. (cherry picked from commit 6451cf9270b55465d8ecea4c4031329a1058561a) Signed-off-by: Cheng Lian Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/26359d27 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/26359d27 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/26359d27 Branch: refs/heads/branch-2.0 Commit: 26359d27c47ae3ec53e442de3884ec9245d15cee Parents: 52cb1ad Author: gatorsmile Authored: Thu Jun 16 10:01:59 2016 -0700 Committer: Cheng Lian Committed: Thu Jun 16 10:02:12 2016 -0700 -- .../apache/spark/sql/catalyst/parser/SqlBase.g4 | 4 +- .../spark/sql/execution/SparkSqlParser.scala| 10 ++- .../spark/sql/execution/command/cache.scala | 20 ++--- .../spark/sql/execution/command/views.scala | 2 +- .../org/apache/spark/sql/CachedTableSuite.scala | 68 + .../apache/spark/sql/hive/test/TestHive.scala | 2 +- .../spark/sql/hive/CachedTableSuite.scala | 79 +++- 7 files changed, 121 insertions(+), 64 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/26359d27/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 -- diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 index 044f910..b603196 100644 --- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 +++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 @@ -114,8 +114,8 @@ statement tableIdentifier partitionSpec? describeColName? #describeTable | REFRESH TABLE tableIdentifier #refreshTable | REFRESH .*? #refreshResource -| CACHE LAZY? TABLE identifier (AS? query)? #cacheTable -| UNCACHE TABLE identifier #uncacheTable +| CACHE LAZY? TABLE tableIdentifier (AS? query)? #cacheTable +| UNCACHE TABLE tableIdentifier #uncacheTable | CLEAR CACHE #clearCache | LOAD DATA LOCAL? INPATH path=STRING OVERWRITE? INTO TABLE tableIdentifier partitionSpec? 
#loadData http://git-wip-us.apache.org/repos/asf/spark/blob/26359d27/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index a0508ad..154c25a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -221,14 +221,20 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder { */ override def visitCacheTable(ctx: CacheTableContext): LogicalPlan = withOrigin(ctx) { val query = Option(ctx.query).map(plan) -CacheTableCommand(ctx.identifier.getText, query, ctx.LAZY != null) +val tableIdent = visitTableIdentifier(ctx.tableIdentifier) +if (query.isDefined && tableIdent.database.isDefined) { + val d
spark git commit: [SPARK-15862][SQL] Better Error Message When Having Database Name in CACHE TABLE AS SELECT
Repository: spark Updated Branches: refs/heads/master 7c6c69263 -> 6451cf927 [SPARK-15862][SQL] Better Error Message When Having Database Name in CACHE TABLE AS SELECT What changes were proposed in this pull request? ~~If the temp table already exists, we should not silently replace it when doing `CACHE TABLE AS SELECT`. This is inconsistent with the behavior of `CREAT VIEW` or `CREATE TABLE`. This PR is to fix this silent drop.~~ ~~Maybe, we also can introduce new syntax for replacing the existing one. For example, in Hive, to replace a view, the syntax should be like `ALTER VIEW AS SELECT` or `CREATE OR REPLACE VIEW AS SELECT`~~ The table name in `CACHE TABLE AS SELECT` should NOT contain database prefix like "database.table". Thus, this PR captures this in Parser and outputs a better error message, instead of reporting the view already exists. In addition, refactoring the `Parser` to generate table identifiers instead of returning the table name string. How was this patch tested? - Added a test case for caching and uncaching qualified table names - Fixed a few test cases that do not drop temp table at the end - Added the related test case for the issue resolved in this PR Author: gatorsmile Author: xiaoli Author: Xiao Li Closes #13572 from gatorsmile/cacheTableAsSelect. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6451cf92 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6451cf92 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6451cf92 Branch: refs/heads/master Commit: 6451cf9270b55465d8ecea4c4031329a1058561a Parents: 7c6c692 Author: gatorsmile Authored: Thu Jun 16 10:01:59 2016 -0700 Committer: Cheng Lian Committed: Thu Jun 16 10:01:59 2016 -0700 -- .../apache/spark/sql/catalyst/parser/SqlBase.g4 | 4 +- .../spark/sql/execution/SparkSqlParser.scala| 10 ++- .../spark/sql/execution/command/cache.scala | 20 ++--- .../spark/sql/execution/command/views.scala | 2 +- .../org/apache/spark/sql/CachedTableSuite.scala | 68 + .../apache/spark/sql/hive/test/TestHive.scala | 2 +- .../spark/sql/hive/CachedTableSuite.scala | 79 +++- 7 files changed, 121 insertions(+), 64 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6451cf92/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 -- diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 index 044f910..b603196 100644 --- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 +++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 @@ -114,8 +114,8 @@ statement tableIdentifier partitionSpec? describeColName? #describeTable | REFRESH TABLE tableIdentifier #refreshTable | REFRESH .*? #refreshResource -| CACHE LAZY? TABLE identifier (AS? query)? #cacheTable -| UNCACHE TABLE identifier #uncacheTable +| CACHE LAZY? TABLE tableIdentifier (AS? query)? #cacheTable +| UNCACHE TABLE tableIdentifier #uncacheTable | CLEAR CACHE #clearCache | LOAD DATA LOCAL? INPATH path=STRING OVERWRITE? INTO TABLE tableIdentifier partitionSpec? 
#loadData http://git-wip-us.apache.org/repos/asf/spark/blob/6451cf92/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index a0508ad..154c25a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -221,14 +221,20 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder { */ override def visitCacheTable(ctx: CacheTableContext): LogicalPlan = withOrigin(ctx) { val query = Option(ctx.query).map(plan) -CacheTableCommand(ctx.identifier.getText, query, ctx.LAZY != null) +val tableIdent = visitTableIdentifier(ctx.tableIdentifier) +if (query.isDefined && tableIdent.database.isDefined) { + val database = tableIdent.database.get + throw new ParseException(s"It is not allowed to add database pre
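The behavior change is easy to observe from any client. A short PySpark sketch follows; it assumes a Spark 2.0+ session, and the table names are purely illustrative.

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

spark.createDataFrame([(1, "a"), (2, "b")], ["id", "label"]).createOrReplaceTempView("boxes")

# Caching a query result under an unqualified name works as before (eager by default).
spark.sql("CACHE TABLE cached_boxes AS SELECT * FROM boxes WHERE id > 1")
assert spark.catalog.isCached("cached_boxes")

# A database-qualified name is now rejected at parse time with a pointed message,
# rather than falling through to a confusing "already exists" style error.
try:
    spark.sql("CACHE TABLE mydb.cached_boxes AS SELECT * FROM boxes")
except Exception as err:   # raised by Spark as a parse error
    print("expected:", err)

spark.sql("UNCACHE TABLE cached_boxes")
```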
[2/3] spark git commit: [SPARK-15979][SQL] Rename various Parquet support classes (branch-2.0).
http://git-wip-us.apache.org/repos/asf/spark/blob/52cb1ad3/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystWriteSupport.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystWriteSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystWriteSupport.scala deleted file mode 100644 index 67bfd39..000 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystWriteSupport.scala +++ /dev/null @@ -1,436 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - *http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.execution.datasources.parquet - -import java.nio.{ByteBuffer, ByteOrder} -import java.util - -import scala.collection.JavaConverters.mapAsJavaMapConverter - -import org.apache.hadoop.conf.Configuration -import org.apache.parquet.column.ParquetProperties -import org.apache.parquet.hadoop.ParquetOutputFormat -import org.apache.parquet.hadoop.api.WriteSupport -import org.apache.parquet.hadoop.api.WriteSupport.WriteContext -import org.apache.parquet.io.api.{Binary, RecordConsumer} - -import org.apache.spark.internal.Logging -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.SpecializedGetters -import org.apache.spark.sql.catalyst.util.DateTimeUtils -import org.apache.spark.sql.execution.datasources.parquet.CatalystSchemaConverter.minBytesForPrecision -import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.types._ - -/** - * A Parquet [[WriteSupport]] implementation that writes Catalyst [[InternalRow]]s as Parquet - * messages. This class can write Parquet data in two modes: - * - * - Standard mode: Parquet data are written in standard format defined in parquet-format spec. - * - Legacy mode: Parquet data are written in legacy format compatible with Spark 1.4 and prior. - * - * This behavior can be controlled by SQL option `spark.sql.parquet.writeLegacyFormat`. The value - * of this option is propagated to this class by the `init()` method and its Hadoop configuration - * argument. - */ -private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] with Logging { - // A `ValueWriter` is responsible for writing a field of an `InternalRow` to the record consumer. - // Here we are using `SpecializedGetters` rather than `InternalRow` so that we can directly access - // data in `ArrayData` without the help of `SpecificMutableRow`. 
- private type ValueWriter = (SpecializedGetters, Int) => Unit - - // Schema of the `InternalRow`s to be written - private var schema: StructType = _ - - // `ValueWriter`s for all fields of the schema - private var rootFieldWriters: Seq[ValueWriter] = _ - - // The Parquet `RecordConsumer` to which all `InternalRow`s are written - private var recordConsumer: RecordConsumer = _ - - // Whether to write data in legacy Parquet format compatible with Spark 1.4 and prior versions - private var writeLegacyParquetFormat: Boolean = _ - - // Reusable byte array used to write timestamps as Parquet INT96 values - private val timestampBuffer = new Array[Byte](12) - - // Reusable byte array used to write decimal values - private val decimalBuffer = new Array[Byte](minBytesForPrecision(DecimalType.MAX_PRECISION)) - - override def init(configuration: Configuration): WriteContext = { -val schemaString = configuration.get(CatalystWriteSupport.SPARK_ROW_SCHEMA) -this.schema = StructType.fromString(schemaString) -this.writeLegacyParquetFormat = { - // `SQLConf.PARQUET_WRITE_LEGACY_FORMAT` should always be explicitly set in ParquetRelation - assert(configuration.get(SQLConf.PARQUET_WRITE_LEGACY_FORMAT.key) != null) - configuration.get(SQLConf.PARQUET_WRITE_LEGACY_FORMAT.key).toBoolean -} -this.rootFieldWriters = schema.map(_.dataType).map(makeWriter) - -val messageType = new CatalystSchemaConverter(configuration).convert(schema) -val metadata = Map(CatalystReadSupport.SPARK_METADATA_KEY -> schemaString).asJava - -logInfo( - s"""Initialized Parquet WriteSupport with Catalyst schem
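The two write modes described in the class comment are selected through a SQL option rather than an API call. A minimal PySpark sketch of toggling it (the output paths are placeholders):

```python
import tempfile

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame([(1, [0.1, 0.2]), (2, [0.3])], ["id", "values"])
out = tempfile.mkdtemp()

# Default: standard layout as defined by the parquet-format spec.
df.write.mode("overwrite").parquet(out + "/standard")

# Legacy layout, compatible with files written by Spark 1.4 and prior.
spark.conf.set("spark.sql.parquet.writeLegacyFormat", "true")
df.write.mode("overwrite").parquet(out + "/legacy")
spark.conf.set("spark.sql.parquet.writeLegacyFormat", "false")
```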
[1/3] spark git commit: [SPARK-15979][SQL] Rename various Parquet support classes (branch-2.0).
Repository: spark Updated Branches: refs/heads/branch-2.0 35c0a60a6 -> 52cb1ad38 http://git-wip-us.apache.org/repos/asf/spark/blob/52cb1ad3/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaConverter.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaConverter.scala new file mode 100644 index 000..1ac083f --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaConverter.scala @@ -0,0 +1,579 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources.parquet + +import scala.collection.JavaConverters._ + +import org.apache.hadoop.conf.Configuration +import org.apache.parquet.schema._ +import org.apache.parquet.schema.OriginalType._ +import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName._ +import org.apache.parquet.schema.Type.Repetition._ + +import org.apache.spark.sql.AnalysisException +import org.apache.spark.sql.execution.datasources.parquet.ParquetSchemaConverter.maxPrecisionForBytes +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types._ + +/** + * This converter class is used to convert Parquet [[MessageType]] to Spark SQL [[StructType]] and + * vice versa. + * + * Parquet format backwards-compatibility rules are respected when converting Parquet + * [[MessageType]] schemas. + * + * @see https://github.com/apache/parquet-format/blob/master/LogicalTypes.md + * @constructor + * @param assumeBinaryIsString Whether unannotated BINARY fields should be assumed to be Spark SQL + *[[StringType]] fields when converting Parquet a [[MessageType]] to Spark SQL + *[[StructType]]. This argument only affects Parquet read path. + * @param assumeInt96IsTimestamp Whether unannotated INT96 fields should be assumed to be Spark SQL + *[[TimestampType]] fields when converting Parquet a [[MessageType]] to Spark SQL + *[[StructType]]. Note that Spark SQL [[TimestampType]] is similar to Hive timestamp, which + *has optional nanosecond precision, but different from `TIME_MILLS` and `TIMESTAMP_MILLIS` + *described in Parquet format spec. This argument only affects Parquet read path. + * @param writeLegacyParquetFormat Whether to use legacy Parquet format compatible with Spark 1.4 + *and prior versions when converting a Catalyst [[StructType]] to a Parquet [[MessageType]]. + *When set to false, use standard format defined in parquet-format spec. This argument only + *affects Parquet write path. 
+ */ +private[parquet] class ParquetSchemaConverter( +assumeBinaryIsString: Boolean = SQLConf.PARQUET_BINARY_AS_STRING.defaultValue.get, +assumeInt96IsTimestamp: Boolean = SQLConf.PARQUET_INT96_AS_TIMESTAMP.defaultValue.get, +writeLegacyParquetFormat: Boolean = SQLConf.PARQUET_WRITE_LEGACY_FORMAT.defaultValue.get) { + + def this(conf: SQLConf) = this( +assumeBinaryIsString = conf.isParquetBinaryAsString, +assumeInt96IsTimestamp = conf.isParquetINT96AsTimestamp, +writeLegacyParquetFormat = conf.writeLegacyParquetFormat) + + def this(conf: Configuration) = this( +assumeBinaryIsString = conf.get(SQLConf.PARQUET_BINARY_AS_STRING.key).toBoolean, +assumeInt96IsTimestamp = conf.get(SQLConf.PARQUET_INT96_AS_TIMESTAMP.key).toBoolean, +writeLegacyParquetFormat = conf.get(SQLConf.PARQUET_WRITE_LEGACY_FORMAT.key, + SQLConf.PARQUET_WRITE_LEGACY_FORMAT.defaultValue.get.toString).toBoolean) + + /** + * Converts Parquet [[MessageType]] `parquetSchema` to a Spark SQL [[StructType]]. + */ + def convert(parquetSchema: MessageType): StructType = convert(parquetSchema.asGroupType()) + + private def convert(parquetSchema: GroupType): StructType = { +val fields = parquetSchema.getFields.asScala.map { field => + field.getRepetition match { +case OPTIONAL => + StructField(field.getName, convertField(field), nullable = true) + +
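The read-path assumptions listed in the constructor documentation are likewise surfaced as SQL options. Roughly, from PySpark (the input path is a placeholder):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Treat unannotated BINARY columns as strings, for files produced by writers
# that do not attach the UTF8/String annotation.
spark.conf.set("spark.sql.parquet.binaryAsString", "true")

# Interpret unannotated INT96 columns as timestamps (the historical Hive-style
# timestamp encoding referred to in the converter's scaladoc).
spark.conf.set("spark.sql.parquet.int96AsTimestamp", "true")

df = spark.read.parquet("/path/to/parquet-data")   # placeholder path
df.printSchema()
```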
[3/3] spark git commit: [SPARK-15979][SQL] Rename various Parquet support classes (branch-2.0).
[SPARK-15979][SQL] Rename various Parquet support classes (branch-2.0). ## What changes were proposed in this pull request? This patch renames various Parquet support classes from CatalystAbc to ParquetAbc. This new naming makes more sense for two reasons: 1. These are not optimizer related (i.e. Catalyst) classes. 2. We are in the Spark code base, and as a result it'd be more clear to call out these are Parquet support classes, rather than some Spark classes. ## How was this patch tested? Renamed test cases as well. Author: Reynold Xin Closes #13700 from rxin/parquet-rename-branch-2.0. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/52cb1ad3 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/52cb1ad3 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/52cb1ad3 Branch: refs/heads/branch-2.0 Commit: 52cb1ad38f669dca3f276f38a3f75d57d973e982 Parents: 35c0a60 Author: Reynold Xin Authored: Thu Jun 16 00:21:08 2016 -0700 Committer: Reynold Xin Committed: Thu Jun 16 00:21:08 2016 -0700 -- .../spark/ml/source/libsvm/LibSVMRelation.scala | 6 +- .../SpecificParquetRecordReaderBase.java| 4 +- .../parquet/VectorizedColumnReader.java | 12 +- .../parquet/CatalystReadSupport.scala | 302 - .../parquet/CatalystRecordMaterializer.scala| 41 -- .../parquet/CatalystRowConverter.scala | 672 --- .../parquet/CatalystSchemaConverter.scala | 579 .../parquet/CatalystWriteSupport.scala | 436 .../datasources/parquet/ParquetFileFormat.scala | 44 +- .../parquet/ParquetReadSupport.scala| 302 + .../parquet/ParquetRecordMaterializer.scala | 41 ++ .../parquet/ParquetRowConverter.scala | 672 +++ .../parquet/ParquetSchemaConverter.scala| 579 .../parquet/ParquetWriteSupport.scala | 436 .../datasources/parquet/ParquetIOSuite.scala| 4 +- .../datasources/parquet/ParquetQuerySuite.scala | 6 +- .../parquet/ParquetSchemaSuite.scala| 6 +- .../datasources/parquet/ParquetTest.scala | 4 +- 18 files changed, 2071 insertions(+), 2075 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/52cb1ad3/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala index b5b2a68..62e09d2 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala @@ -141,7 +141,7 @@ class LibSVMFileFormat extends TextBasedFileFormat with DataSourceRegister { sparkSession: SparkSession, options: Map[String, String], files: Seq[FileStatus]): Map[String, String] = { -def computeNumFeatures(): Int = { +val numFeatures = options.get("numFeatures").filter(_.toInt > 0).getOrElse { val dataFiles = files.filterNot(_.getPath.getName startsWith "_") val path = if (dataFiles.length == 1) { dataFiles.head.getPath.toUri.toString @@ -156,10 +156,6 @@ class LibSVMFileFormat extends TextBasedFileFormat with DataSourceRegister { MLUtils.computeNumFeatures(parsed) } -val numFeatures = options.get("numFeatures").filter(_.toInt > 0).getOrElse { - computeNumFeatures() -} - new CaseInsensitiveMap(options + ("numFeatures" -> numFeatures.toString)) } http://git-wip-us.apache.org/repos/asf/spark/blob/52cb1ad3/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java -- diff --git 
a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java index cbe8f78..1a25679 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java @@ -136,7 +136,7 @@ public abstract class SpecificParquetRecordReaderBase extends RecordReader extends RecordReaderhttp://git-wip-us.apache.org/repos/asf/spark/blob/52cb1ad3/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java -- diff --git a/sql/core/src/main