[spark] branch master updated: [SPARK-26894][SQL] Handle Alias as well in AggregateEstimation to propagate child stats
This is an automated email from the ASF dual-hosted git repository. yamamuro pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new b1857a4 [SPARK-26894][SQL] Handle Alias as well in AggregateEstimation to propagate child stats b1857a4 is described below commit b1857a4d7dfe17663f8adccd7825d890ae70d2a1 Author: Venkata krishnan Sowrirajan AuthorDate: Thu Mar 21 11:21:56 2019 +0900 [SPARK-26894][SQL] Handle Alias as well in AggregateEstimation to propagate child stats ## What changes were proposed in this pull request? Currently aliases are not handled in the Aggregate Estimation due to which stats are not getting propagated. This causes CBO join-reordering to not give optimal join plans. ProjectEstimation is already taking care of aliases, we need same logic for AggregateEstimation as well to properly propagate stats when CBO is enabled. ## How was this patch tested? This patch is manually tested using the query Q83 of TPCDS benchmark (scale 1000) Closes #23803 from venkata91/aggstats. Authored-by: Venkata krishnan Sowrirajan Signed-off-by: Takeshi Yamamuro --- .../statsEstimation/AggregateEstimation.scala | 7 +-- .../logical/statsEstimation/EstimationUtils.scala | 14 - .../statsEstimation/ProjectEstimation.scala| 10 +++-- .../statsEstimation/AggregateEstimationSuite.scala | 24 ++ 4 files changed, 45 insertions(+), 10 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/AggregateEstimation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/AggregateEstimation.scala index eb56ab4..0606d0d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/AggregateEstimation.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/AggregateEstimation.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.catalyst.plans.logical.statsEstimation -import org.apache.spark.sql.catalyst.expressions.Attribute +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap} import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, Statistics} @@ -52,7 +52,10 @@ object AggregateEstimation { outputRows.min(childStats.rowCount.get) } - val outputAttrStats = getOutputMap(childStats.attributeStats, agg.output) + val aliasStats = EstimationUtils.getAliasStats(agg.expressions, childStats.attributeStats) + + val outputAttrStats = getOutputMap( +AttributeMap(childStats.attributeStats.toSeq ++ aliasStats), agg.output) Some(Statistics( sizeInBytes = getOutputSize(agg.output, outputRows, outputAttrStats), rowCount = Some(outputRows), diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala index 211a2a0..11d2f02 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst.plans.logical.statsEstimation import scala.collection.mutable.ArrayBuffer import scala.math.BigDecimal.RoundingMode -import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap} +import 
org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, AttributeMap, Expression} import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.types.{DecimalType, _} @@ -71,6 +71,18 @@ object EstimationUtils { AttributeMap(output.flatMap(a => inputMap.get(a).map(a -> _))) } + /** + * Returns the stats for aliases of child's attributes + */ + def getAliasStats( + expressions: Seq[Expression], + attributeStats: AttributeMap[ColumnStat]): Seq[(Attribute, ColumnStat)] = { +expressions.collect { + case alias @ Alias(attr: Attribute, _) if attributeStats.contains(attr) => +alias.toAttribute -> attributeStats(attr) +} + } + def getSizePerRow( attributes: Seq[Attribute], attrStats: AttributeMap[ColumnStat] = AttributeMap(Nil)): BigInt = { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/ProjectEstimation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/ProjectEstimation.scala index 489eb90..6925423 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/ProjectEstimation.scala +++ b/sql/catalys
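To make the effect of this change concrete, here is a hedged usage sketch (the table `t`, column `key`, and variable names are invented; the CBO config keys and `ANALYZE` syntax are standard Spark SQL). With CBO enabled, an aggregate that aliases its grouping column now keeps that column's statistics attached to the alias, so join reordering downstream still sees them:

```scala
// Illustrative only: `t` and `key` are hypothetical; `spark` is an existing SparkSession.
spark.sql("SET spark.sql.cbo.enabled=true")
spark.sql("SET spark.sql.cbo.joinReorder.enabled=true")
spark.sql("ANALYZE TABLE t COMPUTE STATISTICS FOR COLUMNS key")

val agg = spark.sql("SELECT key AS k, count(*) AS cnt FROM t GROUP BY key")

// Before this patch the optimized plan's attribute stats had no entry for `k`;
// with getAliasStats, `k` inherits the ColumnStat gathered for `key`.
val stats = agg.queryExecution.optimizedPlan.stats
```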
[spark] branch master updated: [SPARK-27221][SQL] Improve the assert error message in TreeNode.parseToJson
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new c26379b [SPARK-27221][SQL] Improve the assert error message in TreeNode.parseToJson c26379b is described below commit c26379b446e90b3104ebe36f288be483f613a652 Author: Shixiong Zhu AuthorDate: Thu Mar 21 11:15:05 2019 +0900 [SPARK-27221][SQL] Improve the assert error message in TreeNode.parseToJson ## What changes were proposed in this pull request? When `TreeNode.parseToJson` may throw an assert error without any error message when a TreeNode is not implemented properly, and it's hard to find the bad TreeNode implementation. This PR adds the assert message to improve the error, like what `TreeNode.jsonFields` does. ## How was this patch tested? Jenkins Closes #24159 from zsxwing/SPARK-27221. Authored-by: Shixiong Zhu Signed-off-by: Hyukjin Kwon --- .../main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala| 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala index 72b1931..66342af 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala @@ -630,7 +630,7 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] extends Product { val fieldNames = getConstructorParameterNames(getClass) val fieldValues = productIterator.toSeq ++ otherCopyArgs assert(fieldNames.length == fieldValues.length, s"${getClass.getSimpleName} fields: " + - fieldNames.mkString(", ") + s", values: " + fieldValues.map(_.toString).mkString(", ")) + fieldNames.mkString(", ") + s", values: " + fieldValues.mkString(", ")) fieldNames.zip(fieldValues).map { // If the field value is a child, then use an int to encode it, represents the index of @@ -683,7 +683,8 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] extends Product { try { val fieldNames = getConstructorParameterNames(p.getClass) val fieldValues = p.productIterator.toSeq -assert(fieldNames.length == fieldValues.length) +assert(fieldNames.length == fieldValues.length, s"${getClass.getSimpleName} fields: " + + fieldNames.mkString(", ") + s", values: " + fieldValues.mkString(", ")) ("product-class" -> JString(p.getClass.getName)) :: fieldNames.zip(fieldValues).map { case (name, value) => name -> parseToJson(value) }.toList - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
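For context, a toy example of the difference the added message makes (the node name and fields below are invented, not Spark classes):

```scala
// A node whose Product arity does not match its constructor parameters --
// the situation the assert guards against.
val fieldNames  = Seq("child", "name")   // from getConstructorParameterNames
val fieldValues = Seq[Any]("someChild")  // productIterator yielded one value too few

// Before: the failure is just "assertion failed", with no hint which node is broken.
// assert(fieldNames.length == fieldValues.length)

// After: the failure names the class and lists both sides, roughly
// "assertion failed: MyBrokenNode fields: child, name, values: someChild".
assert(fieldNames.length == fieldValues.length,
  s"MyBrokenNode fields: ${fieldNames.mkString(", ")}, " +
    s"values: ${fieldValues.mkString(", ")}")
```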
[spark] branch master updated: [SPARK-27223][SQL] Remove private methods that skip conversion when passing user schemas for constructing a DataFrame
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 2e090ba [SPARK-27223][SQL] Remove private methods that skip conversion when passing user schemas for constructing a DataFrame 2e090ba is described below commit 2e090ba628d4622f28e934f602bf8bc9783924c8 Author: maryannxue AuthorDate: Thu Mar 21 11:13:25 2019 +0900 [SPARK-27223][SQL] Remove private methods that skip conversion when passing user schemas for constructing a DataFrame ## What changes were proposed in this pull request? When passing in a user schema to create a DataFrame, there might be mismatched nullability between the user schema and the the actual data. All related public interfaces now perform catalyst conversion using the user provided schema, which catches such mismatches to avoid runtime errors later on. However, there're private methods which allow this conversion to be skipped, so we need to remove these private methods which may lead to confusion and potential issues. ## How was this patch tested? Passed existing tests. No new tests were added since this PR removed the private interfaces that would potentially cause null problems and other interfaces are covered already by existing tests. Closes #24162 from maryannxue/spark-27223. Authored-by: maryannxue Signed-off-by: Hyukjin Kwon --- .../scala/org/apache/spark/sql/SQLContext.scala| 9 .../scala/org/apache/spark/sql/SparkSession.scala | 25 +- .../org/apache/spark/sql/DataFrameSuite.scala | 6 +++--- 3 files changed, 8 insertions(+), 32 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index 43f34e6..08b7521 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -325,15 +325,6 @@ class SQLContext private[sql](val sparkSession: SparkSession) } /** - * Creates a DataFrame from an RDD[Row]. User can specify whether the input rows should be - * converted to Catalyst rows. - */ - private[sql] - def createDataFrame(rowRDD: RDD[Row], schema: StructType, needsConversion: Boolean) = { -sparkSession.createDataFrame(rowRDD, schema, needsConversion) - } - - /** * :: Experimental :: * Creates a [[Dataset]] from a local Seq of data of a given type. This method requires an * encoder (to convert a JVM object of type `T` to and from the internal Spark SQL representation) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala index 5208b9a..0b5bf3f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala @@ -361,7 +361,11 @@ class SparkSession private( @DeveloperApi @Evolving def createDataFrame(rowRDD: RDD[Row], schema: StructType): DataFrame = { -createDataFrame(rowRDD, schema, needsConversion = true) +// TODO: use MutableProjection when rowRDD is another DataFrame and the applied +// schema differs from the existing schema on any field data type. +val encoder = RowEncoder(schema) +val catalystRows = rowRDD.map(encoder.toRow) +internalCreateDataFrame(catalystRows.setName(rowRDD.name), schema) } /** @@ -590,25 +594,6 @@ class SparkSession private( Dataset.ofRows(self, logicalPlan) } - /** - * Creates a `DataFrame` from an `RDD[Row]`. 
- * User can specify whether the input rows should be converted to Catalyst rows. - */ - private[sql] def createDataFrame( - rowRDD: RDD[Row], - schema: StructType, - needsConversion: Boolean) = { -// TODO: use MutableProjection when rowRDD is another DataFrame and the applied -// schema differs from the existing schema on any field data type. -val catalystRows = if (needsConversion) { - val encoder = RowEncoder(schema) - rowRDD.map(encoder.toRow) -} else { - rowRDD.map { r: Row => InternalRow.fromSeq(r.toSeq) } -} -internalCreateDataFrame(catalystRows.setName(rowRDD.name), schema) - } - /* - * | Catalog-related methods | diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index 6dec129..78decd4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -1572,8 +1572,8 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { val rdd = sparkContext.makeRDD(Seq(Row(1, 3), Row(2, 1))) val df = s
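A minimal sketch of the public path that remains after the removal (schema, data, and variable names are illustrative): `createDataFrame(rowRDD, schema)` now always routes rows through `RowEncoder`, so a mismatch between the declared schema and the actual data surfaces as a conversion error when the job runs, rather than producing silently wrong results.

```scala
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

// `spark` is an existing SparkSession.
val schema = StructType(Seq(
  StructField("id", IntegerType, nullable = false),
  StructField("name", StringType, nullable = true)))

val rdd = spark.sparkContext.parallelize(Seq(Row(1, "a"), Row(2, "b")))

// There is no longer a needsConversion = false escape hatch: the rows are
// always converted to Catalyst rows against the user-provided schema.
val df = spark.createDataFrame(rdd, schema)
df.show()
```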
[spark] branch master updated: [MINOR][EXAMPLES] Add missing return keyword streaming word count example
This is an automated email from the ASF dual-hosted git repository. srowen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new d6ee2f3 [MINOR][EXAMPLES] Add missing return keyword streaming word count example d6ee2f3 is described below commit d6ee2f331db461c1f7a25e0ef17901f53d8b707e Author: Ruocheng Jiang AuthorDate: Wed Mar 20 17:59:12 2019 -0500 [MINOR][EXAMPLES] Add missing return keyword streaming word count example This is a very low level error. Closes #24153 from jiangruocheng/master. Authored-by: Ruocheng Jiang Signed-off-by: Sean Owen --- examples/src/main/python/streaming/recoverable_network_wordcount.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/src/main/python/streaming/recoverable_network_wordcount.py b/examples/src/main/python/streaming/recoverable_network_wordcount.py index 60167dc..a39c4d0 100644 --- a/examples/src/main/python/streaming/recoverable_network_wordcount.py +++ b/examples/src/main/python/streaming/recoverable_network_wordcount.py @@ -83,9 +83,9 @@ def createContext(host, port, outputPath): def filterFunc(wordCount): if wordCount[0] in blacklist.value: droppedWordsCounter.add(wordCount[1]) -False +return False else: -True +return True counts = "Counts at time %s %s" % (time, rdd.filter(filterFunc).collect()) print(counts) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-27205][CORE] Remove complicated logic for just leaving warning log when main class is scala.App
This is an automated email from the ASF dual-hosted git repository. srowen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new a8d9531 [SPARK-27205][CORE] Remove complicated logic for just leaving warning log when main class is scala.App a8d9531 is described below commit a8d9531edc80116d4549efd62af116a90256864b Author: Jungtaek Lim (HeartSaVioR) AuthorDate: Wed Mar 20 17:55:48 2019 -0500 [SPARK-27205][CORE] Remove complicated logic for just leaving warning log when main class is scala.App ## What changes were proposed in this pull request? [SPARK-26977](https://issues.apache.org/jira/browse/SPARK-26977) introduced very strange bug which spark-shell is no longer able to load classes which are provided via `--packages`. TBH I don't know about the details why it is broken, but looks like initializing `object class` brings the weirdness (maybe due to static initialization done twice?). This patch removes the logic to leave warning log when main class is scala.App, to not deal with such complexity for just leaving warning message. ## How was this patch tested? Manual test: suppose we run spark-shell with `--packages` option like below: ``` ./bin/spark-shell --verbose --master "local[*]" --packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.0 ``` Before this patch, importing class in transitive dependency fails: ``` Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties Setting default log level to "WARN". To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). Spark context Web UI available at http://localhost:4040 Spark context available as 'sc' (master = local[*], app id = local-1553005771597). Spark session available as 'spark'. Welcome to __ / __/__ ___ _/ /__ _\ \/ _ \/ _ `/ __/ '_/ /___/ .__/\_,_/_/ /_/\_\ version 3.0.0-SNAPSHOT /_/ Using Scala version 2.12.8 (Java HotSpot(TM) 64-Bit Server VM, Java 1.8.0_191) Type in expressions to have them evaluated. Type :help for more information. scala> import org.apache.kafka :23: error: object kafka is not a member of package org.apache import org.apache.kafka ``` After this patch, importing class in transitive dependency succeeds: ``` Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties Setting default log level to "WARN". To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). Spark context Web UI available at http://localhost:4040 Spark context available as 'sc' (master = local[*], app id = local-1553004095542). Spark session available as 'spark'. Welcome to __ / __/__ ___ _/ /__ _\ \/ _ \/ _ `/ __/ '_/ /___/ .__/\_,_/_/ /_/\_\ version 3.0.0-SNAPSHOT /_/ Using Scala version 2.12.8 (Java HotSpot(TM) 64-Bit Server VM, Java 1.8.0_191) Type in expressions to have them evaluated. Type :help for more information. scala> import org.apache.kafka import org.apache.kafka ``` Closes #24147 from HeartSaVioR/SPARK-27205. 
Authored-by: Jungtaek Lim (HeartSaVioR) Signed-off-by: Sean Owen --- .../src/main/scala/org/apache/spark/deploy/SparkSubmit.scala | 12 1 file changed, 12 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index 493cad0..b6673e4 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -826,18 +826,6 @@ private[spark] class SparkSubmit extends Logging { val app: SparkApplication = if (classOf[SparkApplication].isAssignableFrom(mainClass)) { mainClass.getConstructor().newInstance().asInstanceOf[SparkApplication] } else { - // Scala object subclassing scala.App has its whole class body executed in the - // main method it inherits. Fields of the object will not have been initialized - // before the main method has been executed, which will cause problems like SPARK-4170 - // Note two Java classes are generated, the childMainClass and childMainClass$. - // Users will pass in childMainClass which will delegate all invocations to childMainClass$ - // but it's childMainClass$ that subclasses scala.App and we should check for. - Try { -if (classOf[scala.App].isAssignableFrom(Utils.classForName(s"$childMainClass$$"))) { - logWarning("Subclasses of scala.App may not work correctly. " + -"Use a main() method instead
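For background on the pattern the removed warning targeted (the objects below are invented examples, not Spark code): an object extending `scala.App` runs its whole body inside the inherited `main()`, so its fields are still uninitialized if anything touches the object before `main()` executes, which is the SPARK-4170 problem the removed comment described. The old check reflected on the generated `$` companion class to detect this, and that reflection is what interfered with `--packages` class loading.

```scala
// Invented example of the fragile pattern; not part of the Spark patch.
object MyJob extends scala.App {
  // Assigned only when the inherited main() runs the delayed-init body.
  val appName: String = args.headOption.getOrElse("my-job")
  println(s"running $appName")
}

// A plain main() method avoids the delayed-initialization pitfall entirely:
object MyJobSafe {
  def main(args: Array[String]): Unit = {
    val appName = args.headOption.getOrElse("my-job")
    println(s"running $appName")
  }
}
```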
[spark] branch master updated: [SPARK-27202][MINOR][SQL] Update comments to keep according with code
This is an automated email from the ASF dual-hosted git repository. srowen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 46f9f44 [SPARK-27202][MINOR][SQL] Update comments to keep according with code 46f9f44 is described below commit 46f9f44918ba2589c6ffc938822cbdfac1c6f4d1 Author: wangguangxin.cn AuthorDate: Wed Mar 20 17:54:28 2019 -0500 [SPARK-27202][MINOR][SQL] Update comments to keep according with code ## What changes were proposed in this pull request? Update comments in `InMemoryFileIndex.listLeafFiles` to keep according with code. ## How was this patch tested? existing test cases Closes #24146 from WangGuangxin/SPARK-27202. Authored-by: wangguangxin.cn Signed-off-by: Sean Owen --- .../apache/spark/sql/execution/datasources/InMemoryFileIndex.scala| 2 +- .../spark/sql/execution/datasources/PartitioningAwareFileIndex.scala | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala index fe418e6..db55a06 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala @@ -308,7 +308,7 @@ object InMemoryFileIndex extends Logging { // implementations don't actually issue RPC for this method. // // - Here we are calling `getFileBlockLocations` in a sequential manner, but it should not - // be a big deal since we always use to `listLeafFilesInParallel` when the number of + // be a big deal since we always use to `bulkListLeafFiles` when the number of // paths exceeds threshold. case f => // The other constructor of LocatedFileStatus will call FileStatus.getPermission(), diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileIndex.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileIndex.scala index b2e4155..f5ae095 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileIndex.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileIndex.scala @@ -89,7 +89,7 @@ abstract class PartitioningAwareFileIndex( if (partitionSpec().partitionColumns.isEmpty) { // For each of the root input paths, get the list of files inside them rootPaths.flatMap { path => -// Make the path qualified (consistent with listLeafFiles and listLeafFilesInParallel). +// Make the path qualified (consistent with listLeafFiles and bulkListLeafFiles). val fs = path.getFileSystem(hadoopConf) val qualifiedPathPre = fs.makeQualified(path) val qualifiedPath: Path = if (qualifiedPathPre.isRoot && !qualifiedPathPre.isAbsolute) { @@ -203,7 +203,7 @@ abstract class PartitioningAwareFileIndex( case None => rootPaths.map { path => - // Make the path qualified (consistent with listLeafFiles and listLeafFilesInParallel). + // Make the path qualified (consistent with listLeafFiles and bulkListLeafFiles). val qualifiedPath = path.getFileSystem(hadoopConf).makeQualified(path) if (leafFiles.contains(qualifiedPath)) qualifiedPath.getParent else qualifiedPath }.toSet } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-26729][K8S] Make image names under test configurable
This is an automated email from the ASF dual-hosted git repository. vanzin pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 61d9946 [SPARK-26729][K8S] Make image names under test configurable 61d9946 is described below commit 61d99462a0366b1a9ef3ef966673717a17d2d533 Author: Rob Vesse AuthorDate: Wed Mar 20 14:28:27 2019 -0700 [SPARK-26729][K8S] Make image names under test configurable ## What changes were proposed in this pull request? Allow specifying system properties to customise the image names for the images used in the integration testing. Useful if your CI/CD pipeline or policy requires using a different naming format. This is one part of addressing SPARK-26729, I plan to have a follow up patch that will also make the names configurable when using `docker-image-tool.sh` ## How was this patch tested? Ran integration tests against custom images generated by our CI/CD pipeline that do not follow Spark's existing hardcoded naming conventions using the new system properties to override the image names appropriately: ``` mvn clean integration-test -pl :spark-kubernetes-integration-tests_${SCALA_VERSION} \ -Pkubernetes -Pkubernetes-integration-tests \ -P${SPARK_HADOOP_PROFILE} -Dhadoop.version=${HADOOP_VERSION} \ -Dspark.kubernetes.test.sparkTgz=${TARBALL} \ -Dspark.kubernetes.test.imageTag=${TAG} \ -Dspark.kubernetes.test.imageRepo=${REPO} \ -Dspark.kubernetes.test.namespace=${K8S_NAMESPACE} \ -Dspark.kubernetes.test.kubeConfigContext=${K8S_CONTEXT} \ -Dspark.kubernetes.test.deployMode=${K8S_TEST_DEPLOY_MODE} \ -Dspark.kubernetes.test.jvmImage=apache-spark \ -Dspark.kubernetes.test.pythonImage=apache-spark-py \ -Dspark.kubernetes.test.rImage=apache-spark-r \ -Dtest.include.tags=k8s ... [INFO] --- scalatest-maven-plugin:1.0:test (integration-test) spark-kubernetes-integration-tests_2.12 --- Discovery starting. Discovery completed in 230 milliseconds. Run starting. Expected test count is: 15 KubernetesSuite: - Run SparkPi with no resources - Run SparkPi with a very long application name. - Use SparkLauncher.NO_RESOURCE - Run SparkPi with a master URL without a scheme. - Run SparkPi with an argument. - Run SparkPi with custom labels, annotations, and environment variables. - Run extraJVMOptions check on driver - Run SparkRemoteFileTest using a remote data file - Run SparkPi with env and mount secrets. - Run PySpark on simple pi.py example - Run PySpark with Python2 to test a pyfiles example - Run PySpark with Python3 to test a pyfiles example - Run PySpark with memory customization - Run in client mode. - Start pod creation from template Run completed in 8 minutes, 33 seconds. Total number of tests run: 15 Suites: completed 2, aborted 0 Tests: succeeded 15, failed 0, canceled 0, ignored 0, pending 0 All tests passed. ``` Closes #23846 from rvesse/SPARK-26729. 
Authored-by: Rob Vesse Signed-off-by: Marcelo Vanzin --- .../kubernetes/integration-tests/README.md | 37 -- .../dev/dev-run-integration-tests.sh | 31 ++ .../kubernetes/integration-tests/pom.xml | 3 ++ .../k8s/integrationtest/KubernetesSuite.scala | 6 ++-- .../deploy/k8s/integrationtest/TestConstants.scala | 3 ++ 5 files changed, 75 insertions(+), 5 deletions(-) diff --git a/resource-managers/kubernetes/integration-tests/README.md b/resource-managers/kubernetes/integration-tests/README.md index 73fc058..57c26e1 100644 --- a/resource-managers/kubernetes/integration-tests/README.md +++ b/resource-managers/kubernetes/integration-tests/README.md @@ -28,7 +28,7 @@ The main useful options are outlined below. ## Using a different backend The integration test backend i.e. the K8S cluster used for testing is controlled by the `--deploy-mode` option. By -default this is set to `minikube`, the available backends are their perequisites are as follows. +default this is set to `minikube`, the available backends are their prerequisites are as follows. ### `minikube` @@ -46,7 +46,7 @@ environment variable appropriately. ### `cloud` -These cloud backend configures the tests to use an arbitrary Kubernetes cluster running in the cloud or otherwise. +The cloud backend configures the tests to use an arbitrary Kubernetes cluster running in the cloud or otherwise. The `cloud` backend auto-configures the cluster to use from your K8S config file, this is assumed to be `~/.kube/config` unless the `KUBECONFIG` environment variable is set to override th
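As a rough sketch of how the new knobs plug in (the property names come from the invocation above; the default values and the exact constant definitions in `TestConstants.scala` are assumptions here, since that part of the diff is truncated):

```scala
// Hypothetical reading of the new system properties; the fallback image names
// are the conventional ones produced by docker-image-tool.sh and are an
// assumption, not taken from the patch.
val jvmImage    = sys.props.getOrElse("spark.kubernetes.test.jvmImage", "spark")
val pythonImage = sys.props.getOrElse("spark.kubernetes.test.pythonImage", "spark-py")
val rImage      = sys.props.getOrElse("spark.kubernetes.test.rImage", "spark-r")
```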
[spark] branch master updated: [SPARK-27215][CORE] Correct the kryo configurations
This is an automated email from the ASF dual-hosted git repository. vanzin pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 93c6d2a [SPARK-27215][CORE] Correct the kryo configurations 93c6d2a is described below commit 93c6d2a198d1b3070eea32210042873c68d0d5f7 Author: Lantao Jin AuthorDate: Wed Mar 20 14:27:05 2019 -0700 [SPARK-27215][CORE] Correct the kryo configurations ## What changes were proposed in this pull request? ```scala val KRYO_USE_UNSAFE = ConfigBuilder("spark.kyro.unsafe") .booleanConf .createWithDefault(false) val KRYO_USE_POOL = ConfigBuilder("spark.kyro.pool") .booleanConf .createWithDefault(true) ``` **kyro should be kryo** ## How was this patch tested? no need Closes #24156 from LantaoJin/SPARK-27215. Authored-by: Lantao Jin Signed-off-by: Marcelo Vanzin --- core/src/main/scala/org/apache/spark/internal/config/Kryo.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/config/Kryo.scala b/core/src/main/scala/org/apache/spark/internal/config/Kryo.scala index 7873141..717a099 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/Kryo.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/Kryo.scala @@ -34,11 +34,11 @@ private[spark] object Kryo { .toSequence .createWithDefault(Nil) - val KRYO_USE_UNSAFE = ConfigBuilder("spark.kyro.unsafe") + val KRYO_USE_UNSAFE = ConfigBuilder("spark.kryo.unsafe") .booleanConf .createWithDefault(false) - val KRYO_USE_POOL = ConfigBuilder("spark.kyro.pool") + val KRYO_USE_POOL = ConfigBuilder("spark.kryo.pool") .booleanConf .createWithDefault(true) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
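Usage after the fix, with the correctly spelled keys (the values shown are just the defaults made explicit):

```scala
import org.apache.spark.SparkConf

val conf = new SparkConf()
  .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
  .set("spark.kryo.unsafe", "false")  // previously registered under the misspelled spark.kyro.unsafe
  .set("spark.kryo.pool", "true")     // previously registered under the misspelled spark.kyro.pool
```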
[spark] branch master updated: [SPARK-27094][YARN] Work around RackResolver swallowing thread interrupt.
This is an automated email from the ASF dual-hosted git repository. vanzin pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new ec5e342 [SPARK-27094][YARN] Work around RackResolver swallowing thread interrupt. ec5e342 is described below commit ec5e34205a8b0e2f6bc4287b86e7eac269452ffb Author: Marcelo Vanzin AuthorDate: Wed Mar 20 11:48:06 2019 -0700 [SPARK-27094][YARN] Work around RackResolver swallowing thread interrupt. To avoid the case where the YARN libraries would swallow the exception and prevent YarnAllocator from shutting down, call the offending code in a separate thread, so that the parent thread can respond appropriately to the shut down. As a safeguard, also explicitly stop the executor launch thread pool when shutting down the application, to prevent new executors from coming up after the application started its shutdown. Tested with unit tests + some internal tests on real cluster. Closes #24017 from vanzin/SPARK-27094. Authored-by: Marcelo Vanzin Signed-off-by: Marcelo Vanzin --- .../spark/deploy/yarn/ApplicationMaster.scala | 154 +++-- .../apache/spark/deploy/yarn/YarnAllocator.scala | 45 +- 2 files changed, 120 insertions(+), 79 deletions(-) diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala index 9ed3b78..743c2e0 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala @@ -550,88 +550,94 @@ private[spark] class ApplicationMaster( reporterThread.join() } - private def launchReporterThread(): Thread = { -// The number of failures in a row until Reporter thread give up + private def allocationThreadImpl(): Unit = { +// The number of failures in a row until the allocation thread gives up. 
val reporterMaxFailures = sparkConf.get(MAX_REPORTER_THREAD_FAILURES) - -val t = new Thread { - override def run() { -var failureCount = 0 -while (!finished) { - try { -if (allocator.getNumExecutorsFailed >= maxNumExecutorFailures) { - finish(FinalApplicationStatus.FAILED, -ApplicationMaster.EXIT_MAX_EXECUTOR_FAILURES, -s"Max number of executor failures ($maxNumExecutorFailures) reached") -} else if (allocator.isAllNodeBlacklisted) { - finish(FinalApplicationStatus.FAILED, -ApplicationMaster.EXIT_MAX_EXECUTOR_FAILURES, -"Due to executor failures all available nodes are blacklisted") -} else { - logDebug("Sending progress") - allocator.allocateResources() -} -failureCount = 0 - } catch { -case i: InterruptedException => // do nothing -case e: ApplicationAttemptNotFoundException => - failureCount += 1 - logError("Exception from Reporter thread.", e) - finish(FinalApplicationStatus.FAILED, ApplicationMaster.EXIT_REPORTER_FAILURE, -e.getMessage) -case e: Throwable => - failureCount += 1 - if (!NonFatal(e) || failureCount >= reporterMaxFailures) { -finish(FinalApplicationStatus.FAILED, - ApplicationMaster.EXIT_REPORTER_FAILURE, "Exception was thrown " + -s"$failureCount time(s) from Reporter thread.") - } else { -logWarning(s"Reporter thread fails $failureCount time(s) in a row.", e) - } +var failureCount = 0 +while (!finished) { + try { +if (allocator.getNumExecutorsFailed >= maxNumExecutorFailures) { + finish(FinalApplicationStatus.FAILED, +ApplicationMaster.EXIT_MAX_EXECUTOR_FAILURES, +s"Max number of executor failures ($maxNumExecutorFailures) reached") +} else if (allocator.isAllNodeBlacklisted) { + finish(FinalApplicationStatus.FAILED, +ApplicationMaster.EXIT_MAX_EXECUTOR_FAILURES, +"Due to executor failures all available nodes are blacklisted") +} else { + logDebug("Sending progress") + allocator.allocateResources() +} +failureCount = 0 + } catch { +case i: InterruptedException => // do nothing +case e: ApplicationAttemptNotFoundException => + failureCount += 1 + logError("Exception from Reporter thread.", e) + finish(FinalApplicationStatus.FAILED, ApplicationMaster.EXIT_REPORTER_FAILURE, +e.getMessage) +case e: Throwable => +
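A conceptual sketch of the workaround described above (not the actual Spark code, whose diff is truncated): run the call that may swallow `InterruptedException` on a helper thread, so the calling thread keeps its own interrupt status and can still be interrupted while waiting in `join()`.

```scala
// Conceptual only; the helper name and structure are invented for illustration.
def runOnHelperThread[T](threadName: String)(body: => T): T = {
  var outcome: Either[Throwable, T] = null
  val helper = new Thread(new Runnable {
    override def run(): Unit =
      outcome = try Right(body) catch { case e: Throwable => Left(e) }
  })
  helper.setName(threadName)
  helper.setDaemon(true)
  helper.start()
  // join() still honors interrupts in the caller, unlike the wrapped call,
  // and thread termination before join() returns makes the write visible.
  helper.join()
  outcome.fold(e => throw e, identity)
}

// Usage (hypothetical call): runOnHelperThread("rack-resolver") { resolveHostsToRacks(hosts) }
```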
[spark] branch master updated: [SPARK-26839][SQL] Work around classloader changes in Java 9 for Hive isolation
This is an automated email from the ASF dual-hosted git repository. srowen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new c65f9b2 [SPARK-26839][SQL] Work around classloader changes in Java 9 for Hive isolation c65f9b2 is described below commit c65f9b2bc35c2926bb3658f65fe4f8a0b8e9fe4a Author: Sean Owen AuthorDate: Wed Mar 20 09:12:52 2019 -0500 [SPARK-26839][SQL] Work around classloader changes in Java 9 for Hive isolation Note, this doesn't really resolve the JIRA, but makes the changes we can make so far that would be required to solve it. ## What changes were proposed in this pull request? Java 9+ changed how ClassLoaders work. The two most salient points: - The boot classloader no longer 'sees' the platform classes. A new 'platform classloader' does and should be the parent of new ClassLoaders - The system classloader is no longer a URLClassLoader, so we can't get the URLs of JARs in its classpath ## How was this patch tested? We'll see whether Java 8 tests still pass here. Java 11 tests do not fully pass at this point; more notes below. This does make progress on the failures though. (NB: to test with Java 11, you need to build with Java 8 first, setting JAVA_HOME and java's executable correctly, then switch both to Java 11 for testing.) Closes #24057 from srowen/SPARK-26839. Authored-by: Sean Owen Signed-off-by: Sean Owen --- .../org/apache/spark/sql/hive/HiveUtils.scala | 20 +++--- .../sql/hive/client/IsolatedClientLoader.scala | 77 +- 2 files changed, 59 insertions(+), 38 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala index 38bbe64..a7f40c6 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala @@ -19,8 +19,6 @@ package org.apache.spark.sql.hive import java.io.File import java.net.{URL, URLClassLoader} -import java.nio.charset.StandardCharsets -import java.sql.Timestamp import java.util.Locale import java.util.concurrent.TimeUnit @@ -28,12 +26,11 @@ import scala.collection.JavaConverters._ import scala.collection.mutable.HashMap import scala.language.implicitConversions +import org.apache.commons.lang3.{JavaVersion, SystemUtils} import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.hive.common.`type`.HiveDecimal import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.conf.HiveConf.ConfVars import org.apache.hadoop.hive.ql.session.SessionState -import org.apache.hadoop.hive.serde2.io.{DateWritable, TimestampWritable} import org.apache.hadoop.util.VersionInfo import org.apache.spark.{SparkConf, SparkContext} @@ -329,10 +326,17 @@ private[spark] object HiveUtils extends Logging { val classLoader = Utils.getContextOrSparkClassLoader val jars = allJars(classLoader) - if (jars.length == 0) { -throw new IllegalArgumentException( - "Unable to locate hive jars to connect to metastore. " + -s"Please set ${HIVE_METASTORE_JARS.key}.") + if (SystemUtils.isJavaVersionAtLeast(JavaVersion.JAVA_9)) { +// Do nothing. The system classloader is no longer a URLClassLoader in Java 9, +// so it won't match the case in allJars above. It no longer exposes URLs of +// the system classpath + } else { +// Verify at least one jar was found +if (jars.length == 0) { + throw new IllegalArgumentException( +"Unable to locate hive jars to connect to metastore. 
" + + s"Please set ${HIVE_METASTORE_JARS.key}.") +} } logInfo( diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala index efa97b2..98999eb 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala @@ -25,6 +25,7 @@ import java.util import scala.util.Try import org.apache.commons.io.{FileUtils, IOUtils} +import org.apache.commons.lang3.{JavaVersion, SystemUtils} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.hive.conf.HiveConf.ConfVars @@ -157,7 +158,6 @@ private[hive] object IsolatedClientLoader extends Logging { * @param isolationOn When true, custom versions of barrier classes will be constructed. Must be *true unless loading the version of hive that is on Sparks classloader. * @param sharesHadoopClasses When true, we will share Hadoop classes between Spark and - * @param rootClassLo
[spark] branch master updated: [SPARK-27201][WEBUI] Toggle full job description on click
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new ef2d63b [SPARK-27201][WEBUI] Toggle full job description on click ef2d63b is described below commit ef2d63bfb1859232d166320896957c0543c059cc Author: Gengliang Wang AuthorDate: Wed Mar 20 21:29:13 2019 +0900 [SPARK-27201][WEBUI] Toggle full job description on click ## What changes were proposed in this pull request? Previously, in https://github.com/apache/spark/pull/6646 there was an improvement to show full job description after double clicks. I think this is a bit hard to be noticed by some users. I suggest changing the event to one click. Also, after the full description is shown, another click should be able to hide the overflow text again. Before click: ![short](https://user-images.githubusercontent.com/1097932/54608784-79bfca80-4a8c-11e9-912b-30799be0d6cb.png) After click: ![full](https://user-images.githubusercontent.com/1097932/54608790-7b898e00-4a8c-11e9-9251-86061158db68.png) Click again: ![short](https://user-images.githubusercontent.com/1097932/54608784-79bfca80-4a8c-11e9-912b-30799be0d6cb.png) ## How was this patch tested? Manually check. Closes #24145 from gengliangwang/showDescriptionDetail. Authored-by: Gengliang Wang Signed-off-by: Hyukjin Kwon --- .../main/resources/org/apache/spark/ui/static/additional-metrics.js | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/core/src/main/resources/org/apache/spark/ui/static/additional-metrics.js b/core/src/main/resources/org/apache/spark/ui/static/additional-metrics.js index 3c8..3798dc4 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/additional-metrics.js +++ b/core/src/main/resources/org/apache/spark/ui/static/additional-metrics.js @@ -83,8 +83,8 @@ $(function() { $(this).parent().find('input[type="checkbox"]').trigger('click'); }); -// Trigger a double click on the span to show full job description. -$(".description-input").dblclick(function() { - $(this).removeClass("description-input").addClass("description-input-full"); +// Show/hide full job description on click event. +$(".description-input").click(function() { +$(this).toggleClass("description-input-full"); }); }); - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-27199][SQL] Replace TimeZone by ZoneId in TimestampFormatter API
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 1882912 [SPARK-27199][SQL] Replace TimeZone by ZoneId in TimestampFormatter API 1882912 is described below commit 1882912cca4921d3d8c8632b3bb34e69e8119791 Author: Maxim Gekk AuthorDate: Wed Mar 20 21:28:11 2019 +0900 [SPARK-27199][SQL] Replace TimeZone by ZoneId in TimestampFormatter API ## What changes were proposed in this pull request? In the PR, I propose to use `ZoneId` instead of `TimeZone` in: - the `apply` and `getFractionFormatter ` methods of the `TimestampFormatter` object, - and in implementations of the `TimestampFormatter` trait like `FractionTimestampFormatter`. The reason of the changes is to avoid unnecessary conversion from `TimeZone` to `ZoneId` because `ZoneId` is used in `TimestampFormatter` implementations internally, and the conversion is performed via `String` which is not for free. Also taking into account that `TimeZone` instances are converted from `String` in some cases, the worse case looks like `String` -> `TimeZone` -> `String` -> `ZoneId`. The PR eliminates the unneeded conversions. ## How was this patch tested? It was tested by `DateExpressionsSuite`, `DateTimeUtilsSuite` and `TimestampFormatterSuite`. Closes #24141 from MaxGekk/zone-id. Authored-by: Maxim Gekk Signed-off-by: Hyukjin Kwon --- .../spark/sql/catalyst/catalog/interface.scala | 3 +- .../spark/sql/catalyst/csv/CSVInferSchema.scala| 2 +- .../apache/spark/sql/catalyst/csv/CSVOptions.scala | 5 +-- .../sql/catalyst/csv/UnivocityGenerator.scala | 2 +- .../spark/sql/catalyst/csv/UnivocityParser.scala | 2 +- .../spark/sql/catalyst/expressions/Cast.scala | 2 +- .../catalyst/expressions/datetimeExpressions.scala | 21 ++-- .../spark/sql/catalyst/json/JSONOptions.scala | 5 +-- .../spark/sql/catalyst/json/JacksonGenerator.scala | 2 +- .../spark/sql/catalyst/json/JacksonParser.scala| 2 +- .../spark/sql/catalyst/json/JsonInferSchema.scala | 2 +- .../spark/sql/catalyst/util/DateTimeUtils.scala| 4 +-- .../sql/catalyst/util/TimestampFormatter.scala | 31 +- .../sql/catalyst/csv/UnivocityParserSuite.scala| 8 +++-- .../expressions/DateExpressionsSuite.scala | 3 +- .../sql/catalyst/util/DateTimeTestUtils.scala | 3 +- .../sql/catalyst/util/DateTimeUtilsSuite.scala | 2 +- .../spark/sql/util/TimestampFormatterSuite.scala | 31 -- .../apache/spark/sql/execution/HiveResult.scala| 2 +- .../execution/datasources/PartitioningUtils.scala | 37 +++--- .../execution/datasources/jdbc/JDBCRelation.scala | 2 +- .../parquet/ParquetPartitionDiscoverySuite.scala | 21 ++-- 22 files changed, 99 insertions(+), 93 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala index 69b5cb4..6006637 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.catalyst.catalog import java.net.URI +import java.time.ZoneOffset import java.util.Date import scala.collection.mutable @@ -477,7 +478,7 @@ object CatalogColumnStat extends Logging { val VERSION = 2 private def getTimestampFormatter(): TimestampFormatter = { -TimestampFormatter(format = "-MM-dd HH:mm:ss.SS", timeZone = DateTimeUtils.TimeZoneUTC) 
+TimestampFormatter(format = "-MM-dd HH:mm:ss.SS", zoneId = ZoneOffset.UTC) } /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala index 4dd4104..ae9f057 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala @@ -29,7 +29,7 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable { private val timestampParser = TimestampFormatter( options.timestampFormat, -options.timeZone, +options.zoneId, options.locale) private val decimalParser = { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala index 90c96d1..1268fcf 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions
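The shape of the change at call sites, paraphrased (TimestampFormatter is an internal Catalyst API, so the signatures shown are as of this commit and the pattern string is a placeholder):

```scala
import java.time.ZoneId
import java.util.TimeZone

// Before: a TimeZone was passed in and converted to a ZoneId inside the
// formatter, going TimeZone -> String -> ZoneId on the way, e.g.
//   TimestampFormatter(format = pattern, timeZone = DateTimeUtils.TimeZoneUTC)
//
// After: the ZoneId goes in directly and the round-trip disappears, e.g.
//   TimestampFormatter(format = pattern, zoneId = ZoneOffset.UTC)

// The conversion being eliminated, for reference (it goes via the zone's string ID):
val zone: ZoneId = TimeZone.getTimeZone("UTC").toZoneId
```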
[spark] branch master updated: [SPARK-27200][WEBUI][HISTORYSERVER] History Environment tab must sort Configurations/Properties by default
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 1f692e5 [SPARK-27200][WEBUI][HISTORYSERVER] History Environment tab must sort Configurations/Properties by default 1f692e5 is described below commit 1f692e522cefed0fcc10096c5ca5ce77650636f3 Author: Ajith AuthorDate: Wed Mar 20 20:16:17 2019 +0900 [SPARK-27200][WEBUI][HISTORYSERVER] History Environment tab must sort Configurations/Properties by default Environment Page in SparkUI have all the configuration sorted by key. But this is not the case in History server case, to keep UX same, we can have it sorted in history server too ## What changes were proposed in this pull request? On render of Env page the properties are sorted before creating page ## How was this patch tested? Manually tested in UI Closes #24143 from ajithme/historyenv. Authored-by: Ajith Signed-off-by: Hyukjin Kwon --- .../main/scala/org/apache/spark/ui/env/EnvironmentPage.scala | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/ui/env/EnvironmentPage.scala b/core/src/main/scala/org/apache/spark/ui/env/EnvironmentPage.scala index cbb8b3c..cf0815b 100644 --- a/core/src/main/scala/org/apache/spark/ui/env/EnvironmentPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/env/EnvironmentPage.scala @@ -39,15 +39,15 @@ private[ui] class EnvironmentPage( "Scala Version" -> appEnv.runtime.scalaVersion) val runtimeInformationTable = UIUtils.listingTable( - propertyHeader, jvmRow, jvmInformation, fixedWidth = true) + propertyHeader, jvmRow, jvmInformation.toSeq.sorted, fixedWidth = true) val sparkPropertiesTable = UIUtils.listingTable(propertyHeader, propertyRow, - Utils.redact(conf, appEnv.sparkProperties.toSeq), fixedWidth = true) + Utils.redact(conf, appEnv.sparkProperties.sorted), fixedWidth = true) val hadoopPropertiesTable = UIUtils.listingTable(propertyHeader, propertyRow, - Utils.redact(conf, appEnv.hadoopProperties.toSeq), fixedWidth = true) + Utils.redact(conf, appEnv.hadoopProperties.sorted), fixedWidth = true) val systemPropertiesTable = UIUtils.listingTable( - propertyHeader, propertyRow, appEnv.systemProperties, fixedWidth = true) + propertyHeader, propertyRow, appEnv.systemProperties.sorted, fixedWidth = true) val classpathEntriesTable = UIUtils.listingTable( - classPathHeaders, classPathRow, appEnv.classpathEntries, fixedWidth = true) + classPathHeaders, classPathRow, appEnv.classpathEntries.sorted, fixedWidth = true) val content =
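The mechanism is just the default ordering on `Seq[(String, String)]`, which sorts by key first, so the History Server environment page now lists properties the same way the live UI does. A tiny stand-alone illustration (property values invented):

```scala
val sparkProperties: Seq[(String, String)] = Seq(
  "spark.serializer" -> "org.apache.spark.serializer.KryoSerializer",
  "spark.app.name"   -> "example")

// Tuples of Strings sort lexicographically by the first element, then the second.
sparkProperties.sorted
// => List((spark.app.name,example), (spark.serializer,org.apache.spark.serializer.KryoSerializer))
```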
[spark] branch master updated: [SPARK-27099][SQL] Add 'xxhash64' for hashing arbitrary columns to Long
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new b67d369 [SPARK-27099][SQL] Add 'xxhash64' for hashing arbitrary columns to Long b67d369 is described below commit b67d36957287c1fbefa1996e6a4a009a75c4c3f8 Author: Huon Wilson AuthorDate: Wed Mar 20 16:34:34 2019 +0800 [SPARK-27099][SQL] Add 'xxhash64' for hashing arbitrary columns to Long ## What changes were proposed in this pull request? This introduces a new SQL function 'xxhash64' for getting a 64-bit hash of an arbitrary number of columns. This is designed to exactly mimic the 32-bit `hash`, which uses MurmurHash3. The name is designed to be more future-proof than the 'hash', by indicating the exact algorithm used, similar to md5 and the sha hashes. ## How was this patch tested? The tests for the existing `hash` function were duplicated to run with `xxhash64`. Closes #24019 from huonw/hash64. Authored-by: Huon Wilson Signed-off-by: Wenchen Fan --- R/pkg/NAMESPACE | 1 + R/pkg/R/functions.R | 19 +++ R/pkg/R/generics.R| 4 R/pkg/tests/fulltests/test_sparkSQL.R | 2 +- python/pyspark/sql/functions.py | 13 + .../apache/spark/sql/catalyst/analysis/Analyzer.scala | 5 + .../sql/catalyst/analysis/FunctionRegistry.scala | 1 + .../apache/spark/sql/catalyst/expressions/hash.scala | 9 - .../analysis/ExpressionTypeCheckingSuite.scala| 1 + .../catalyst/expressions/HashExpressionsSuite.scala | 5 + .../main/scala/org/apache/spark/sql/functions.scala | 13 + .../apache/spark/sql/DataFrameFunctionsSuite.scala| 4 +++- .../scala/org/apache/spark/sql/DataFrameSuite.scala | 19 +++ .../scala/org/apache/spark/sql/SQLQuerySuite.scala| 11 +++ 14 files changed, 104 insertions(+), 3 deletions(-) diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 1dcad16..f9d9494 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -408,6 +408,7 @@ exportMethods("%<=>%", "weekofyear", "when", "window", + "xxhash64", "year") exportClasses("GroupedData") diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 8f425b1..d91896a 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -736,6 +736,25 @@ setMethod("hash", }) #' @details +#' \code{xxhash64}: Calculates the hash code of given columns using the 64-bit +#' variant of the xxHash algorithm, and returns the result as a long +#' column. +#' +#' @rdname column_misc_functions +#' @aliases xxhash64 xxhash64,Column-method +#' @note xxhash64 since 3.0.0 +setMethod("xxhash64", + signature(x = "Column"), + function(x, ...) { +jcols <- lapply(list(x, ...), function(x) { + stopifnot(class(x) == "Column") + x@jc +}) +jc <- callJStatic("org.apache.spark.sql.functions", "xxhash64", jcols) +column(jc) + }) + +#' @details #' \code{dayofmonth}: Extracts the day of the month as an integer from a #' given date/timestamp/string. #' diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index fcb511e..f849dd1 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -1394,6 +1394,10 @@ setGeneric("weekofyear", function(x) { standardGeneric("weekofyear") }) #' @name NULL setGeneric("window", function(x, ...) { standardGeneric("window") }) +#' @rdname column_misc_functions +#' @name NULL +setGeneric("xxhash64", function(x, ...) 
{ standardGeneric("xxhash64") }) + #' @rdname column_datetime_functions #' @name NULL setGeneric("year", function(x) { standardGeneric("year") }) diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R index c9d6134..cebd0f8 100644 --- a/R/pkg/tests/fulltests/test_sparkSQL.R +++ b/R/pkg/tests/fulltests/test_sparkSQL.R @@ -1390,7 +1390,7 @@ test_that("column functions", { c9 <- signum(c) + sin(c) + sinh(c) + size(c) + stddev(c) + soundex(c) + sqrt(c) + sum(c) c10 <- sumDistinct(c) + tan(c) + tanh(c) + degrees(c) + radians(c) c11 <- to_date(c) + trim(c) + unbase64(c) + unhex(c) + upper(c) - c12 <- variance(c) + ltrim(c, "a") + rtrim(c, "b") + trim(c, "c") + c12 <- variance(c) + xxhash64(c) + ltrim(c, "a") + rtrim(c, "b") + trim(c, "c") c13 <- lead("col", 1) + lead(c, 1) + lag("col", 1) + lag(c, 1) c14 <- cume_dist() + ntile(1) + corr(c, c1) c15 <- dense_rank() + percent_rank() + rank() + row_number() diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 0326613..bc28c9d 100644 --- a/