spark git commit: [SPARK-21699][SQL] Remove unused getTableOption in ExternalCatalog
Repository: spark
Updated Branches:
  refs/heads/branch-2.2 3ca55eaaf -> c90949698

[SPARK-21699][SQL] Remove unused getTableOption in ExternalCatalog

## What changes were proposed in this pull request?

This patch removes the unused SessionCatalog.getTableMetadataOption and ExternalCatalog.getTableOption.

## How was this patch tested?

Removed the test case.

Author: Reynold Xin

Closes #18912 from rxin/remove-getTableOption.

(cherry picked from commit 584c7f14370cdfafdc6cd554b2760b7ce7709368)
Signed-off-by: Reynold Xin

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c9094969
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c9094969
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c9094969
Branch: refs/heads/branch-2.2
Commit: c909496983314b48dd4d8587e586b553b04ff0ce
Parents: 3ca55ea
Author: Reynold Xin
Authored: Thu Aug 10 18:56:25 2017 -0700
Committer: Reynold Xin
Committed: Thu Aug 10 18:56:43 2017 -0700

----------------------------------------------------------------------
 .../sql/catalyst/catalog/ExternalCatalog.scala      |  2 --
 .../sql/catalyst/catalog/InMemoryCatalog.scala      |  4 ----
 .../sql/catalyst/catalog/SessionCatalog.scala       | 17 +++--------------
 .../catalyst/catalog/SessionCatalogSuite.scala      | 11 -----------
 .../spark/sql/hive/HiveExternalCatalog.scala        |  4 ----
 5 files changed, 3 insertions(+), 35 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/spark/blob/c9094969/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala
index 974ef90..18644b0 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala
@@ -162,8 +162,6 @@ abstract class ExternalCatalog
 
   def getTable(db: String, table: String): CatalogTable
 
-  def getTableOption(db: String, table: String): Option[CatalogTable]
-
   def tableExists(db: String, table: String): Boolean
 
   def listTables(db: String): Seq[String]

http://git-wip-us.apache.org/repos/asf/spark/blob/c9094969/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala
index 864ee48..bf8542c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala
@@ -315,10 +315,6 @@ class InMemoryCatalog(
     catalog(db).tables(table).table
   }
 
-  override def getTableOption(db: String, table: String): Option[CatalogTable] = synchronized {
-    if (!tableExists(db, table)) None else Option(catalog(db).tables(table).table)
-  }
-
   override def tableExists(db: String, table: String): Boolean = synchronized {
     requireDbExists(db)
     catalog(db).tables.contains(table)

http://git-wip-us.apache.org/repos/asf/spark/blob/c9094969/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala
index 57006bf..8d9fb4c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala
@@ -388,9 +388,10 @@ class SessionCatalog(
 
   /**
    * Retrieve the metadata of an existing permanent table/view. If no database is specified,
-   * assume the table/view is in the current database. If the specified table/view is not found
-   * in the database then a [[NoSuchTableException]] is thrown.
+   * assume the table/view is in the current database.
    */
+  @throws[NoSuchDatabaseException]
+  @throws[NoSuchTableException]
  def getTableMetadata(name: TableIdentifier): CatalogTable = {
     val db = formatDatabaseName(name.database.getOrElse(getCurrentDatabase))
     val table = formatTableName(name.table)
@@ -400,18 +401,6 @@ class SessionCatalog(
   }
 
   /**
-   * Retrieve the metadata of an existing metastore table.
-   * If no database is specified, assume the table is in the current database.
-   * If the specified table is not found in the database then return None.
-   */
-  def getTableMetadataOption(name: TableIdentifier): Option[CatalogTable] = {
-    val db = formatDatabaseName(name.database.getOrElse(getCurrentDatabase))
-    val table = formatTableName(name.table)
-    requireDbExists(db)
-    externalCatalog.getTableOption(db, table)
-  }
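Note: with getTableMetadataOption gone, callers that want Option semantics can translate the declared NoSuchTableException themselves. A minimal sketch, assuming a SessionCatalog instance named `catalog`; the wrapper itself is hypothetical and not part of this patch:

import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.analysis.NoSuchTableException
import org.apache.spark.sql.catalyst.catalog.{CatalogTable, SessionCatalog}

// Illustrative wrapper: recover Option semantics from the surviving
// getTableMetadata, which throws NoSuchTableException for a missing table.
def metadataOrNone(catalog: SessionCatalog, name: TableIdentifier): Option[CatalogTable] =
  try Some(catalog.getTableMetadata(name))
  catch { case _: NoSuchTableException => None }

A missing database still raises NoSuchDatabaseException, which mirrors the removed method's requireDbExists check.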
spark git commit: [SPARK-21699][SQL] Remove unused getTableOption in ExternalCatalog
Repository: spark
Updated Branches:
  refs/heads/master ca6955858 -> 584c7f143

[SPARK-21699][SQL] Remove unused getTableOption in ExternalCatalog

## What changes were proposed in this pull request?

This patch removes the unused SessionCatalog.getTableMetadataOption and ExternalCatalog.getTableOption.

## How was this patch tested?

Removed the test case.

Author: Reynold Xin

Closes #18912 from rxin/remove-getTableOption.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/584c7f14
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/584c7f14
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/584c7f14
Branch: refs/heads/master
Commit: 584c7f14370cdfafdc6cd554b2760b7ce7709368
Parents: ca69558
Author: Reynold Xin
Authored: Thu Aug 10 18:56:25 2017 -0700
Committer: Reynold Xin
Committed: Thu Aug 10 18:56:25 2017 -0700

----------------------------------------------------------------------
 .../sql/catalyst/catalog/ExternalCatalog.scala      |  2 --
 .../sql/catalyst/catalog/InMemoryCatalog.scala      |  4 ----
 .../sql/catalyst/catalog/SessionCatalog.scala       | 17 +++--------------
 .../catalyst/catalog/SessionCatalogSuite.scala      | 11 -----------
 .../spark/sql/hive/HiveExternalCatalog.scala        |  4 ----
 5 files changed, 3 insertions(+), 35 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/spark/blob/584c7f14/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala
index 68644f4..d4c58db 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala
@@ -167,8 +167,6 @@ abstract class ExternalCatalog
 
   def getTable(db: String, table: String): CatalogTable
 
-  def getTableOption(db: String, table: String): Option[CatalogTable]
-
   def tableExists(db: String, table: String): Boolean
 
   def listTables(db: String): Seq[String]

http://git-wip-us.apache.org/repos/asf/spark/blob/584c7f14/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala
index 37e9eea..98370c1 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala
@@ -326,10 +326,6 @@ class InMemoryCatalog(
     catalog(db).tables(table).table
   }
 
-  override def getTableOption(db: String, table: String): Option[CatalogTable] = synchronized {
-    if (!tableExists(db, table)) None else Option(catalog(db).tables(table).table)
-  }
-
   override def tableExists(db: String, table: String): Boolean = synchronized {
     requireDbExists(db)
     catalog(db).tables.contains(table)

http://git-wip-us.apache.org/repos/asf/spark/blob/584c7f14/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala
index b44d2ee..e3237a8 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala
@@ -387,9 +387,10 @@ class SessionCatalog(
 
   /**
    * Retrieve the metadata of an existing permanent table/view. If no database is specified,
-   * assume the table/view is in the current database. If the specified table/view is not found
-   * in the database then a [[NoSuchTableException]] is thrown.
+   * assume the table/view is in the current database.
    */
+  @throws[NoSuchDatabaseException]
+  @throws[NoSuchTableException]
   def getTableMetadata(name: TableIdentifier): CatalogTable = {
     val db = formatDatabaseName(name.database.getOrElse(getCurrentDatabase))
     val table = formatTableName(name.table)
@@ -399,18 +400,6 @@ class SessionCatalog(
   }
 
   /**
-   * Retrieve the metadata of an existing metastore table.
-   * If no database is specified, assume the table is in the current database.
-   * If the specified table is not found in the database then return None.
-   */
-  def getTableMetadataOption(name: TableIdentifier): Option[CatalogTable] = {
-    val db = formatDatabaseName(name.database.getOrElse(getCurrentDatabase))
-    val table = formatTableName(name.table)
-    requireDbExists(db)
-    externalCatalog.getTableOption(db, table)
-  }
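Note: an alternative to catching the exception is a check-then-get pattern on the remaining API. A hedged sketch; the helper name and the `catalog` value are illustrative, not part of the patch:

import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.catalog.{CatalogTable, SessionCatalog}

// Illustrative helper: probe with tableExists before fetching metadata.
// Unlike the removed getTableMetadataOption, this makes two catalog calls,
// so a concurrent DROP TABLE between them can still surface as an exception.
def tableMetadataOption(catalog: SessionCatalog, name: TableIdentifier): Option[CatalogTable] =
  if (catalog.tableExists(name)) Some(catalog.getTableMetadata(name)) else None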
spark git commit: [SPARK-21638][ML] Fix RF/GBT Warning message error
Repository: spark
Updated Branches:
  refs/heads/master 95ad960ca -> ca6955858

[SPARK-21638][ML] Fix RF/GBT Warning message error

## What changes were proposed in this pull request?

When training an RF model, many warning messages like this appear:

> WARN RandomForest: Tree learning is using approximately 268492800 bytes per iteration, which exceeds requested limit maxMemoryUsage=268435456. This allows splitting 2622 nodes in this iteration.

This warning message is unnecessary and the reported byte count is inaccurate. The warning fires whenever the queued nodes cannot all be split in a single iteration, and for most workloads they cannot, so the warning is logged on nearly every iteration. A condensed sketch of the fixed loop appears after the diff.

## How was this patch tested?

The existing unit tests.

Author: Peng Meng

Closes #18868 from mpjlu/fixRFwarning.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ca695585
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ca695585
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ca695585
Branch: refs/heads/master
Commit: ca6955858cec868c878a2fd8528dbed0ef9edd3f
Parents: 95ad960
Author: Peng Meng
Authored: Thu Aug 10 21:38:03 2017 +0100
Committer: Sean Owen
Committed: Thu Aug 10 21:38:03 2017 +0100

----------------------------------------------------------------------
 .../scala/org/apache/spark/ml/tree/impl/RandomForest.scala | 9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/spark/blob/ca695585/mllib/src/main/scala/org/apache/spark/ml/tree/impl/RandomForest.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/RandomForest.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/RandomForest.scala
index 82e1ed8..f7d969f 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/RandomForest.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/RandomForest.scala
@@ -1089,7 +1089,8 @@ private[spark] object RandomForest extends Logging {
     var numNodesInGroup = 0
     // If maxMemoryInMB is set very small, we want to still try to split 1 node,
     // so we allow one iteration if memUsage == 0.
-    while (nodeStack.nonEmpty && (memUsage < maxMemoryUsage || memUsage == 0)) {
+    var groupDone = false
+    while (nodeStack.nonEmpty && !groupDone) {
       val (treeIndex, node) = nodeStack.top
       // Choose subset of features for node (if subsampling).
       val featureSubset: Option[Array[Int]] = if (metadata.subsamplingFeatures) {
@@ -1107,9 +1108,11 @@ private[spark] object RandomForest extends Logging {
         mutableTreeToNodeToIndexInfo
           .getOrElseUpdate(treeIndex, new mutable.HashMap[Int, NodeIndexInfo]())(node.id) =
           new NodeIndexInfo(numNodesInGroup, featureSubset)
+        numNodesInGroup += 1
+        memUsage += nodeMemUsage
+      } else {
+        groupDone = true
       }
-      numNodesInGroup += 1
-      memUsage += nodeMemUsage
     }
     if (memUsage > maxMemoryUsage) {
       // If maxMemoryUsage is 0, we should still allow splitting 1 node.
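The gist of the fix, as a self-contained sketch: before the patch, `memUsage += nodeMemUsage` ran even when a node was not added to the group, so the post-loop `memUsage > maxMemoryUsage` check compared an inflated figure against the limit. The names below mirror the patch, but the flat per-node cost and the Int node ids are simplifying assumptions; in Spark the cost depends on each node's feature subset and aggregate statistics:

import scala.collection.mutable

// Fill one iteration's group of nodes under a memory budget.
def selectNodesToSplit(
    nodeStack: mutable.ArrayStack[Int],
    nodeMemUsage: Long,
    maxMemoryUsage: Long): (Seq[Int], Long) = {
  val group = mutable.ArrayBuffer.empty[Int]
  var memUsage = 0L
  var groupDone = false
  while (nodeStack.nonEmpty && !groupDone) {
    if (memUsage + nodeMemUsage <= maxMemoryUsage || memUsage == 0) {
      // Node fits (or is the very first one, so a tiny budget still
      // makes progress): add it to this iteration's group.
      group += nodeStack.pop()
      memUsage += nodeMemUsage
    } else {
      // Node does not fit: stop without inflating memUsage, so the
      // caller's over-budget warning no longer fires spuriously.
      groupDone = true
    }
  }
  (group, memUsage)
}

After the fix, memUsage only ever exceeds maxMemoryUsage in the genuine corner case where a single node's cost is larger than the whole budget, which is exactly when the warning is informative.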
spark git commit: [SPARK-21669] Internal API for collecting metrics/stats during FileFormatWriter jobs
Repository: spark
Updated Branches:
  refs/heads/master 84454d7d3 -> 95ad960ca

[SPARK-21669] Internal API for collecting metrics/stats during FileFormatWriter jobs

## What changes were proposed in this pull request?

This patch introduces an internal interface for tracking metrics and/or statistics on data on the fly, as it is being written to disk during a `FileFormatWriter` job, and partially reimplements SPARK-20703 in terms of it. The interface consists of 3 traits (see the sketch after the diff below):

- `WriteTaskStats`: just a tag for classes that represent statistics collected during a `WriteTask`. The only constraint it adds is that the class should be `Serializable`, as instances of it will be collected on the driver from all executors at the end of the `WriteJob`.
- `WriteTaskStatsTracker`: a trait for classes that can actually compute statistics based on tuples that are processed by a given `WriteTask` and eventually produce a `WriteTaskStats` instance.
- `WriteJobStatsTracker`: a trait for classes that act as containers of `Serializable` state that's necessary for instantiating `WriteTaskStatsTracker` on executors and finally process the resulting collection of `WriteTaskStats`, once they're gathered back on the driver.

A potential future use of this interface is e.g. CBO stats maintenance during `INSERT INTO table ...` operations.

## How was this patch tested?

Existing tests for SPARK-20703 exercise the new code: `hive/SQLMetricsSuite`, `sql/JavaDataFrameReaderWriterSuite`, etc.

Author: Adrian Ionescu

Closes #18884 from adrian-ionescu/write-stats-tracker-api.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/95ad960c
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/95ad960c
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/95ad960c
Branch: refs/heads/master
Commit: 95ad960caf009d843ec700ee41cbccc2fa3a68a5
Parents: 84454d7
Author: Adrian Ionescu
Authored: Thu Aug 10 12:37:10 2017 -0700
Committer: Reynold Xin
Committed: Thu Aug 10 12:37:10 2017 -0700

----------------------------------------------------------------------
 .../execution/command/DataWritingCommand.scala  |  34 +--
 .../datasources/BasicWriteStatsTracker.scala    | 133 ++++++++++
 .../datasources/FileFormatWriter.scala          | 245 ++++++++++---------
 .../InsertIntoHadoopFsRelationCommand.scala     |  43 ++--
 .../datasources/WriteStatsTracker.scala         | 121 +++++++++
 .../execution/streaming/FileStreamSink.scala    |   2 +-
 .../hive/execution/InsertIntoHiveTable.scala    |   4 +-
 7 files changed, 420 insertions(+), 162 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/spark/blob/95ad960c/sql/core/src/main/scala/org/apache/spark/sql/execution/command/DataWritingCommand.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/DataWritingCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/DataWritingCommand.scala
index 700f7f8..4e1c5e4 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/DataWritingCommand.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/DataWritingCommand.scala
@@ -17,10 +17,13 @@
 
 package org.apache.spark.sql.execution.command
 
+import org.apache.hadoop.conf.Configuration
+
 import org.apache.spark.SparkContext
-import org.apache.spark.sql.execution.SQLExecution
-import org.apache.spark.sql.execution.datasources.ExecutedWriteSummary
+import org.apache.spark.sql.execution.datasources.BasicWriteJobStatsTracker
 import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics}
+import org.apache.spark.util.SerializableConfiguration
+
 
 /**
  * A special `RunnableCommand` which writes data out and updates metrics.
@@ -37,29 +40,8 @@ trait DataWritingCommand extends RunnableCommand {
     )
   }
 
-  /**
-   * Callback function that update metrics collected from the writing operation.
-   */
-  protected def updateWritingMetrics(writeSummaries: Seq[ExecutedWriteSummary]): Unit = {
-    val sparkContext = SparkContext.getActive.get
-    var numPartitions = 0
-    var numFiles = 0
-    var totalNumBytes: Long = 0L
-    var totalNumOutput: Long = 0L
-
-    writeSummaries.foreach { summary =>
-      numPartitions += summary.updatedPartitions.size
-      numFiles += summary.numOutputFile
-      totalNumBytes += summary.numOutputBytes
-      totalNumOutput += summary.numOutputRows
-    }
-
-    metrics("numFiles").add(numFiles)
-    metrics("numOutputBytes").add(totalNumBytes)
-    metrics("numOutputRows").add(totalNumOutput)
-    metrics("numParts").add(numPartitions)
-
-    val executionId = sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY)
-    SQLMetrics.postDriverMetricUpdates(sparkContext, executionId, metrics.values.toList)
+  def basicWriteJobStatsTracker(hadoopConf: Configuration): BasicWriteJobStatsTracker = {
+    val serializableHadoopConf = new SerializableConfiguration(hadoopConf)
+    new BasicWriteJobStatsTracker(serializableHadoopConf, metrics)
   }
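A minimal Scala sketch of the three traits described above. The method names and the row parameter type are assumptions for illustration; only the trait roles and the Serializable constraints come from the commit description:

import java.io.Serializable

// Tag trait: a bundle of statistics produced by one write task. Must be
// Serializable because instances travel from executors back to the driver.
trait WriteTaskStats extends Serializable

// Lives on an executor: observes files and rows as a single write task
// emits them, then folds its observations into a WriteTaskStats value.
trait WriteTaskStatsTracker {
  def newFile(filePath: String): Unit
  def newRow(row: AnyRef): Unit  // stand-in for Spark's internal row type
  def getFinalStats(): WriteTaskStats
}

// Lives on the driver: a Serializable container of whatever state executors
// need to instantiate per-task trackers, plus the job-level aggregation of
// all collected WriteTaskStats once the job finishes.
trait WriteJobStatsTracker extends Serializable {
  def newTaskInstance(): WriteTaskStatsTracker
  def processStats(stats: Seq[WriteTaskStats]): Unit
}

The split keeps the driver-to-executor traffic to one serialized object per job, while only the compact WriteTaskStats results travel back from each task.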
spark-website git commit: Update committer page
Repository: spark-website
Updated Branches:
  refs/heads/asf-site eb51b33f0 -> 1a2e57670

Update committer page

Project: http://git-wip-us.apache.org/repos/asf/spark-website/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark-website/commit/1a2e5767
Tree: http://git-wip-us.apache.org/repos/asf/spark-website/tree/1a2e5767
Diff: http://git-wip-us.apache.org/repos/asf/spark-website/diff/1a2e5767
Branch: refs/heads/asf-site
Commit: 1a2e576708323e48f196f55e5e8ac0a9335d048c
Parents: eb51b33
Author: Sameer Agarwal
Authored: Mon Aug 7 12:05:05 2017 -0700
Committer: Sameer Agarwal
Committed: Mon Aug 7 12:05:05 2017 -0700

----------------------------------------------------------------------
 committers.md        | 1 +
 site/committers.html | 4 ++++
 2 files changed, 5 insertions(+)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/spark-website/blob/1a2e5767/committers.md
----------------------------------------------------------------------
diff --git a/committers.md b/committers.md
index a4965cb..040d419 100644
--- a/committers.md
+++ b/committers.md
@@ -10,6 +10,7 @@ navigation:
 
 |Name|Organization|
 |----|------------|
+|Sameer Agarwal|Databricks|
 |Michael Armbrust|Databricks|
 |Joseph Bradley|Databricks|
 |Felix Cheung|Microsoft|

http://git-wip-us.apache.org/repos/asf/spark-website/blob/1a2e5767/site/committers.html
----------------------------------------------------------------------
diff --git a/site/committers.html b/site/committers.html
index f69529d..0abad97 100644
--- a/site/committers.html
+++ b/site/committers.html
@@ -205,6 +205,10 @@
 
     <tr>
+      <td>Sameer Agarwal</td>
+      <td>Databricks</td>
+    </tr>
+    <tr>
       <td>Michael Armbrust</td>
       <td>Databricks</td>