spark git commit: [SPARK-8625] [CORE] Propagate user exceptions in tasks back to driver
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 5e6fdc659 -> 0579f28df


[SPARK-8625] [CORE] Propagate user exceptions in tasks back to driver

This allows clients to retrieve the original exception from the cause field of the SparkException that is thrown by the driver. If the original exception is not in fact Serializable then it will not be returned, but the message and stacktrace will be. (All Java Throwables implement the Serializable interface, but this is no guarantee that a particular implementation can actually be serialized.)

Author: Tom White <t...@cloudera.com>

Closes #7014 from tomwhite/propagate-user-exceptions.

(cherry picked from commit 2e680668f7b6fc158aa068aedd19c1878ecf759e)
Signed-off-by: Imran Rashid <iras...@cloudera.com>

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0579f28d
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0579f28d
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0579f28d

Branch: refs/heads/branch-1.5
Commit: 0579f28df246e9b7cb88bb848c7fb4e8607b8049
Parents: 5e6fdc6
Author: Tom White <t...@cloudera.com>
Authored: Wed Aug 12 10:06:27 2015 -0500
Committer: Imran Rashid <iras...@cloudera.com>
Committed: Wed Aug 12 10:08:19 2015 -0500

----------------------------------------------------------------------
 .../scala/org/apache/spark/TaskEndReason.scala  | 44 -
 .../org/apache/spark/executor/Executor.scala    | 14 -
 .../apache/spark/scheduler/DAGScheduler.scala   | 44 -
 .../spark/scheduler/DAGSchedulerEvent.scala     |  3 +-
 .../apache/spark/scheduler/TaskSetManager.scala | 12 ++--
 .../org/apache/spark/util/JsonProtocol.scala    |  2 +-
 .../spark/ExecutorAllocationManagerSuite.scala  |  2 +-
 .../scala/org/apache/spark/FailureSuite.scala   | 66 +++-
 .../spark/scheduler/DAGSchedulerSuite.scala     |  2 +-
 .../spark/scheduler/TaskSetManagerSuite.scala   |  5 +-
 .../ui/jobs/JobProgressListenerSuite.scala      |  2 +-
 .../apache/spark/util/JsonProtocolSuite.scala   |  3 +-
 12 files changed, 165 insertions(+), 34 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/spark/blob/0579f28d/core/src/main/scala/org/apache/spark/TaskEndReason.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/TaskEndReason.scala b/core/src/main/scala/org/apache/spark/TaskEndReason.scala
index 48fd3e7..934d00d 100644
--- a/core/src/main/scala/org/apache/spark/TaskEndReason.scala
+++ b/core/src/main/scala/org/apache/spark/TaskEndReason.scala
@@ -17,6 +17,8 @@

 package org.apache.spark

+import java.io.{IOException, ObjectInputStream, ObjectOutputStream}
+
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.executor.TaskMetrics
 import org.apache.spark.storage.BlockManagerId
@@ -90,6 +92,10 @@ case class FetchFailed(
  *
  * `fullStackTrace` is a better representation of the stack trace because it contains the whole
  * stack trace including the exception and its causes
+ *
+ * `exception` is the actual exception that caused the task to fail. It may be `None` in
+ * the case that the exception is not in fact serializable. If a task fails more than
+ * once (due to retries), `exception` is the one that caused the last failure.
  */
 @DeveloperApi
 case class ExceptionFailure(
@@ -97,11 +103,26 @@ case class ExceptionFailure(
     description: String,
     stackTrace: Array[StackTraceElement],
     fullStackTrace: String,
-    metrics: Option[TaskMetrics])
+    metrics: Option[TaskMetrics],
+    private val exceptionWrapper: Option[ThrowableSerializationWrapper])
   extends TaskFailedReason {

+  /**
+   * `preserveCause` is used to keep the exception itself so it is available to the
+   * driver. This may be set to `false` in the event that the exception is not in fact
+   * serializable.
+   */
+  private[spark] def this(e: Throwable, metrics: Option[TaskMetrics], preserveCause: Boolean) {
+    this(e.getClass.getName, e.getMessage, e.getStackTrace, Utils.exceptionString(e), metrics,
+      if (preserveCause) Some(new ThrowableSerializationWrapper(e)) else None)
+  }
+
   private[spark] def this(e: Throwable, metrics: Option[TaskMetrics]) {
-    this(e.getClass.getName, e.getMessage, e.getStackTrace, Utils.exceptionString(e), metrics)
+    this(e, metrics, preserveCause = true)
+  }
+
+  def exception: Option[Throwable] = exceptionWrapper.flatMap {
+    (w: ThrowableSerializationWrapper) => Option(w.exception)
   }

   override def toErrorString: String =
@@ -128,6 +149,25 @@ case class ExceptionFailure(
 }

 /**
+ * A class for recovering from exceptions when deserializing a Throwable that was
+ * thrown in user task code. If the Throwable cannot be deserialized it will be null,
+ * but the stacktrace and message will be preserved correctly in SparkException.
+ */
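What this buys a caller, as a minimal driver-side sketch (assuming an active SparkContext `sc`; the job and the thrown exception are hypothetical): the user exception now rides along as the cause of the SparkException the driver throws.

import org.apache.spark.SparkException

try {
  sc.parallelize(1 to 10).map { i =>
    if (i == 5) throw new IllegalStateException("bad record")  // user-code failure
    i
  }.count()
} catch {
  case e: SparkException =>
    // If the original exception deserialized cleanly it is attached as the cause;
    // otherwise getCause is null and only the message and stacktrace survive.
    Option(e.getCause).foreach(c => println(s"original failure: $c"))
}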
spark git commit: [SPARK-8625] [CORE] Propagate user exceptions in tasks back to driver
Repository: spark
Updated Branches:
  refs/heads/master 3ecb37943 -> 2e680668f


[SPARK-8625] [CORE] Propagate user exceptions in tasks back to driver

This allows clients to retrieve the original exception from the cause field of the SparkException that is thrown by the driver. If the original exception is not in fact Serializable then it will not be returned, but the message and stacktrace will be. (All Java Throwables implement the Serializable interface, but this is no guarantee that a particular implementation can actually be serialized.)

Author: Tom White <t...@cloudera.com>

Closes #7014 from tomwhite/propagate-user-exceptions.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2e680668
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2e680668
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2e680668

Branch: refs/heads/master
Commit: 2e680668f7b6fc158aa068aedd19c1878ecf759e
Parents: 3ecb379
Author: Tom White <t...@cloudera.com>
Authored: Wed Aug 12 10:06:27 2015 -0500
Committer: Imran Rashid <iras...@cloudera.com>
Committed: Wed Aug 12 10:07:11 2015 -0500

----------------------------------------------------------------------
 .../scala/org/apache/spark/TaskEndReason.scala  | 44 -
 .../org/apache/spark/executor/Executor.scala    | 14 -
 .../apache/spark/scheduler/DAGScheduler.scala   | 44 -
 .../spark/scheduler/DAGSchedulerEvent.scala     |  3 +-
 .../apache/spark/scheduler/TaskSetManager.scala | 12 ++--
 .../org/apache/spark/util/JsonProtocol.scala    |  2 +-
 .../spark/ExecutorAllocationManagerSuite.scala  |  2 +-
 .../scala/org/apache/spark/FailureSuite.scala   | 66 +++-
 .../spark/scheduler/DAGSchedulerSuite.scala     |  2 +-
 .../spark/scheduler/TaskSetManagerSuite.scala   |  5 +-
 .../ui/jobs/JobProgressListenerSuite.scala      |  2 +-
 .../apache/spark/util/JsonProtocolSuite.scala   |  3 +-
 12 files changed, 165 insertions(+), 34 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/spark/blob/2e680668/core/src/main/scala/org/apache/spark/TaskEndReason.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/TaskEndReason.scala b/core/src/main/scala/org/apache/spark/TaskEndReason.scala
index 48fd3e7..934d00d 100644
--- a/core/src/main/scala/org/apache/spark/TaskEndReason.scala
+++ b/core/src/main/scala/org/apache/spark/TaskEndReason.scala
@@ -17,6 +17,8 @@

 package org.apache.spark

+import java.io.{IOException, ObjectInputStream, ObjectOutputStream}
+
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.executor.TaskMetrics
 import org.apache.spark.storage.BlockManagerId
@@ -90,6 +92,10 @@ case class FetchFailed(
  *
  * `fullStackTrace` is a better representation of the stack trace because it contains the whole
  * stack trace including the exception and its causes
+ *
+ * `exception` is the actual exception that caused the task to fail. It may be `None` in
+ * the case that the exception is not in fact serializable. If a task fails more than
+ * once (due to retries), `exception` is the one that caused the last failure.
  */
 @DeveloperApi
 case class ExceptionFailure(
@@ -97,11 +103,26 @@ case class ExceptionFailure(
     description: String,
     stackTrace: Array[StackTraceElement],
     fullStackTrace: String,
-    metrics: Option[TaskMetrics])
+    metrics: Option[TaskMetrics],
+    private val exceptionWrapper: Option[ThrowableSerializationWrapper])
   extends TaskFailedReason {

+  /**
+   * `preserveCause` is used to keep the exception itself so it is available to the
+   * driver. This may be set to `false` in the event that the exception is not in fact
+   * serializable.
+   */
+  private[spark] def this(e: Throwable, metrics: Option[TaskMetrics], preserveCause: Boolean) {
+    this(e.getClass.getName, e.getMessage, e.getStackTrace, Utils.exceptionString(e), metrics,
+      if (preserveCause) Some(new ThrowableSerializationWrapper(e)) else None)
+  }
+
   private[spark] def this(e: Throwable, metrics: Option[TaskMetrics]) {
-    this(e.getClass.getName, e.getMessage, e.getStackTrace, Utils.exceptionString(e), metrics)
+    this(e, metrics, preserveCause = true)
+  }
+
+  def exception: Option[Throwable] = exceptionWrapper.flatMap {
+    (w: ThrowableSerializationWrapper) => Option(w.exception)
   }

   override def toErrorString: String =
@@ -128,6 +149,25 @@ case class ExceptionFailure(
 }

 /**
+ * A class for recovering from exceptions when deserializing a Throwable that was
+ * thrown in user task code. If the Throwable cannot be deserialized it will be null,
+ * but the stacktrace and message will be preserved correctly in SparkException.
+ */
+private[spark] class ThrowableSerializationWrapper(var exception: Throwable) extends
+    Serializable with Logging {
+  private
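The wrapper body is cut off above. As a hedged sketch of the pattern it relies on: the standard Java custom-serialization hook pair, with readObject swallowing deserialization failures so a non-deserializable Throwable degrades to null instead of failing the whole TaskEndReason.

private def writeObject(out: ObjectOutputStream): Unit = {
  out.writeObject(exception)
}

private def readObject(in: ObjectInputStream): Unit = {
  try {
    exception = in.readObject().asInstanceOf[Throwable]
  } catch {
    // Leave `exception` null; ExceptionFailure still carries the message and stacktrace.
    case e: Exception => log.warn("Task exception could not be deserialized", e)
  }
}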
spark git commit: [SPARK-9789] [ML] Added logreg threshold param back
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 65b5b2172 -> bdf8dc15d


[SPARK-9789] [ML] Added logreg threshold param back

Reinstated LogisticRegression.threshold Param for binary compatibility. Param thresholds overrides threshold, if set.

CC: mengxr dbtsai feynmanliang

Author: Joseph K. Bradley <jos...@databricks.com>

Closes #8079 from jkbradley/logreg-reinstate-threshold.

(cherry picked from commit 551def5d6972440365bd7436d484a67138d9a8f3)
Signed-off-by: Joseph K. Bradley <jos...@databricks.com>

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bdf8dc15
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bdf8dc15
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bdf8dc15

Branch: refs/heads/branch-1.5
Commit: bdf8dc15d3b310c8cd84c71999b1bca4d9bc825e
Parents: 65b5b21
Author: Joseph K. Bradley <jos...@databricks.com>
Authored: Wed Aug 12 14:27:13 2015 -0700
Committer: Joseph K. Bradley <jos...@databricks.com>
Committed: Wed Aug 12 14:27:21 2015 -0700

----------------------------------------------------------------------
 .../ml/classification/LogisticRegression.scala  | 127 +++
 .../ml/param/shared/SharedParamsCodeGen.scala   |   4 +-
 .../spark/ml/param/shared/sharedParams.scala    |   6 +-
 .../JavaLogisticRegressionSuite.java            |   7 +-
 .../LogisticRegressionSuite.scala               |  33 +++--
 python/pyspark/ml/classification.py             |  98 --
 6 files changed, 199 insertions(+), 76 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/spark/blob/bdf8dc15/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index f55134d..5bcd711 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -34,8 +34,7 @@ import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
 import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
 import org.apache.spark.mllib.util.MLUtils
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.{DataFrame, Row, SQLContext}
-import org.apache.spark.sql.functions.{col, udf}
+import org.apache.spark.sql.{DataFrame, Row}
 import org.apache.spark.storage.StorageLevel

 /**
@@ -43,44 +42,115 @@ import org.apache.spark.storage.StorageLevel
  */
 private[classification] trait LogisticRegressionParams extends ProbabilisticClassifierParams
   with HasRegParam with HasElasticNetParam with HasMaxIter with HasFitIntercept with HasTol
-  with HasStandardization {
+  with HasStandardization with HasThreshold {

   /**
-   * Version of setThresholds() for binary classification, available for backwards
-   * compatibility.
+   * Set threshold in binary classification, in range [0, 1].
    *
-   * Calling this with threshold p will effectively call `setThresholds(Array(1-p, p))`.
+   * If the estimated probability of class label 1 is > threshold, then predict 1, else 0.
+   * A high threshold encourages the model to predict 0 more often;
+   * a low threshold encourages the model to predict 1 more often.
+   *
+   * Note: Calling this with threshold p is equivalent to calling `setThresholds(Array(1-p, p))`.
+   *       When [[setThreshold()]] is called, any user-set value for [[thresholds]] will be cleared.
+   *       If both [[threshold]] and [[thresholds]] are set in a ParamMap, then they must be
+   *       equivalent.
+   *
+   * Default is 0.5.
+   * @group setParam
+   */
+  def setThreshold(value: Double): this.type = {
+    if (isSet(thresholds)) clear(thresholds)
+    set(threshold, value)
+  }
+
+  /**
+   * Get threshold for binary classification.
+   *
+   * If [[threshold]] is set, returns that value.
+   * Otherwise, if [[thresholds]] is set with length 2 (i.e., binary classification),
+   * this returns the equivalent threshold: {{{1 / (1 + thresholds(0) / thresholds(1))}}}.
+   * Otherwise, returns [[threshold]] default value.
+   *
+   * @group getParam
+   * @throws IllegalArgumentException if [[thresholds]] is set to an array of length other than 2.
+   */
+  override def getThreshold: Double = {
+    checkThresholdConsistency()
+    if (isSet(thresholds)) {
+      val ts = $(thresholds)
+      require(ts.length == 2, "Logistic Regression getThreshold only applies to" +
+        " binary classification, but thresholds has length != 2. thresholds: " + ts.mkString(","))
+      1.0 / (1.0 + ts(0) / ts(1))
+    } else {
+      $(threshold)
+    }
+  }
+
+  /**
+   * Set thresholds in multiclass (or binary) classification to adjust the
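A quick sketch of the compatibility contract the reinstated param promises (the numeric values here are arbitrary): setting thresholds = Array(1 - p, p) and reading threshold back round-trips, since 1 / (1 + (1 - p) / p) = p.

import org.apache.spark.ml.classification.LogisticRegression

val lr = new LogisticRegression()
lr.setThresholds(Array(0.4, 0.6))               // binary case: weights for class 0 and class 1
assert(math.abs(lr.getThreshold - 0.6) < 1e-9)  // 1 / (1 + 0.4 / 0.6) == 0.6
lr.setThreshold(0.25)                           // clears any user-set thresholds
assert(lr.getThreshold == 0.25)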
spark git commit: [SPARK-9789] [ML] Added logreg threshold param back
Repository: spark
Updated Branches:
  refs/heads/master 762bacc16 -> 551def5d6


[SPARK-9789] [ML] Added logreg threshold param back

Reinstated LogisticRegression.threshold Param for binary compatibility. Param thresholds overrides threshold, if set.

CC: mengxr dbtsai feynmanliang

Author: Joseph K. Bradley <jos...@databricks.com>

Closes #8079 from jkbradley/logreg-reinstate-threshold.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/551def5d
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/551def5d
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/551def5d

Branch: refs/heads/master
Commit: 551def5d6972440365bd7436d484a67138d9a8f3
Parents: 762bacc
Author: Joseph K. Bradley <jos...@databricks.com>
Authored: Wed Aug 12 14:27:13 2015 -0700
Committer: Joseph K. Bradley <jos...@databricks.com>
Committed: Wed Aug 12 14:27:13 2015 -0700

----------------------------------------------------------------------
 .../ml/classification/LogisticRegression.scala  | 127 +++
 .../ml/param/shared/SharedParamsCodeGen.scala   |   4 +-
 .../spark/ml/param/shared/sharedParams.scala    |   6 +-
 .../JavaLogisticRegressionSuite.java            |   7 +-
 .../LogisticRegressionSuite.scala               |  33 +++--
 python/pyspark/ml/classification.py             |  98 --
 6 files changed, 199 insertions(+), 76 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/spark/blob/551def5d/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index f55134d..5bcd711 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -34,8 +34,7 @@ import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
 import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
 import org.apache.spark.mllib.util.MLUtils
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.{DataFrame, Row, SQLContext}
-import org.apache.spark.sql.functions.{col, udf}
+import org.apache.spark.sql.{DataFrame, Row}
 import org.apache.spark.storage.StorageLevel

 /**
@@ -43,44 +42,115 @@ import org.apache.spark.storage.StorageLevel
  */
 private[classification] trait LogisticRegressionParams extends ProbabilisticClassifierParams
   with HasRegParam with HasElasticNetParam with HasMaxIter with HasFitIntercept with HasTol
-  with HasStandardization {
+  with HasStandardization with HasThreshold {

   /**
-   * Version of setThresholds() for binary classification, available for backwards
-   * compatibility.
+   * Set threshold in binary classification, in range [0, 1].
    *
-   * Calling this with threshold p will effectively call `setThresholds(Array(1-p, p))`.
+   * If the estimated probability of class label 1 is > threshold, then predict 1, else 0.
+   * A high threshold encourages the model to predict 0 more often;
+   * a low threshold encourages the model to predict 1 more often.
+   *
+   * Note: Calling this with threshold p is equivalent to calling `setThresholds(Array(1-p, p))`.
+   *       When [[setThreshold()]] is called, any user-set value for [[thresholds]] will be cleared.
+   *       If both [[threshold]] and [[thresholds]] are set in a ParamMap, then they must be
+   *       equivalent.
+   *
+   * Default is 0.5.
+   * @group setParam
+   */
+  def setThreshold(value: Double): this.type = {
+    if (isSet(thresholds)) clear(thresholds)
+    set(threshold, value)
+  }
+
+  /**
+   * Get threshold for binary classification.
+   *
+   * If [[threshold]] is set, returns that value.
+   * Otherwise, if [[thresholds]] is set with length 2 (i.e., binary classification),
+   * this returns the equivalent threshold: {{{1 / (1 + thresholds(0) / thresholds(1))}}}.
+   * Otherwise, returns [[threshold]] default value.
+   *
+   * @group getParam
+   * @throws IllegalArgumentException if [[thresholds]] is set to an array of length other than 2.
+   */
+  override def getThreshold: Double = {
+    checkThresholdConsistency()
+    if (isSet(thresholds)) {
+      val ts = $(thresholds)
+      require(ts.length == 2, "Logistic Regression getThreshold only applies to" +
+        " binary classification, but thresholds has length != 2. thresholds: " + ts.mkString(","))
+      1.0 / (1.0 + ts(0) / ts(1))
+    } else {
+      $(threshold)
+    }
+  }
+
+  /**
+   * Set thresholds in multiclass (or binary) classification to adjust the probability of
+   * predicting each class. Array must have length equal to the number of classes, with values >= 0.
+   * The class with
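Why 1 / (1 + t(0) / t(1)) is the right conversion, as a small self-checking sketch: under the thresholds semantics (predict the class maximizing p(i) / t(i)), class 1 wins in the binary case exactly when p1 / t1 > p0 / t0, i.e. when p1 > 1 / (1 + t0 / t1).

val Array(t0, t1) = Array(0.4, 0.6)
val equivalent = 1.0 / (1.0 + t0 / t1)                          // 0.6
def predictsOne(p1: Double): Boolean = p1 / t1 > (1 - p1) / t0  // thresholds decision rule
assert(predictsOne(equivalent + 1e-6) && !predictsOne(equivalent - 1e-6))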
spark git commit: [SPARK-9826] [CORE] Fix cannot use custom classes in log4j.properties
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 8537e51d3 -> 74c9dcec3


[SPARK-9826] [CORE] Fix cannot use custom classes in log4j.properties

Refactor Utils class and create ShutdownHookManager.

NOTE: Wasn't able to run /dev/run-tests on windows machine. Manual tests were conducted locally using custom log4j.properties file with Redis appender and logstash formatter (bundled in the fat-jar submitted to spark)

ex:
log4j.rootCategory=WARN,console,redis

log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n

log4j.logger.org.eclipse.jetty=WARN
log4j.logger.org.eclipse.jetty.util.component.AbstractLifeCycle=ERROR
log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO
log4j.logger.org.apache.spark.graphx.Pregel=INFO

log4j.appender.redis=com.ryantenney.log4j.FailoverRedisAppender
log4j.appender.redis.endpoints=hostname:port
log4j.appender.redis.key=mykey
log4j.appender.redis.alwaysBatch=false
log4j.appender.redis.layout=net.logstash.log4j.JSONEventLayoutV1

Author: michellemay <mle...@gmail.com>

Closes #8109 from michellemay/SPARK-9826.

(cherry picked from commit ab7e721cfec63155641e81e72b4ad43cf6a7d4c7)
Signed-off-by: Marcelo Vanzin <van...@cloudera.com>

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/74c9dcec
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/74c9dcec
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/74c9dcec

Branch: refs/heads/branch-1.5
Commit: 74c9dcec34214213955baa23098d7d23bc733346
Parents: 8537e51
Author: Michel Lemay <mle...@gmail.com>
Authored: Wed Aug 12 16:17:58 2015 -0700
Committer: Marcelo Vanzin <van...@cloudera.com>
Committed: Wed Aug 12 16:41:58 2015 -0700

----------------------------------------------------------------------
 .../scala/org/apache/spark/SparkContext.scala   |   5 +-
 .../spark/deploy/history/HistoryServer.scala    |   4 +-
 .../spark/deploy/worker/ExecutorRunner.scala    |   7 +-
 .../scala/org/apache/spark/rdd/HadoopRDD.scala  |   4 +-
 .../org/apache/spark/rdd/NewHadoopRDD.scala     |   4 +-
 .../org/apache/spark/rdd/SqlNewHadoopRDD.scala  |   4 +-
 .../apache/spark/storage/DiskBlockManager.scala |  10 +-
 .../spark/storage/TachyonBlockManager.scala     |   6 +-
 .../apache/spark/util/ShutdownHookManager.scala | 266 +++
 .../util/SparkUncaughtExceptionHandler.scala    |   2 +-
 .../scala/org/apache/spark/util/Utils.scala     | 222 +---
 .../hive/thriftserver/HiveThriftServer2.scala   |   4 +-
 .../hive/thriftserver/SparkSQLCLIDriver.scala   |   4 +-
 .../apache/spark/sql/hive/test/TestHive.scala   |   4 +-
 .../spark/streaming/StreamingContext.scala      |   8 +-
 .../spark/deploy/yarn/ApplicationMaster.scala   |   5 +-
 16 files changed, 307 insertions(+), 252 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/spark/blob/74c9dcec/core/src/main/scala/org/apache/spark/SparkContext.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala
index 207a0c1..2e01a9a 100644
--- a/core/src/main/scala/org/apache/spark/SparkContext.scala
+++ b/core/src/main/scala/org/apache/spark/SparkContext.scala
@@ -563,7 +563,8 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
     // Make sure the context is stopped if the user forgets about it. This avoids leaving
     // unfinished event logs around after the JVM exits cleanly. It doesn't help if the JVM
     // is killed, though.
-    _shutdownHookRef = Utils.addShutdownHook(Utils.SPARK_CONTEXT_SHUTDOWN_PRIORITY) { () =>
+    _shutdownHookRef = ShutdownHookManager.addShutdownHook(
+      ShutdownHookManager.SPARK_CONTEXT_SHUTDOWN_PRIORITY) { () =>
       logInfo("Invoking stop() from shutdown hook")
       stop()
     }
@@ -1671,7 +1672,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
       return
     }
     if (_shutdownHookRef != null) {
-      Utils.removeShutdownHook(_shutdownHookRef)
+      ShutdownHookManager.removeShutdownHook(_shutdownHookRef)
     }
     Utils.tryLogNonFatalError {

http://git-wip-us.apache.org/repos/asf/spark/blob/74c9dcec/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala
index a076a9c..d4f327c 100644
---
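For code inside the org.apache.spark package (the new object is private[spark]), the relocated API reads the same as the old Utils one; a minimal usage sketch:

import org.apache.spark.util.ShutdownHookManager

val hookRef = ShutdownHookManager.addShutdownHook(
    ShutdownHookManager.SPARK_CONTEXT_SHUTDOWN_PRIORITY) { () =>
  // Runs at JVM shutdown, ordered by priority.
  println("cleaning up")
}
ShutdownHookManager.removeShutdownHook(hookRef)  // deregister if cleanup already ran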
spark git commit: [SPARK-9826] [CORE] Fix cannot use custom classes in log4j.properties
Repository: spark
Updated Branches:
  refs/heads/master 738f35398 -> ab7e721cf


[SPARK-9826] [CORE] Fix cannot use custom classes in log4j.properties

Refactor Utils class and create ShutdownHookManager.

NOTE: Wasn't able to run /dev/run-tests on windows machine. Manual tests were conducted locally using custom log4j.properties file with Redis appender and logstash formatter (bundled in the fat-jar submitted to spark)

ex:
log4j.rootCategory=WARN,console,redis

log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n

log4j.logger.org.eclipse.jetty=WARN
log4j.logger.org.eclipse.jetty.util.component.AbstractLifeCycle=ERROR
log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO
log4j.logger.org.apache.spark.graphx.Pregel=INFO

log4j.appender.redis=com.ryantenney.log4j.FailoverRedisAppender
log4j.appender.redis.endpoints=hostname:port
log4j.appender.redis.key=mykey
log4j.appender.redis.alwaysBatch=false
log4j.appender.redis.layout=net.logstash.log4j.JSONEventLayoutV1

Author: michellemay <mle...@gmail.com>

Closes #8109 from michellemay/SPARK-9826.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ab7e721c
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ab7e721c
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ab7e721c

Branch: refs/heads/master
Commit: ab7e721cfec63155641e81e72b4ad43cf6a7d4c7
Parents: 738f353
Author: Michel Lemay <mle...@gmail.com>
Authored: Wed Aug 12 16:17:58 2015 -0700
Committer: Marcelo Vanzin <van...@cloudera.com>
Committed: Wed Aug 12 16:41:35 2015 -0700

----------------------------------------------------------------------
 .../scala/org/apache/spark/SparkContext.scala   |   5 +-
 .../spark/deploy/history/HistoryServer.scala    |   4 +-
 .../spark/deploy/worker/ExecutorRunner.scala    |   7 +-
 .../scala/org/apache/spark/rdd/HadoopRDD.scala  |   4 +-
 .../org/apache/spark/rdd/NewHadoopRDD.scala     |   4 +-
 .../org/apache/spark/rdd/SqlNewHadoopRDD.scala  |   4 +-
 .../apache/spark/storage/DiskBlockManager.scala |  10 +-
 .../spark/storage/TachyonBlockManager.scala     |   6 +-
 .../apache/spark/util/ShutdownHookManager.scala | 266 +++
 .../util/SparkUncaughtExceptionHandler.scala    |   2 +-
 .../scala/org/apache/spark/util/Utils.scala     | 222 +---
 .../hive/thriftserver/HiveThriftServer2.scala   |   4 +-
 .../hive/thriftserver/SparkSQLCLIDriver.scala   |   4 +-
 .../apache/spark/sql/hive/test/TestHive.scala   |   4 +-
 .../spark/streaming/StreamingContext.scala      |   8 +-
 .../spark/deploy/yarn/ApplicationMaster.scala   |   5 +-
 16 files changed, 307 insertions(+), 252 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/spark/blob/ab7e721c/core/src/main/scala/org/apache/spark/SparkContext.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala
index 207a0c1..2e01a9a 100644
--- a/core/src/main/scala/org/apache/spark/SparkContext.scala
+++ b/core/src/main/scala/org/apache/spark/SparkContext.scala
@@ -563,7 +563,8 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
     // Make sure the context is stopped if the user forgets about it. This avoids leaving
     // unfinished event logs around after the JVM exits cleanly. It doesn't help if the JVM
     // is killed, though.
-    _shutdownHookRef = Utils.addShutdownHook(Utils.SPARK_CONTEXT_SHUTDOWN_PRIORITY) { () =>
+    _shutdownHookRef = ShutdownHookManager.addShutdownHook(
+      ShutdownHookManager.SPARK_CONTEXT_SHUTDOWN_PRIORITY) { () =>
       logInfo("Invoking stop() from shutdown hook")
       stop()
     }
@@ -1671,7 +1672,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
       return
     }
     if (_shutdownHookRef != null) {
-      Utils.removeShutdownHook(_shutdownHookRef)
+      ShutdownHookManager.removeShutdownHook(_shutdownHookRef)
     }
     Utils.tryLogNonFatalError {

http://git-wip-us.apache.org/repos/asf/spark/blob/ab7e721c/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala
index a076a9c..d4f327c 100644
--- a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala
@@ -30,7 +30,7 @@ import
spark git commit: [SPARK-9907] [SQL] Python crc32 is mistakenly calling md5
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 6a7582ea2 -> b28295fe0


[SPARK-9907] [SQL] Python crc32 is mistakenly calling md5

Author: Reynold Xin <r...@databricks.com>

Closes #8138 from rxin/SPARK-9907.

(cherry picked from commit a17384fa343628cec44437da5b80b9403ecd5838)
Signed-off-by: Reynold Xin <r...@databricks.com>

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b28295fe
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b28295fe
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b28295fe

Branch: refs/heads/branch-1.5
Commit: b28295fe0c91405ade86e215083de25ec5bb52d9
Parents: 6a7582e
Author: Reynold Xin <r...@databricks.com>
Authored: Wed Aug 12 15:27:52 2015 -0700
Committer: Reynold Xin <r...@databricks.com>
Committed: Wed Aug 12 15:27:58 2015 -0700

----------------------------------------------------------------------
 python/pyspark/sql/functions.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/spark/blob/b28295fe/python/pyspark/sql/functions.py
----------------------------------------------------------------------
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 95f4604..e989795 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -885,10 +885,10 @@ def crc32(col):
     returns the value as a bigint.

     >>> sqlContext.createDataFrame([('ABC',)], ['a']).select(crc32('a').alias('crc32')).collect()
-    [Row(crc32=u'902fbdd2b1df0c4f70b4a5d23525e932')]
+    [Row(crc32=2743272264)]
     """
     sc = SparkContext._active_spark_context
-    return Column(sc._jvm.functions.md5(_to_java_column(col)))
+    return Column(sc._jvm.functions.crc32(_to_java_column(col)))


 @ignore_unicode_prefix
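The corrected doctest value can be cross-checked from Scala, which calls the same JVM function the Python wrapper now delegates to (assuming an active sqlContext): CRC32 of "ABC" is the bigint 2743272264, not an MD5 hex digest.

import org.apache.spark.sql.functions.{col, crc32}

val df = sqlContext.createDataFrame(Seq(Tuple1("ABC"))).toDF("a")
df.select(crc32(col("a")).alias("crc32")).show()  // 2743272264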
spark git commit: [SPARK-9894] [SQL] Json writer should handle MapData.
Repository: spark
Updated Branches:
  refs/heads/master ab7e721cf -> 7035d880a


[SPARK-9894] [SQL] Json writer should handle MapData.

https://issues.apache.org/jira/browse/SPARK-9894

Author: Yin Huai <yh...@databricks.com>

Closes #8137 from yhuai/jsonMapData.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7035d880
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7035d880
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7035d880

Branch: refs/heads/master
Commit: 7035d880a0cf06910c19b4afd49645124c620f14
Parents: ab7e721
Author: Yin Huai <yh...@databricks.com>
Authored: Wed Aug 12 16:45:15 2015 -0700
Committer: Michael Armbrust <mich...@databricks.com>
Committed: Wed Aug 12 16:45:15 2015 -0700

----------------------------------------------------------------------
 .../datasources/json/JacksonGenerator.scala     | 10 +--
 .../sql/sources/JsonHadoopFsRelationSuite.scala | 78
 .../SimpleTextHadoopFsRelationSuite.scala       | 30
 3 files changed, 83 insertions(+), 35 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/spark/blob/7035d880/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonGenerator.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonGenerator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonGenerator.scala
index 37c2b5a..99ac773 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonGenerator.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonGenerator.scala
@@ -107,12 +107,12 @@ private[sql] object JacksonGenerator {
         v.foreach(ty, (_, value) => valWriter(ty, value))
         gen.writeEndArray()

-      case (MapType(kv, vv, _), v: Map[_, _]) =>
+      case (MapType(kt, vt, _), v: MapData) =>
         gen.writeStartObject()
-        v.foreach { p =>
-          gen.writeFieldName(p._1.toString)
-          valWriter(vv, p._2)
-        }
+        v.foreach(kt, vt, { (k, v) =>
+          gen.writeFieldName(k.toString)
+          valWriter(vt, v)
+        })
         gen.writeEndObject()

       case (StructType(ty), v: InternalRow) =>

http://git-wip-us.apache.org/repos/asf/spark/blob/7035d880/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala
----------------------------------------------------------------------
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala
new file mode 100644
index 000..ed6d512
--- /dev/null
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.sources
+
+import org.apache.hadoop.fs.Path
+
+import org.apache.spark.deploy.SparkHadoopUtil
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.types._
+
+class JsonHadoopFsRelationSuite extends HadoopFsRelationTest {
+  override val dataSourceName: String = "json"
+
+  import sqlContext._
+
+  test("save()/load() - partitioned table - simple queries - partition columns in data") {
+    withTempDir { file =>
+      val basePath = new Path(file.getCanonicalPath)
+      val fs = basePath.getFileSystem(SparkHadoopUtil.get.conf)
+      val qualifiedBasePath = fs.makeQualified(basePath)
+
+      for (p1 <- 1 to 2; p2 <- Seq("foo", "bar")) {
+        val partitionDir = new Path(qualifiedBasePath, s"p1=$p1/p2=$p2")
+        sparkContext
+          .parallelize(for (i <- 1 to 3) yield s"""{"a":$i,"b":"val_$i"}""")
+          .saveAsTextFile(partitionDir.toString)
+      }
+
+      val dataSchemaWithPartition =
+        StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true))
+
+      checkQueries(
+        read.format(dataSourceName)
+          .option("dataSchema", dataSchemaWithPartition.json)
+
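A minimal end-to-end sketch of the scenario this fixes (assuming an active sqlContext; the output path is hypothetical): writing a DataFrame with a map-typed column as JSON, which previously failed because execution hands the writer a catalyst MapData rather than a scala.collection.Map.

val df = sqlContext.createDataFrame(Seq((1, Map("a" -> 1, "b" -> 2)))).toDF("id", "m")
df.write.json("/tmp/json-map-out")                // exercises JacksonGenerator on MapData
sqlContext.read.json("/tmp/json-map-out").show()  // round-trips the map column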
spark git commit: [SPARK-9909] [ML] [TRIVIAL] move weightCol to shared params
Repository: spark
Updated Branches:
  refs/heads/master caa14d9dc -> 6e409bc13


[SPARK-9909] [ML] [TRIVIAL] move weightCol to shared params

As per the TODO move weightCol to Shared Params.

Author: Holden Karau <hol...@pigscanfly.ca>

Closes #8144 from holdenk/SPARK-9909-move-weightCol-toSharedParams.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6e409bc1
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6e409bc1
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6e409bc1

Branch: refs/heads/master
Commit: 6e409bc1357f49de2efdfc4226d074b943fb1153
Parents: caa14d9
Author: Holden Karau <hol...@pigscanfly.ca>
Authored: Wed Aug 12 16:54:45 2015 -0700
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Wed Aug 12 16:54:45 2015 -0700

----------------------------------------------------------------------
 .../spark/ml/param/shared/SharedParamsCodeGen.scala |  4 +++-
 .../apache/spark/ml/param/shared/sharedParams.scala | 15 +++
 .../spark/ml/regression/IsotonicRegression.scala    | 16 ++--
 3 files changed, 20 insertions(+), 15 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/spark/blob/6e409bc1/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala
index 9e12f18..8c16c61 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala
@@ -70,7 +70,9 @@ private[shared] object SharedParamsCodeGen {
         "For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.",
         isValid = ParamValidators.inRange(0, 1)),
       ParamDesc[Double]("tol", "the convergence tolerance for iterative algorithms"),
-      ParamDesc[Double]("stepSize", "Step size to be used for each iteration of optimization."))
+      ParamDesc[Double]("stepSize", "Step size to be used for each iteration of optimization."),
+      ParamDesc[String]("weightCol", "weight column name. If this is not set or empty, we treat " +
+        "all instance weights as 1.0."))

     val code = genSharedParams(params)
     val file = "src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala"

http://git-wip-us.apache.org/repos/asf/spark/blob/6e409bc1/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala
index a17d4ea..c267689 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala
@@ -342,4 +342,19 @@ private[ml] trait HasStepSize extends Params {
   /** @group getParam */
   final def getStepSize: Double = $(stepSize)
 }
+
+/**
+ * Trait for shared param weightCol.
+ */
+private[ml] trait HasWeightCol extends Params {
+
+  /**
+   * Param for weight column name. If this is not set or empty, we treat all instance weights as 1.0..
+   * @group param
+   */
+  final val weightCol: Param[String] = new Param[String](this, "weightCol", "weight column name. If this is not set or empty, we treat all instance weights as 1.0.")
+
+  /** @group getParam */
+  final def getWeightCol: String = $(weightCol)
+}
 // scalastyle:on

http://git-wip-us.apache.org/repos/asf/spark/blob/6e409bc1/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala
index f570590..0f33bae 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala
@@ -21,7 +21,7 @@ import org.apache.spark.Logging
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.ml.{Estimator, Model}
 import org.apache.spark.ml.param._
-import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasLabelCol, HasPredictionCol}
+import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasLabelCol, HasPredictionCol, HasWeightCol}
 import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
 import org.apache.spark.mllib.linalg.{Vector, VectorUDT, Vectors}
 import org.apache.spark.mllib.regression.{IsotonicRegression => MLlibIsotonicRegression, IsotonicRegressionModel =>
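With the param shared, an estimator opts in by mixing in HasWeightCol and exposing a public setter, as IsotonicRegression does; a usage sketch with a hypothetical weight column name:

import org.apache.spark.ml.regression.IsotonicRegression

val ir = new IsotonicRegression()
  .setWeightCol("weight")  // per-instance weights; unset or empty means all weights are 1.0
println(ir.getWeightCol)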
spark git commit: [SPARK-9909] [ML] [TRIVIAL] move weightCol to shared params
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 6aca0cf34 -> 2f8793b5f


[SPARK-9909] [ML] [TRIVIAL] move weightCol to shared params

As per the TODO move weightCol to Shared Params.

Author: Holden Karau <hol...@pigscanfly.ca>

Closes #8144 from holdenk/SPARK-9909-move-weightCol-toSharedParams.

(cherry picked from commit 6e409bc1357f49de2efdfc4226d074b943fb1153)
Signed-off-by: Xiangrui Meng <m...@databricks.com>

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2f8793b5
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2f8793b5
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2f8793b5

Branch: refs/heads/branch-1.5
Commit: 2f8793b5f47ec7c17b27715bc9b1026266061cea
Parents: 6aca0cf
Author: Holden Karau <hol...@pigscanfly.ca>
Authored: Wed Aug 12 16:54:45 2015 -0700
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Wed Aug 12 16:54:52 2015 -0700

----------------------------------------------------------------------
 .../spark/ml/param/shared/SharedParamsCodeGen.scala |  4 +++-
 .../apache/spark/ml/param/shared/sharedParams.scala | 15 +++
 .../spark/ml/regression/IsotonicRegression.scala    | 16 ++--
 3 files changed, 20 insertions(+), 15 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/spark/blob/2f8793b5/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala
index 5cb7235..3899df6 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala
@@ -66,7 +66,9 @@ private[shared] object SharedParamsCodeGen {
         "For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.",
         isValid = ParamValidators.inRange(0, 1)),
       ParamDesc[Double]("tol", "the convergence tolerance for iterative algorithms"),
-      ParamDesc[Double]("stepSize", "Step size to be used for each iteration of optimization."))
+      ParamDesc[Double]("stepSize", "Step size to be used for each iteration of optimization."),
+      ParamDesc[String]("weightCol", "weight column name. If this is not set or empty, we treat " +
+        "all instance weights as 1.0."))

     val code = genSharedParams(params)
     val file = "src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala"

http://git-wip-us.apache.org/repos/asf/spark/blob/2f8793b5/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala
index d4c89e6..e8e58aa 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala
@@ -327,4 +327,19 @@ private[ml] trait HasStepSize extends Params {
   /** @group getParam */
   final def getStepSize: Double = $(stepSize)
 }
+
+/**
+ * Trait for shared param weightCol.
+ */
+private[ml] trait HasWeightCol extends Params {
+
+  /**
+   * Param for weight column name. If this is not set or empty, we treat all instance weights as 1.0..
+   * @group param
+   */
+  final val weightCol: Param[String] = new Param[String](this, "weightCol", "weight column name. If this is not set or empty, we treat all instance weights as 1.0.")
+
+  /** @group getParam */
+  final def getWeightCol: String = $(weightCol)
+}
 // scalastyle:on

http://git-wip-us.apache.org/repos/asf/spark/blob/2f8793b5/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala
index f570590..0f33bae 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala
@@ -21,7 +21,7 @@ import org.apache.spark.Logging
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.ml.{Estimator, Model}
 import org.apache.spark.ml.param._
-import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasLabelCol, HasPredictionCol}
+import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasLabelCol, HasPredictionCol, HasWeightCol}
 import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
 import org.apache.spark.mllib.linalg.{Vector, VectorUDT, Vectors}
 import org.apache.spark.mllib.regression.{IsotonicRegression => MLlibIsotonicRegression, IsotonicRegressionModel =>
spark git commit: [SPARK-9913] [MLLIB] LDAUtils should be private
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 08f767a1e -> 6aca0cf34


[SPARK-9913] [MLLIB] LDAUtils should be private

feynmanliang

Author: Xiangrui Meng <m...@databricks.com>

Closes #8142 from mengxr/SPARK-9913.

(cherry picked from commit caa14d9dc9e2eb1102052b22445b63b0e004e3c7)
Signed-off-by: Xiangrui Meng <m...@databricks.com>

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6aca0cf3
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6aca0cf3
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6aca0cf3

Branch: refs/heads/branch-1.5
Commit: 6aca0cf348ca0731ef72155f5a5d7739b796bb3b
Parents: 08f767a
Author: Xiangrui Meng <m...@databricks.com>
Authored: Wed Aug 12 16:53:47 2015 -0700
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Wed Aug 12 16:53:56 2015 -0700

----------------------------------------------------------------------
 .../main/scala/org/apache/spark/mllib/clustering/LDAUtils.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/spark/blob/6aca0cf3/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAUtils.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAUtils.scala
index f7e5ce1..a9ba7b6 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAUtils.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAUtils.scala
@@ -22,7 +22,7 @@ import breeze.numerics._
 /**
  * Utility methods for LDA.
  */
-object LDAUtils {
+private[clustering] object LDAUtils {
   /**
    * Log Sum Exp with overflow protection using the identity:
    * For any a: \log \sum_{n=1}^N \exp\{x_n\} = a + \log \sum_{n=1}^N \exp\{x_n - a\}
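The identity quoted in that comment is easy to see in action; a standalone sketch of the overflow-safe form (choosing a = max(x) keeps every exp argument <= 0, so nothing overflows):

def logSumExp(xs: Array[Double]): Double = {
  val a = xs.max
  a + math.log(xs.map(x => math.exp(x - a)).sum)  // naive sum of exp(1000.0) would overflow
}

assert(math.abs(logSumExp(Array(1000.0, 1000.0)) - (1000.0 + math.log(2.0))) < 1e-9)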
spark git commit: [SPARK-9449] [SQL] Include MetastoreRelation's inputFiles
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 ed73f5439 -> 3298fb69f


[SPARK-9449] [SQL] Include MetastoreRelation's inputFiles

Author: Michael Armbrust <mich...@databricks.com>

Closes #8119 from marmbrus/metastoreInputFiles.

(cherry picked from commit 660e6dcff8125b83cc73dbe00c90cbe58744bc66)
Signed-off-by: Michael Armbrust <mich...@databricks.com>

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3298fb69
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3298fb69
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3298fb69

Branch: refs/heads/branch-1.5
Commit: 3298fb69ff118ea53c4d5b204638c61d016d0506
Parents: ed73f54
Author: Michael Armbrust <mich...@databricks.com>
Authored: Wed Aug 12 17:07:29 2015 -0700
Committer: Michael Armbrust <mich...@databricks.com>
Committed: Wed Aug 12 17:08:01 2015 -0700

----------------------------------------------------------------------
 .../scala/org/apache/spark/sql/DataFrame.scala  | 10 ---
 .../spark/sql/execution/FileRelation.scala      | 28
 .../apache/spark/sql/sources/interfaces.scala   |  6 +++--
 .../org/apache/spark/sql/DataFrameSuite.scala   | 26 +-
 .../spark/sql/hive/HiveMetastoreCatalog.scala   | 16 +--
 5 files changed, 66 insertions(+), 20 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/spark/blob/3298fb69/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
index 27b994f..c466d9e 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -34,10 +34,10 @@ import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis._
 import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.catalyst.plans.logical.{Filter, _}
+import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.plans.{Inner, JoinType}
 import org.apache.spark.sql.catalyst.{CatalystTypeConverters, ScalaReflection, SqlParser}
-import org.apache.spark.sql.execution.{EvaluatePython, ExplainCommand, LogicalRDD, SQLExecution}
+import org.apache.spark.sql.execution.{EvaluatePython, ExplainCommand, FileRelation, LogicalRDD, SQLExecution}
 import org.apache.spark.sql.execution.datasources.{CreateTableUsingAsSelect, LogicalRelation}
 import org.apache.spark.sql.execution.datasources.json.JacksonGenerator
 import org.apache.spark.sql.sources.HadoopFsRelation
@@ -1560,8 +1560,10 @@ class DataFrame private[sql](
    */
   def inputFiles: Array[String] = {
     val files: Seq[String] = logicalPlan.collect {
-      case LogicalRelation(fsBasedRelation: HadoopFsRelation) =>
-        fsBasedRelation.paths.toSeq
+      case LogicalRelation(fsBasedRelation: FileRelation) =>
+        fsBasedRelation.inputFiles
+      case fr: FileRelation =>
+        fr.inputFiles
     }.flatten
     files.toSet.toArray
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/3298fb69/sql/core/src/main/scala/org/apache/spark/sql/execution/FileRelation.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/FileRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/FileRelation.scala
new file mode 100644
index 000..7a2a9ee
--- /dev/null
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/FileRelation.scala
@@ -0,0 +1,28 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution
+
+/**
+ * An interface for relations that are backed by files. When a class implements this interface,
+ * the list of paths that it returns will be returned to a user who calls `inputPaths` on any
+ * DataFrame that queries this relation.
+ */
+private[sql] trait FileRelation {
+  /** Returns the list of files that will be
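The user-visible effect, sketched with a hypothetical Hive table name: DataFrame.inputFiles now resolves anything implementing FileRelation, including MetastoreRelation, instead of only HadoopFsRelation-backed sources.

val files: Array[String] = sqlContext.table("my_hive_table").inputFiles
files.foreach(println)  // paths backing the metastore table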
spark git commit: [SPARK-9826] [CORE] Fix cannot use custom classes in log4j.properties
Repository: spark
Updated Branches:
  refs/heads/branch-1.4 89c8aea94 -> 8ce86b23f


[SPARK-9826] [CORE] Fix cannot use custom classes in log4j.properties

Refactor Utils class and create ShutdownHookManager.

NOTE: Wasn't able to run /dev/run-tests on windows machine. Manual tests were conducted locally using custom log4j.properties file with Redis appender and logstash formatter (bundled in the fat-jar submitted to spark)

ex:
log4j.rootCategory=WARN,console,redis

log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n

log4j.logger.org.eclipse.jetty=WARN
log4j.logger.org.eclipse.jetty.util.component.AbstractLifeCycle=ERROR
log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO
log4j.logger.org.apache.spark.graphx.Pregel=INFO

log4j.appender.redis=com.ryantenney.log4j.FailoverRedisAppender
log4j.appender.redis.endpoints=hostname:port
log4j.appender.redis.key=mykey
log4j.appender.redis.alwaysBatch=false
log4j.appender.redis.layout=net.logstash.log4j.JSONEventLayoutV1

Author: michellemay <mle...@gmail.com>

Closes #8109 from michellemay/SPARK-9826.

(cherry picked from commit ab7e721cfec63155641e81e72b4ad43cf6a7d4c7)

Conflicts:
	core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala
	core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala
	core/src/main/scala/org/apache/spark/util/Utils.scala
	sql/core/src/main/scala/org/apache/spark/sql/sources/SqlNewHadoopRDD.scala
	sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala
	sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala
	sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8ce86b23
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8ce86b23
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8ce86b23

Branch: refs/heads/branch-1.4
Commit: 8ce86b23f41fe6d1db4c64ec92fb8ddb438bb655
Parents: 89c8aea
Author: Michel Lemay <mle...@gmail.com>
Authored: Wed Aug 12 16:17:58 2015 -0700
Committer: Marcelo Vanzin <van...@cloudera.com>
Committed: Wed Aug 12 16:56:52 2015 -0700

----------------------------------------------------------------------
 .../scala/org/apache/spark/SparkContext.scala   |   5 +-
 .../spark/deploy/history/HistoryServer.scala    |   4 +-
 .../spark/deploy/worker/ExecutorRunner.scala    |   7 +-
 .../scala/org/apache/spark/rdd/HadoopRDD.scala  |   4 +-
 .../org/apache/spark/rdd/NewHadoopRDD.scala     |   4 +-
 .../apache/spark/storage/DiskBlockManager.scala |  10 +-
 .../spark/storage/TachyonBlockManager.scala     |   6 +-
 .../apache/spark/util/ShutdownHookManager.scala | 266 +++
 .../util/SparkUncaughtExceptionHandler.scala    |   2 +-
 .../scala/org/apache/spark/util/Utils.scala     | 222 +---
 .../spark/sql/sources/SqlNewHadoopRDD.scala     |   4 +-
 .../hive/thriftserver/HiveThriftServer2.scala   |   4 +-
 .../hive/thriftserver/SparkSQLCLIDriver.scala   |   4 +-
 .../apache/spark/sql/hive/test/TestHive.scala   |   4 +-
 .../spark/streaming/StreamingContext.scala      |   8 +-
 .../spark/deploy/yarn/ApplicationMaster.scala   |   5 +-
 16 files changed, 307 insertions(+), 252 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/spark/blob/8ce86b23/core/src/main/scala/org/apache/spark/SparkContext.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala
index 3caae87..25eae38 100644
--- a/core/src/main/scala/org/apache/spark/SparkContext.scala
+++ b/core/src/main/scala/org/apache/spark/SparkContext.scala
@@ -554,7 +554,8 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
     // Make sure the context is stopped if the user forgets about it. This avoids leaving
     // unfinished event logs around after the JVM exits cleanly. It doesn't help if the JVM
     // is killed, though.
-    _shutdownHookRef = Utils.addShutdownHook(Utils.SPARK_CONTEXT_SHUTDOWN_PRIORITY) { () =>
+    _shutdownHookRef = ShutdownHookManager.addShutdownHook(
+      ShutdownHookManager.SPARK_CONTEXT_SHUTDOWN_PRIORITY) { () =>
       logInfo("Invoking stop() from shutdown hook")
       stop()
     }
@@ -1627,7 +1628,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
       return
     }
     if (_shutdownHookRef != null) {
-      Utils.removeShutdownHook(_shutdownHookRef)
+      ShutdownHookManager.removeShutdownHook(_shutdownHookRef)
spark git commit: [SPARK-9913] [MLLIB] LDAUtils should be private
Repository: spark
Updated Branches:
  refs/heads/master 7035d880a -> caa14d9dc


[SPARK-9913] [MLLIB] LDAUtils should be private

feynmanliang

Author: Xiangrui Meng <m...@databricks.com>

Closes #8142 from mengxr/SPARK-9913.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/caa14d9d
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/caa14d9d
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/caa14d9d

Branch: refs/heads/master
Commit: caa14d9dc9e2eb1102052b22445b63b0e004e3c7
Parents: 7035d88
Author: Xiangrui Meng <m...@databricks.com>
Authored: Wed Aug 12 16:53:47 2015 -0700
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Wed Aug 12 16:53:47 2015 -0700

----------------------------------------------------------------------
 .../main/scala/org/apache/spark/mllib/clustering/LDAUtils.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/spark/blob/caa14d9d/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAUtils.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAUtils.scala
index f7e5ce1..a9ba7b6 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAUtils.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAUtils.scala
@@ -22,7 +22,7 @@ import breeze.numerics._
 /**
  * Utility methods for LDA.
  */
-object LDAUtils {
+private[clustering] object LDAUtils {
   /**
    * Log Sum Exp with overflow protection using the identity:
    * For any a: \log \sum_{n=1}^N \exp\{x_n\} = a + \log \sum_{n=1}^N \exp\{x_n - a\}
spark git commit: [SPARK-9780] [STREAMING] [KAFKA] prevent NPE if KafkaRDD instantiation …
Repository: spark
Updated Branches:
  refs/heads/master 660e6dcff -> 8ce60963c


[SPARK-9780] [STREAMING] [KAFKA] prevent NPE if KafkaRDD instantiation fails

Author: cody koeninger <c...@koeninger.org>

Closes #8133 from koeninger/SPARK-9780 and squashes the following commits:

406259d [cody koeninger] [SPARK-9780][Streaming][Kafka] prevent NPE if KafkaRDD instantiation fails

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8ce60963
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8ce60963
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8ce60963

Branch: refs/heads/master
Commit: 8ce60963cb0928058ef7b6e29ff94eb69d1143af
Parents: 660e6dc
Author: cody koeninger <c...@koeninger.org>
Authored: Wed Aug 12 17:44:16 2015 -0700
Committer: Tathagata Das <tathagata.das1...@gmail.com>
Committed: Wed Aug 12 17:44:16 2015 -0700

----------------------------------------------------------------------
 .../main/scala/org/apache/spark/streaming/kafka/KafkaRDD.scala | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/spark/blob/8ce60963/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaRDD.scala
----------------------------------------------------------------------
diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaRDD.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaRDD.scala
index 1a9d78c..ea5f842 100644
--- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaRDD.scala
+++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaRDD.scala
@@ -197,7 +197,11 @@ class KafkaRDD[
           .dropWhile(_.offset < requestOffset)
     }

-    override def close(): Unit = consumer.close()
+    override def close(): Unit = {
+      if (consumer != null) {
+        consumer.close()
+      }
+    }

     override def getNext(): R = {
       if (iter == null || !iter.hasNext) {
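Why the null check matters, as a hedged sketch of the failure path (names taken from the surrounding iterator class): cleanup is registered as a task-completion callback before `consumer` is successfully assigned, so if connecting to the Kafka leader throws during construction, close() used to run against a null consumer and mask the real connection error with a NullPointerException.

context.addTaskCompletionListener { _ => closeIfNeeded() }  // fires even when the task fails
// closeIfNeeded() ends up calling close(), which must therefore tolerate consumer == null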
spark git commit: [SPARK-9780] [STREAMING] [KAFKA] prevent NPE if KafkaRDD instantiation …
Repository: spark Updated Branches: refs/heads/branch-1.5 3298fb69f -> 62ab2a4c6 [SPARK-9780] [STREAMING] [KAFKA] prevent NPE if KafkaRDD instantiation … …fails Author: cody koeninger c...@koeninger.org Closes #8133 from koeninger/SPARK-9780 and squashes the following commits: 406259d [cody koeninger] [SPARK-9780][Streaming][Kafka] prevent NPE if KafkaRDD instantiation fails (cherry picked from commit 8ce60963cb0928058ef7b6e29ff94eb69d1143af) Signed-off-by: Tathagata Das tathagata.das1...@gmail.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/62ab2a4c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/62ab2a4c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/62ab2a4c Branch: refs/heads/branch-1.5 Commit: 62ab2a4c6b4b0cf4875ac1291562660b4b77cac4 Parents: 3298fb6 Author: cody koeninger c...@koeninger.org Authored: Wed Aug 12 17:44:16 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Wed Aug 12 17:44:27 2015 -0700 -- .../main/scala/org/apache/spark/streaming/kafka/KafkaRDD.scala | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/62ab2a4c/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaRDD.scala -- diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaRDD.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaRDD.scala index 1a9d78c..ea5f842 100644 --- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaRDD.scala +++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaRDD.scala @@ -197,7 +197,11 @@ class KafkaRDD[ .dropWhile(_.offset < requestOffset) } -override def close(): Unit = consumer.close() +override def close(): Unit = { + if (consumer != null) { + consumer.close() + } +} override def getNext(): R = { if (iter == null || !iter.hasNext) {
spark git commit: [SPARK-9915] [ML] stopWords should use StringArrayParam
Repository: spark Updated Branches: refs/heads/master e6aef5576 -> fc1c7fd66 [SPARK-9915] [ML] stopWords should use StringArrayParam hhbyyh Author: Xiangrui Meng m...@databricks.com Closes #8141 from mengxr/SPARK-9915. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fc1c7fd6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fc1c7fd6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fc1c7fd6 Branch: refs/heads/master Commit: fc1c7fd66e64ccea53b31cd2fbb98bc6d307329c Parents: e6aef55 Author: Xiangrui Meng m...@databricks.com Authored: Wed Aug 12 17:06:12 2015 -0700 Committer: Xiangrui Meng m...@databricks.com Committed: Wed Aug 12 17:06:12 2015 -0700 -- .../scala/org/apache/spark/ml/feature/StopWordsRemover.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/fc1c7fd6/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala index 3cc4142..5d77ea0 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala @@ -19,12 +19,12 @@ package org.apache.spark.ml.feature import org.apache.spark.annotation.Experimental import org.apache.spark.ml.Transformer +import org.apache.spark.ml.param.{BooleanParam, ParamMap, StringArrayParam} import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} -import org.apache.spark.ml.param.{ParamMap, BooleanParam, Param} import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.DataFrame -import org.apache.spark.sql.types.{StringType, StructField, ArrayType, StructType} import org.apache.spark.sql.functions.{col, udf} +import org.apache.spark.sql.types.{ArrayType, StringType, StructField, StructType} /** * stop words list @@ -100,7 +100,7 @@ class StopWordsRemover(override val uid: String) * the stop words set to be filtered out * @group param */ - val stopWords: Param[Array[String]] = new Param(this, "stopWords", "stop words") + val stopWords: StringArrayParam = new StringArrayParam(this, "stopWords", "stop words") /** @group setParam */ def setStopWords(value: Array[String]): this.type = set(stopWords, value)
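Nothing changes functionally for Scala callers; StringArrayParam is a concrete subclass of Param[Array[String]] whose main benefit is Java friendliness (for example, accepting a java.util.List[String] from Java code). A usage sketch against the public API, assuming spark-mllib 1.5 and a DataFrame df with an array-of-strings column named "raw":

```
import org.apache.spark.ml.feature.StopWordsRemover

val remover = new StopWordsRemover()
  .setInputCol("raw")
  .setOutputCol("filtered")
  .setStopWords(Array("a", "an", "the")) // now backed by StringArrayParam

// remover.transform(df) yields df plus a "filtered" column with the listed
// words removed from each row's token array.
```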
spark git commit: [SPARK-9092] Fixed incompatibility when both num-executors and dynamic...
Repository: spark Updated Branches: refs/heads/master a17384fa3 - 738f35398 [SPARK-9092] Fixed incompatibility when both num-executors and dynamic... … allocation are set. Now, dynamic allocation is set to false when num-executors is explicitly specified as an argument. Consequently, executorAllocationManager in not initialized in the SparkContext. Author: Niranjan Padmanabhan niranjan.padmanab...@cloudera.com Closes #7657 from neurons/SPARK-9092. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/738f3539 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/738f3539 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/738f3539 Branch: refs/heads/master Commit: 738f353988dbf02704bd63f5e35d94402c59ed79 Parents: a17384f Author: Niranjan Padmanabhan niranjan.padmanab...@cloudera.com Authored: Wed Aug 12 16:10:21 2015 -0700 Committer: Marcelo Vanzin van...@cloudera.com Committed: Wed Aug 12 16:10:21 2015 -0700 -- .../main/scala/org/apache/spark/SparkConf.scala | 19 +++ .../scala/org/apache/spark/SparkContext.scala| 6 +- .../org/apache/spark/deploy/SparkSubmit.scala| 4 ++-- .../main/scala/org/apache/spark/util/Utils.scala | 11 +++ .../org/apache/spark/SparkContextSuite.scala | 8 .../apache/spark/deploy/SparkSubmitSuite.scala | 1 - docs/running-on-yarn.md | 2 +- .../spark/deploy/yarn/ApplicationMaster.scala| 4 ++-- .../deploy/yarn/ApplicationMasterArguments.scala | 5 - .../org/apache/spark/deploy/yarn/Client.scala| 5 - .../spark/deploy/yarn/ClientArguments.scala | 8 +--- .../apache/spark/deploy/yarn/YarnAllocator.scala | 9 - .../cluster/YarnClientSchedulerBackend.scala | 3 --- .../spark/deploy/yarn/YarnAllocatorSuite.scala | 5 +++-- 14 files changed, 64 insertions(+), 26 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/738f3539/core/src/main/scala/org/apache/spark/SparkConf.scala -- diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala index 8ff154f..b344b5e 100644 --- a/core/src/main/scala/org/apache/spark/SparkConf.scala +++ b/core/src/main/scala/org/apache/spark/SparkConf.scala @@ -389,6 +389,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging { val driverOptsKey = spark.driver.extraJavaOptions val driverClassPathKey = spark.driver.extraClassPath val driverLibraryPathKey = spark.driver.extraLibraryPath +val sparkExecutorInstances = spark.executor.instances // Used by Yarn in 1.1 and before sys.props.get(spark.driver.libraryPath).foreach { value = @@ -476,6 +477,24 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging { } } } + +if (!contains(sparkExecutorInstances)) { + sys.env.get(SPARK_WORKER_INSTANCES).foreach { value = +val warning = + s + |SPARK_WORKER_INSTANCES was detected (set to '$value'). + |This is deprecated in Spark 1.0+. + | + |Please instead use: + | - ./spark-submit with --num-executors to specify the number of executors + | - Or set SPARK_EXECUTOR_INSTANCES + | - spark.executor.instances to configure the number of instances in the spark config. 
+.stripMargin +logWarning(warning) + +set("spark.executor.instances", value) + } +} } /** http://git-wip-us.apache.org/repos/asf/spark/blob/738f3539/core/src/main/scala/org/apache/spark/SparkContext.scala -- diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 6aafb4c..207a0c1 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -528,7 +528,11 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli } // Optionally scale number of executors dynamically based on workload. Exposed for testing. -val dynamicAllocationEnabled = _conf.getBoolean("spark.dynamicAllocation.enabled", false) +val dynamicAllocationEnabled = Utils.isDynamicAllocationEnabled(_conf) +if (!dynamicAllocationEnabled && _conf.getBoolean("spark.dynamicAllocation.enabled", false)) { + logInfo("Dynamic Allocation and num executors both set, thus dynamic allocation disabled.") +} + _executorAllocationManager = if (dynamicAllocationEnabled) { Some(new ExecutorAllocationManager(this, listenerBus, _conf))
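The precedence rule the patch centralizes in Utils.isDynamicAllocationEnabled can be summarized in a few lines. A self-contained sketch of the logic, illustrative only, operating on a plain Map rather than a SparkConf:

```
def isDynamicAllocationEnabled(conf: Map[String, String]): Boolean = {
  val enabled = conf.getOrElse("spark.dynamicAllocation.enabled", "false").toBoolean
  val explicitExecutors = conf.getOrElse("spark.executor.instances", "0").toInt
  // An explicit --num-executors (spark.executor.instances) wins over the flag.
  enabled && explicitExecutors == 0
}

// Both set: dynamic allocation is disabled, and the logInfo above is emitted.
assert(!isDynamicAllocationEnabled(Map(
  "spark.dynamicAllocation.enabled" -> "true",
  "spark.executor.instances" -> "4")))
assert(isDynamicAllocationEnabled(Map("spark.dynamicAllocation.enabled" -> "true")))
```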
spark git commit: [SPARK-9092] Fixed incompatibility when both num-executors and dynamic...
Repository: spark Updated Branches: refs/heads/branch-1.5 b28295fe0 - 8537e51d3 [SPARK-9092] Fixed incompatibility when both num-executors and dynamic... … allocation are set. Now, dynamic allocation is set to false when num-executors is explicitly specified as an argument. Consequently, executorAllocationManager in not initialized in the SparkContext. Author: Niranjan Padmanabhan niranjan.padmanab...@cloudera.com Closes #7657 from neurons/SPARK-9092. (cherry picked from commit 738f353988dbf02704bd63f5e35d94402c59ed79) Signed-off-by: Marcelo Vanzin van...@cloudera.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8537e51d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8537e51d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8537e51d Branch: refs/heads/branch-1.5 Commit: 8537e51d39f693c58732b07ceb6b4ad308d5a0ba Parents: b28295f Author: Niranjan Padmanabhan niranjan.padmanab...@cloudera.com Authored: Wed Aug 12 16:10:21 2015 -0700 Committer: Marcelo Vanzin van...@cloudera.com Committed: Wed Aug 12 16:10:43 2015 -0700 -- .../main/scala/org/apache/spark/SparkConf.scala | 19 +++ .../scala/org/apache/spark/SparkContext.scala| 6 +- .../org/apache/spark/deploy/SparkSubmit.scala| 4 ++-- .../main/scala/org/apache/spark/util/Utils.scala | 11 +++ .../org/apache/spark/SparkContextSuite.scala | 8 .../apache/spark/deploy/SparkSubmitSuite.scala | 1 - docs/running-on-yarn.md | 2 +- .../spark/deploy/yarn/ApplicationMaster.scala| 4 ++-- .../deploy/yarn/ApplicationMasterArguments.scala | 5 - .../org/apache/spark/deploy/yarn/Client.scala| 5 - .../spark/deploy/yarn/ClientArguments.scala | 8 +--- .../apache/spark/deploy/yarn/YarnAllocator.scala | 9 - .../cluster/YarnClientSchedulerBackend.scala | 3 --- .../spark/deploy/yarn/YarnAllocatorSuite.scala | 5 +++-- 14 files changed, 64 insertions(+), 26 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8537e51d/core/src/main/scala/org/apache/spark/SparkConf.scala -- diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala index 8ff154f..b344b5e 100644 --- a/core/src/main/scala/org/apache/spark/SparkConf.scala +++ b/core/src/main/scala/org/apache/spark/SparkConf.scala @@ -389,6 +389,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging { val driverOptsKey = spark.driver.extraJavaOptions val driverClassPathKey = spark.driver.extraClassPath val driverLibraryPathKey = spark.driver.extraLibraryPath +val sparkExecutorInstances = spark.executor.instances // Used by Yarn in 1.1 and before sys.props.get(spark.driver.libraryPath).foreach { value = @@ -476,6 +477,24 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging { } } } + +if (!contains(sparkExecutorInstances)) { + sys.env.get(SPARK_WORKER_INSTANCES).foreach { value = +val warning = + s + |SPARK_WORKER_INSTANCES was detected (set to '$value'). + |This is deprecated in Spark 1.0+. + | + |Please instead use: + | - ./spark-submit with --num-executors to specify the number of executors + | - Or set SPARK_EXECUTOR_INSTANCES + | - spark.executor.instances to configure the number of instances in the spark config. 
+.stripMargin +logWarning(warning) + +set("spark.executor.instances", value) + } +} } /** http://git-wip-us.apache.org/repos/asf/spark/blob/8537e51d/core/src/main/scala/org/apache/spark/SparkContext.scala -- diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 6aafb4c..207a0c1 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -528,7 +528,11 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli } // Optionally scale number of executors dynamically based on workload. Exposed for testing. -val dynamicAllocationEnabled = _conf.getBoolean("spark.dynamicAllocation.enabled", false) +val dynamicAllocationEnabled = Utils.isDynamicAllocationEnabled(_conf) +if (!dynamicAllocationEnabled && _conf.getBoolean("spark.dynamicAllocation.enabled", false)) { + logInfo("Dynamic Allocation and num executors both set, thus dynamic allocation disabled.") +} + _executorAllocationManager = if
spark git commit: [SPARK-9766] [ML] [PySpark] check and add miss docs for PySpark ML
Repository: spark Updated Branches: refs/heads/master 60103ecd3 - 762bacc16 [SPARK-9766] [ML] [PySpark] check and add miss docs for PySpark ML Check and add miss docs for PySpark ML (this issue only check miss docs for o.a.s.ml not o.a.s.mllib). Author: Yanbo Liang yblia...@gmail.com Closes #8059 from yanboliang/SPARK-9766. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/762bacc1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/762bacc1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/762bacc1 Branch: refs/heads/master Commit: 762bacc16ac5e74c8b05a7c1e3e367d1d1633cef Parents: 60103ec Author: Yanbo Liang yblia...@gmail.com Authored: Wed Aug 12 13:24:18 2015 -0700 Committer: Joseph K. Bradley jos...@databricks.com Committed: Wed Aug 12 13:24:18 2015 -0700 -- python/pyspark/ml/classification.py | 12 ++-- python/pyspark/ml/clustering.py | 4 +++- python/pyspark/ml/evaluation.py | 3 ++- python/pyspark/ml/feature.py| 9 + 4 files changed, 20 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/762bacc1/python/pyspark/ml/classification.py -- diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 5978d8f..6702dce 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -34,6 +34,7 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti HasRegParam, HasTol, HasProbabilityCol, HasRawPredictionCol): Logistic regression. +Currently, this class only supports binary classification. from pyspark.sql import Row from pyspark.mllib.linalg import Vectors @@ -96,8 +97,8 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti # is an L2 penalty. For alpha = 1, it is an L1 penalty. self.elasticNetParam = \ Param(self, elasticNetParam, - the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty + - is an L2 penalty. For alpha = 1, it is an L1 penalty.) + the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, + + the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.) #: param for whether to fit an intercept term. self.fitIntercept = Param(self, fitIntercept, whether to fit an intercept term.) #: param for threshold in binary classification prediction, in range [0, 1]. @@ -656,6 +657,13 @@ class NaiveBayes(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, H HasRawPredictionCol): Naive Bayes Classifiers. +It supports both Multinomial and Bernoulli NB. Multinomial NB + (`http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html`) +can handle finitely supported discrete data. For example, by converting documents into +TF-IDF vectors, it can be used for document classification. By making every vector a +binary (0/1) data, it can also be used as Bernoulli NB + (`http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html`). +The input feature values must be nonnegative. 
from pyspark.sql import Row from pyspark.mllib.linalg import Vectors http://git-wip-us.apache.org/repos/asf/spark/blob/762bacc1/python/pyspark/ml/clustering.py -- diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py index b5e9b65..4833871 100644 --- a/python/pyspark/ml/clustering.py +++ b/python/pyspark/ml/clustering.py @@ -37,7 +37,9 @@ class KMeansModel(JavaModel): @inherit_doc class KMeans(JavaEstimator, HasFeaturesCol, HasMaxIter, HasSeed): -K-means Clustering +K-means clustering with support for multiple parallel runs and a k-means++ like initialization +mode (the k-means|| algorithm by Bahmani et al). When multiple concurrent runs are requested, +they are executed together with joint passes over the data for efficiency. from pyspark.mllib.linalg import Vectors data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),), http://git-wip-us.apache.org/repos/asf/spark/blob/762bacc1/python/pyspark/ml/evaluation.py -- diff --git a/python/pyspark/ml/evaluation.py b/python/pyspark/ml/evaluation.py index 06e8093..2734092 100644 --- a/python/pyspark/ml/evaluation.py +++ b/python/pyspark/ml/evaluation.py @@ -23,7 +23,8 @@ from pyspark.ml.param.shared import HasLabelCol, HasPredictionCol, HasRawPredict from
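The new NaiveBayes docstring mirrors behavior the Scala side exposes through a modelType argument. A sketch against the RDD-based Scala API (the Python class documented above wraps the corresponding spark.ml estimator; the training-data name here is illustrative):

```
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

def trainBoth(training: RDD[LabeledPoint]) = {
  // Multinomial NB: feature values are nonnegative counts (e.g. TF-IDF weights).
  val multinomial = NaiveBayes.train(training, lambda = 1.0, modelType = "multinomial")
  // Bernoulli NB: feature values must be 0/1 presence indicators.
  val bernoulli = NaiveBayes.train(training, lambda = 1.0, modelType = "bernoulli")
  (multinomial, bernoulli)
}
```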
spark git commit: [SPARK-8967] [DOC] add Since annotation
Repository: spark Updated Branches: refs/heads/branch-1.5 bdf8dc15d - 6a7582ea2 [SPARK-8967] [DOC] add Since annotation Add `Since` as a Scala annotation. The benefit is that we can use it without having explicit JavaDoc. This is useful for inherited methods. The limitation is that is doesn't show up in the generated Java API documentation. This might be fixed by modifying genjavadoc. I think we could leave it as a TODO. This is how the generated Scala doc looks: `since` JavaDoc tag: ![screen shot 2015-08-11 at 10 00 37 pm](https://cloud.githubusercontent.com/assets/829644/9230761/fa72865c-40d8-11e5-807e-0f3c815c5acd.png) `Since` annotation: ![screen shot 2015-08-11 at 10 00 28 pm](https://cloud.githubusercontent.com/assets/829644/9230764/0041d7f4-40d9-11e5-8124-c3f3e5d5b31f.png) rxin Author: Xiangrui Meng m...@databricks.com Closes #8131 from mengxr/SPARK-8967. (cherry picked from commit 6f60298b1d7aa97268a42eca1e3b4851a7e88cb5) Signed-off-by: Xiangrui Meng m...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6a7582ea Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6a7582ea Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6a7582ea Branch: refs/heads/branch-1.5 Commit: 6a7582ea2d232982c3480e7d4ee357ea45d0b303 Parents: bdf8dc1 Author: Xiangrui Meng m...@databricks.com Authored: Wed Aug 12 14:28:23 2015 -0700 Committer: Xiangrui Meng m...@databricks.com Committed: Wed Aug 12 14:28:34 2015 -0700 -- .../org/apache/spark/annotation/Since.scala | 28 1 file changed, 28 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6a7582ea/core/src/main/scala/org/apache/spark/annotation/Since.scala -- diff --git a/core/src/main/scala/org/apache/spark/annotation/Since.scala b/core/src/main/scala/org/apache/spark/annotation/Since.scala new file mode 100644 index 000..fa59393 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/annotation/Since.scala @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the License); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.annotation + +import scala.annotation.StaticAnnotation + +/** + * A Scala annotation that specifies the Spark version when a definition was added. + * Different from the `@since` tag in JavaDoc, this annotation does not require explicit JavaDoc and + * hence works for overridden methods that inherit API documentation directly from parents. + * The limitation is that it does not show up in the generated Java API documentation. + */ +private[spark] class Since(version: String) extends StaticAnnotation - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
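Because the annotation is private[spark], it is only for Spark's own sources. A sketch of the intended usage inside the codebase, with illustrative class names; the point is that the override inherits its ScalaDoc from the parent while still carrying version information:

```
import org.apache.spark.annotation.Since

abstract class Model {
  /** Predicts the label for the given feature vector. */
  def predict(features: Array[Double]): Double
}

class LinearModel(weights: Array[Double]) extends Model {
  @Since("1.5.0") // no duplicated ScalaDoc needed, unlike a `@since` JavaDoc tag
  override def predict(features: Array[Double]): Double =
    weights.zip(features).map { case (w, x) => w * x }.sum
}
```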
spark git commit: [SPARK-9894] [SQL] Json writer should handle MapData.
Repository: spark Updated Branches: refs/heads/branch-1.5 74c9dcec3 - 08f767a1e [SPARK-9894] [SQL] Json writer should handle MapData. https://issues.apache.org/jira/browse/SPARK-9894 Author: Yin Huai yh...@databricks.com Closes #8137 from yhuai/jsonMapData. (cherry picked from commit 7035d880a0cf06910c19b4afd49645124c620f14) Signed-off-by: Michael Armbrust mich...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/08f767a1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/08f767a1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/08f767a1 Branch: refs/heads/branch-1.5 Commit: 08f767a1eb4b093938c2d6ddfba7e3133a59a754 Parents: 74c9dce Author: Yin Huai yh...@databricks.com Authored: Wed Aug 12 16:45:15 2015 -0700 Committer: Michael Armbrust mich...@databricks.com Committed: Wed Aug 12 16:45:32 2015 -0700 -- .../datasources/json/JacksonGenerator.scala | 10 +-- .../sql/sources/JsonHadoopFsRelationSuite.scala | 78 .../SimpleTextHadoopFsRelationSuite.scala | 30 3 files changed, 83 insertions(+), 35 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/08f767a1/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonGenerator.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonGenerator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonGenerator.scala index 37c2b5a..99ac773 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonGenerator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonGenerator.scala @@ -107,12 +107,12 @@ private[sql] object JacksonGenerator { v.foreach(ty, (_, value) = valWriter(ty, value)) gen.writeEndArray() - case (MapType(kv, vv, _), v: Map[_, _]) = + case (MapType(kt, vt, _), v: MapData) = gen.writeStartObject() -v.foreach { p = - gen.writeFieldName(p._1.toString) - valWriter(vv, p._2) -} +v.foreach(kt, vt, { (k, v) = + gen.writeFieldName(k.toString) + valWriter(vt, v) +}) gen.writeEndObject() case (StructType(ty), v: InternalRow) = http://git-wip-us.apache.org/repos/asf/spark/blob/08f767a1/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala new file mode 100644 index 000..ed6d512 --- /dev/null +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the License); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.sources + +import org.apache.hadoop.fs.Path + +import org.apache.spark.deploy.SparkHadoopUtil +import org.apache.spark.sql.Row +import org.apache.spark.sql.types._ + +class JsonHadoopFsRelationSuite extends HadoopFsRelationTest { + override val dataSourceName: String = "json" + + import sqlContext._ + + test("save()/load() - partitioned table - simple queries - partition columns in data") { +withTempDir { file => + val basePath = new Path(file.getCanonicalPath) + val fs = basePath.getFileSystem(SparkHadoopUtil.get.conf) + val qualifiedBasePath = fs.makeQualified(basePath) + + for (p1 <- 1 to 2; p2 <- Seq("foo", "bar")) { +val partitionDir = new Path(qualifiedBasePath, s"p1=$p1/p2=$p2") +sparkContext + .parallelize(for (i <- 1 to 3) yield s"""{"a":$i,"b":"val_$i"}""") + .saveAsTextFile(partitionDir.toString) + } + + val dataSchemaWithPartition = +StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true)) + + checkQueries(
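The symptom being fixed: rows carrying MapType columns now reach the JSON writer as MapData rather than a Scala Map, so the old pattern match never fired. A repro-style sketch of the round trip the new suite exercises (assumes a SQLContext named sqlContext; the path and column names are illustrative):

```
import sqlContext.implicits._

val df = Seq((1, Map("a" -> 10, "b" -> 20))).toDF("id", "counts")

// Before SPARK-9894 this write failed, because valWriter matched on a Scala
// Map while the runtime value was a MapData.
df.write.json("/tmp/json-with-map-column")
sqlContext.read.json("/tmp/json-with-map-column").show()
```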
spark git commit: [SPARK-9912] [MLLIB] QRDecomposition should use QType and RType for type names instead of UType and VType
Repository: spark Updated Branches: refs/heads/master 6e409bc13 - e6aef5576 [SPARK-9912] [MLLIB] QRDecomposition should use QType and RType for type names instead of UType and VType hhbyyh Author: Xiangrui Meng m...@databricks.com Closes #8140 from mengxr/SPARK-9912. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e6aef557 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e6aef557 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e6aef557 Branch: refs/heads/master Commit: e6aef55766d0e2a48e0f9cb6eda0e31a71b962f3 Parents: 6e409bc Author: Xiangrui Meng m...@databricks.com Authored: Wed Aug 12 17:04:31 2015 -0700 Committer: Xiangrui Meng m...@databricks.com Committed: Wed Aug 12 17:04:31 2015 -0700 -- .../org/apache/spark/mllib/linalg/SingularValueDecomposition.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e6aef557/mllib/src/main/scala/org/apache/spark/mllib/linalg/SingularValueDecomposition.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SingularValueDecomposition.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SingularValueDecomposition.scala index b416d50..cff5dbe 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SingularValueDecomposition.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SingularValueDecomposition.scala @@ -31,5 +31,5 @@ case class SingularValueDecomposition[UType, VType](U: UType, s: Vector, V: VTyp * Represents QR factors. */ @Experimental -case class QRDecomposition[UType, VType](Q: UType, R: VType) +case class QRDecomposition[QType, RType](Q: QType, R: RType) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
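The rename touches only the type-parameter names, so it is purely cosmetic and source-compatible; positional usage such as pattern binding is unaffected. A sketch of the call site that returns this type (tallSkinnyQR is part of the 1.5 RowMatrix API):

```
import org.apache.spark.mllib.linalg.QRDecomposition
import org.apache.spark.mllib.linalg.distributed.RowMatrix

def describeQR(mat: RowMatrix): Unit = {
  // Returns QRDecomposition[RowMatrix, Matrix]; Q is only computed on request.
  val QRDecomposition(q, r) = mat.tallSkinnyQR(computeQ = true)
  println(s"Q: ${q.numRows()} x ${q.numCols()}, R: ${r.numRows} x ${r.numCols}")
}
```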
spark git commit: [SPARK-9912] [MLLIB] QRDecomposition should use QType and RType for type names instead of UType and VType
Repository: spark Updated Branches: refs/heads/branch-1.5 2f8793b5f - 31b7fdc06 [SPARK-9912] [MLLIB] QRDecomposition should use QType and RType for type names instead of UType and VType hhbyyh Author: Xiangrui Meng m...@databricks.com Closes #8140 from mengxr/SPARK-9912. (cherry picked from commit e6aef55766d0e2a48e0f9cb6eda0e31a71b962f3) Signed-off-by: Xiangrui Meng m...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/31b7fdc0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/31b7fdc0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/31b7fdc0 Branch: refs/heads/branch-1.5 Commit: 31b7fdc06fc21fa38ac4530f9c70dd27b3b71578 Parents: 2f8793b Author: Xiangrui Meng m...@databricks.com Authored: Wed Aug 12 17:04:31 2015 -0700 Committer: Xiangrui Meng m...@databricks.com Committed: Wed Aug 12 17:04:37 2015 -0700 -- .../org/apache/spark/mllib/linalg/SingularValueDecomposition.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/31b7fdc0/mllib/src/main/scala/org/apache/spark/mllib/linalg/SingularValueDecomposition.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SingularValueDecomposition.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SingularValueDecomposition.scala index b416d50..cff5dbe 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SingularValueDecomposition.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SingularValueDecomposition.scala @@ -31,5 +31,5 @@ case class SingularValueDecomposition[UType, VType](U: UType, s: Vector, V: VTyp * Represents QR factors. */ @Experimental -case class QRDecomposition[UType, VType](Q: UType, R: VType) +case class QRDecomposition[QType, RType](Q: QType, R: RType) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-9449] [SQL] Include MetastoreRelation's inputFiles
Repository: spark Updated Branches: refs/heads/master fc1c7fd66 - 660e6dcff [SPARK-9449] [SQL] Include MetastoreRelation's inputFiles Author: Michael Armbrust mich...@databricks.com Closes #8119 from marmbrus/metastoreInputFiles. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/660e6dcf Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/660e6dcf Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/660e6dcf Branch: refs/heads/master Commit: 660e6dcff8125b83cc73dbe00c90cbe58744bc66 Parents: fc1c7fd Author: Michael Armbrust mich...@databricks.com Authored: Wed Aug 12 17:07:29 2015 -0700 Committer: Michael Armbrust mich...@databricks.com Committed: Wed Aug 12 17:07:29 2015 -0700 -- .../scala/org/apache/spark/sql/DataFrame.scala | 10 --- .../spark/sql/execution/FileRelation.scala | 28 .../apache/spark/sql/sources/interfaces.scala | 6 +++-- .../org/apache/spark/sql/DataFrameSuite.scala | 26 +- .../spark/sql/hive/HiveMetastoreCatalog.scala | 16 +-- 5 files changed, 66 insertions(+), 20 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/660e6dcf/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index 27b994f..c466d9e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -34,10 +34,10 @@ import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.plans.logical.{Filter, _} +import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.plans.{Inner, JoinType} import org.apache.spark.sql.catalyst.{CatalystTypeConverters, ScalaReflection, SqlParser} -import org.apache.spark.sql.execution.{EvaluatePython, ExplainCommand, LogicalRDD, SQLExecution} +import org.apache.spark.sql.execution.{EvaluatePython, ExplainCommand, FileRelation, LogicalRDD, SQLExecution} import org.apache.spark.sql.execution.datasources.{CreateTableUsingAsSelect, LogicalRelation} import org.apache.spark.sql.execution.datasources.json.JacksonGenerator import org.apache.spark.sql.sources.HadoopFsRelation @@ -1560,8 +1560,10 @@ class DataFrame private[sql]( */ def inputFiles: Array[String] = { val files: Seq[String] = logicalPlan.collect { - case LogicalRelation(fsBasedRelation: HadoopFsRelation) = -fsBasedRelation.paths.toSeq + case LogicalRelation(fsBasedRelation: FileRelation) = +fsBasedRelation.inputFiles + case fr: FileRelation = +fr.inputFiles }.flatten files.toSet.toArray } http://git-wip-us.apache.org/repos/asf/spark/blob/660e6dcf/sql/core/src/main/scala/org/apache/spark/sql/execution/FileRelation.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/FileRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/FileRelation.scala new file mode 100644 index 000..7a2a9ee --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/FileRelation.scala @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution + +/** + * An interface for relations that are backed by files. When a class implements this interface, + * the list of paths that it returns will be returned to a user who calls `inputPaths` on any + * DataFrame that queries this relation. + */ +private[sql] trait FileRelation { + /** Returns the list of files that will be read when scanning this relation. */ + def inputFiles: Array[String] +}
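The user-visible effect is that DataFrame.inputFiles now also works for Hive metastore tables, not only HadoopFsRelation-backed sources. A usage sketch, assuming a HiveContext named hiveContext and an existing Hive table (the table name is illustrative):

```
val logs = hiveContext.table("web_logs")

// With MetastoreRelation implementing FileRelation, this reports the files
// backing the Hive table instead of returning an empty array.
logs.inputFiles.foreach(println)
```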
spark git commit: [SPARK-9915] [ML] stopWords should use StringArrayParam
Repository: spark Updated Branches: refs/heads/branch-1.5 31b7fdc06 -> ed73f5439 [SPARK-9915] [ML] stopWords should use StringArrayParam hhbyyh Author: Xiangrui Meng m...@databricks.com Closes #8141 from mengxr/SPARK-9915. (cherry picked from commit fc1c7fd66e64ccea53b31cd2fbb98bc6d307329c) Signed-off-by: Xiangrui Meng m...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ed73f543 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ed73f543 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ed73f543 Branch: refs/heads/branch-1.5 Commit: ed73f5439bbe3a09adf9a770c34b5d87b35499c8 Parents: 31b7fdc Author: Xiangrui Meng m...@databricks.com Authored: Wed Aug 12 17:06:12 2015 -0700 Committer: Xiangrui Meng m...@databricks.com Committed: Wed Aug 12 17:06:19 2015 -0700 -- .../scala/org/apache/spark/ml/feature/StopWordsRemover.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ed73f543/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala index 3cc4142..5d77ea0 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala @@ -19,12 +19,12 @@ package org.apache.spark.ml.feature import org.apache.spark.annotation.Experimental import org.apache.spark.ml.Transformer +import org.apache.spark.ml.param.{BooleanParam, ParamMap, StringArrayParam} import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} -import org.apache.spark.ml.param.{ParamMap, BooleanParam, Param} import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.DataFrame -import org.apache.spark.sql.types.{StringType, StructField, ArrayType, StructType} import org.apache.spark.sql.functions.{col, udf} +import org.apache.spark.sql.types.{ArrayType, StringType, StructField, StructType} /** * stop words list @@ -100,7 +100,7 @@ class StopWordsRemover(override val uid: String) * the stop words set to be filtered out * @group param */ - val stopWords: Param[Array[String]] = new Param(this, "stopWords", "stop words") + val stopWords: StringArrayParam = new StringArrayParam(this, "stopWords", "stop words") /** @group setParam */ def setStopWords(value: Array[String]): this.type = set(stopWords, value)
spark git commit: [SPARK-9724] [WEB UI] Avoid unnecessary redirects in the Spark Web UI.
Repository: spark Updated Branches: refs/heads/master 8ce60963c - 0d1d146c2 [SPARK-9724] [WEB UI] Avoid unnecessary redirects in the Spark Web UI. Author: Rohit Agarwal roh...@qubole.com Closes #8014 from mindprince/SPARK-9724 and squashes the following commits: a7af5ff [Rohit Agarwal] [SPARK-9724] [WEB UI] Inline attachPrefix and attachPrefixForRedirect. Fix logic of attachPrefix 8a977cd [Rohit Agarwal] [SPARK-9724] [WEB UI] Address review comments: Remove unneeded code, update scaladoc. b257844 [Rohit Agarwal] [SPARK-9724] [WEB UI] Avoid unnecessary redirects in the Spark Web UI. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0d1d146c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0d1d146c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0d1d146c Branch: refs/heads/master Commit: 0d1d146c220f0d47d0e62b368d5b94d3bd9dd197 Parents: 8ce6096 Author: Rohit Agarwal roh...@qubole.com Authored: Wed Aug 12 17:48:43 2015 -0700 Committer: Marcelo Vanzin van...@cloudera.com Committed: Wed Aug 12 17:48:43 2015 -0700 -- .../main/scala/org/apache/spark/ui/JettyUtils.scala| 13 ++--- core/src/main/scala/org/apache/spark/ui/SparkUI.scala | 4 ++-- 2 files changed, 8 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0d1d146c/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala -- diff --git a/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala b/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala index c835646..779c0ba 100644 --- a/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala +++ b/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala @@ -106,7 +106,11 @@ private[spark] object JettyUtils extends Logging { path: String, servlet: HttpServlet, basePath: String): ServletContextHandler = { -val prefixedPath = attachPrefix(basePath, path) +val prefixedPath = if (basePath == path == /) { + path +} else { + (basePath + path).stripSuffix(/) +} val contextHandler = new ServletContextHandler val holder = new ServletHolder(servlet) contextHandler.setContextPath(prefixedPath) @@ -121,7 +125,7 @@ private[spark] object JettyUtils extends Logging { beforeRedirect: HttpServletRequest = Unit = x = (), basePath: String = , httpMethods: Set[String] = Set(GET)): ServletContextHandler = { -val prefixedDestPath = attachPrefix(basePath, destPath) +val prefixedDestPath = basePath + destPath val servlet = new HttpServlet { override def doGet(request: HttpServletRequest, response: HttpServletResponse): Unit = { if (httpMethods.contains(GET)) { @@ -246,11 +250,6 @@ private[spark] object JettyUtils extends Logging { val (server, boundPort) = Utils.startServiceOnPort[Server](port, connect, conf, serverName) ServerInfo(server, boundPort, collection) } - - /** Attach a prefix to the given path, but avoid returning an empty path */ - private def attachPrefix(basePath: String, relativePath: String): String = { -if (basePath == ) relativePath else (basePath + relativePath).stripSuffix(/) - } } private[spark] case class ServerInfo( http://git-wip-us.apache.org/repos/asf/spark/blob/0d1d146c/core/src/main/scala/org/apache/spark/ui/SparkUI.scala -- diff --git a/core/src/main/scala/org/apache/spark/ui/SparkUI.scala b/core/src/main/scala/org/apache/spark/ui/SparkUI.scala index 3788916..d8b9056 100644 --- a/core/src/main/scala/org/apache/spark/ui/SparkUI.scala +++ b/core/src/main/scala/org/apache/spark/ui/SparkUI.scala @@ -64,11 +64,11 @@ private[spark] class SparkUI private ( attachTab(new 
EnvironmentTab(this)) attachTab(new ExecutorsTab(this)) attachHandler(createStaticHandler(SparkUI.STATIC_RESOURCE_DIR, "/static")) -attachHandler(createRedirectHandler("/", "/jobs", basePath = basePath)) +attachHandler(createRedirectHandler("/", "/jobs/", basePath = basePath)) attachHandler(ApiRootResource.getServletHandler(this)) // This should be POST only, but, the YARN AM proxy won't proxy POSTs attachHandler(createRedirectHandler( - "/stages/stage/kill", "/stages", stagesTab.handleKillRequest, + "/stages/stage/kill", "/stages/", stagesTab.handleKillRequest, httpMethods = Set("GET", "POST"))) } initialize()
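Two things happen here: redirect targets gain a trailing slash ("/jobs/", "/stages/") so the browser does not need a second redirect to the context path, and the old attachPrefix helper is inlined with its root-path handling corrected. A pure-function sketch of the inlined rule from the diff, with the edge cases spelled out (illustrative; Spark now computes this inline rather than through a helper):

```
def prefixedPath(basePath: String, path: String): String =
  if (basePath == "" && path == "/") path
  else (basePath + path).stripSuffix("/")

assert(prefixedPath("", "/") == "/")                        // root must stay "/"
assert(prefixedPath("", "/jobs/") == "/jobs")               // context paths lose the slash
assert(prefixedPath("/proxy/app-1", "/") == "/proxy/app-1") // prefix preserved
```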
spark git commit: [SPARK-9827] [SQL] fix fd leak in UnsafeRowSerializer
Repository: spark Updated Branches: refs/heads/master 7b13ed27c - 7c35746c9 [SPARK-9827] [SQL] fix fd leak in UnsafeRowSerializer Currently, UnsafeRowSerializer does not close the InputStream, will cause fd leak if the InputStream has an open fd in it. TODO: the fd could still be leaked, if any items in the stream is not consumed. Currently it replies on GC to close the fd in this case. cc JoshRosen Author: Davies Liu dav...@databricks.com Closes #8116 from davies/fd_leak. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7c35746c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7c35746c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7c35746c Branch: refs/heads/master Commit: 7c35746c916cf0019367850e75a080d7e739dba0 Parents: 7b13ed2 Author: Davies Liu dav...@databricks.com Authored: Wed Aug 12 20:02:55 2015 -0700 Committer: Reynold Xin r...@databricks.com Committed: Wed Aug 12 20:02:55 2015 -0700 -- .../sql/execution/UnsafeRowSerializer.scala | 2 ++ .../execution/UnsafeRowSerializerSuite.scala| 31 ++-- 2 files changed, 30 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7c35746c/sql/core/src/main/scala/org/apache/spark/sql/execution/UnsafeRowSerializer.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/UnsafeRowSerializer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/UnsafeRowSerializer.scala index 3860c4b..5c18558 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/UnsafeRowSerializer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/UnsafeRowSerializer.scala @@ -108,6 +108,7 @@ private class UnsafeRowSerializerInstance(numFields: Int) extends SerializerInst override def asKeyValueIterator: Iterator[(Int, UnsafeRow)] = { new Iterator[(Int, UnsafeRow)] { private[this] var rowSize: Int = dIn.readInt() + if (rowSize == EOF) dIn.close() override def hasNext: Boolean = rowSize != EOF @@ -119,6 +120,7 @@ private class UnsafeRowSerializerInstance(numFields: Int) extends SerializerInst row.pointTo(rowBuffer, Platform.BYTE_ARRAY_OFFSET, numFields, rowSize) rowSize = dIn.readInt() // read the next row's size if (rowSize == EOF) { // We are returning the last row in this stream + dIn.close() val _rowTuple = rowTuple // Null these out so that the byte array can be garbage collected once the entire // iterator has been consumed http://git-wip-us.apache.org/repos/asf/spark/blob/7c35746c/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala index 40b47ae..bd02c73 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.execution -import java.io.{ByteArrayInputStream, ByteArrayOutputStream} +import java.io.{DataOutputStream, ByteArrayInputStream, ByteArrayOutputStream} import org.apache.spark.SparkFunSuite import org.apache.spark.sql.Row @@ -25,6 +25,18 @@ import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.expressions.{UnsafeProjection, UnsafeRow} import org.apache.spark.sql.types._ + +/** + * used to test close InputStream in UnsafeRowSerializer + */ +class 
ClosableByteArrayInputStream(buf: Array[Byte]) extends ByteArrayInputStream(buf) { + var closed: Boolean = false + override def close(): Unit = { +closed = true +super.close() + } +} + class UnsafeRowSerializerSuite extends SparkFunSuite { private def toUnsafeRow(row: Row, schema: Array[DataType]): UnsafeRow = { @@ -52,8 +64,8 @@ class UnsafeRowSerializerSuite extends SparkFunSuite { serializerStream.writeValue(unsafeRow) } serializerStream.close() -val deserializerIter = serializer.deserializeStream( - new ByteArrayInputStream(baos.toByteArray)).asKeyValueIterator +val input = new ClosableByteArrayInputStream(baos.toByteArray) +val deserializerIter = serializer.deserializeStream(input).asKeyValueIterator for (expectedRow - unsafeRows) { val actualRow = deserializerIter.next().asInstanceOf[(Integer, UnsafeRow)]._2 assert(expectedRow.getSizeInBytes ===
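The pattern behind the fix generalizes: close the underlying stream as soon as the end-of-stream sentinel is read, instead of waiting for finalization. A generic sketch of a close-on-exhaustion iterator (illustrative; the real code does this inline with a DataInputStream and the EOF marker), with the same caveat the commit message notes, namely that abandoning the iterator early still leaks the fd until GC:

```
import java.io.Closeable

class CloseOnExhaustIterator[A](underlying: Iterator[A], resource: Closeable)
  extends Iterator[A] {
  private var closed = false

  private def closeOnce(): Unit = if (!closed) { resource.close(); closed = true }

  override def hasNext: Boolean = {
    val more = underlying.hasNext
    if (!more) closeOnce() // release the file descriptor eagerly at end of stream
    more
  }

  override def next(): A = underlying.next()
}
```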
spark git commit: [SPARK-9827] [SQL] fix fd leak in UnsafeRowSerializer
Repository: spark Updated Branches: refs/heads/branch-1.5 4b547b91d - eebb3f945 [SPARK-9827] [SQL] fix fd leak in UnsafeRowSerializer Currently, UnsafeRowSerializer does not close the InputStream, will cause fd leak if the InputStream has an open fd in it. TODO: the fd could still be leaked, if any items in the stream is not consumed. Currently it replies on GC to close the fd in this case. cc JoshRosen Author: Davies Liu dav...@databricks.com Closes #8116 from davies/fd_leak. (cherry picked from commit 7c35746c916cf0019367850e75a080d7e739dba0) Signed-off-by: Reynold Xin r...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/eebb3f94 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/eebb3f94 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/eebb3f94 Branch: refs/heads/branch-1.5 Commit: eebb3f9451be43c4958ecfd1eff7d0f27cd452ae Parents: 4b547b9 Author: Davies Liu dav...@databricks.com Authored: Wed Aug 12 20:02:55 2015 -0700 Committer: Reynold Xin r...@databricks.com Committed: Wed Aug 12 20:03:09 2015 -0700 -- .../sql/execution/UnsafeRowSerializer.scala | 2 ++ .../execution/UnsafeRowSerializerSuite.scala| 31 ++-- 2 files changed, 30 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/eebb3f94/sql/core/src/main/scala/org/apache/spark/sql/execution/UnsafeRowSerializer.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/UnsafeRowSerializer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/UnsafeRowSerializer.scala index 3860c4b..5c18558 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/UnsafeRowSerializer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/UnsafeRowSerializer.scala @@ -108,6 +108,7 @@ private class UnsafeRowSerializerInstance(numFields: Int) extends SerializerInst override def asKeyValueIterator: Iterator[(Int, UnsafeRow)] = { new Iterator[(Int, UnsafeRow)] { private[this] var rowSize: Int = dIn.readInt() + if (rowSize == EOF) dIn.close() override def hasNext: Boolean = rowSize != EOF @@ -119,6 +120,7 @@ private class UnsafeRowSerializerInstance(numFields: Int) extends SerializerInst row.pointTo(rowBuffer, Platform.BYTE_ARRAY_OFFSET, numFields, rowSize) rowSize = dIn.readInt() // read the next row's size if (rowSize == EOF) { // We are returning the last row in this stream + dIn.close() val _rowTuple = rowTuple // Null these out so that the byte array can be garbage collected once the entire // iterator has been consumed http://git-wip-us.apache.org/repos/asf/spark/blob/eebb3f94/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala index 40b47ae..bd02c73 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.execution -import java.io.{ByteArrayInputStream, ByteArrayOutputStream} +import java.io.{DataOutputStream, ByteArrayInputStream, ByteArrayOutputStream} import org.apache.spark.SparkFunSuite import org.apache.spark.sql.Row @@ -25,6 +25,18 @@ import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.expressions.{UnsafeProjection, 
UnsafeRow} import org.apache.spark.sql.types._ + +/** + * used to test close InputStream in UnsafeRowSerializer + */ +class ClosableByteArrayInputStream(buf: Array[Byte]) extends ByteArrayInputStream(buf) { + var closed: Boolean = false + override def close(): Unit = { +closed = true +super.close() + } +} + class UnsafeRowSerializerSuite extends SparkFunSuite { private def toUnsafeRow(row: Row, schema: Array[DataType]): UnsafeRow = { @@ -52,8 +64,8 @@ class UnsafeRowSerializerSuite extends SparkFunSuite { serializerStream.writeValue(unsafeRow) } serializerStream.close() -val deserializerIter = serializer.deserializeStream( - new ByteArrayInputStream(baos.toByteArray)).asKeyValueIterator +val input = new ClosableByteArrayInputStream(baos.toByteArray) +val deserializerIter = serializer.deserializeStream(input).asKeyValueIterator for (expectedRow - unsafeRows) { val
spark git commit: [SPARK-9908] [SQL] When spark.sql.tungsten.enabled is false, broadcast join does not work
Repository: spark Updated Branches: refs/heads/branch-1.5 eebb3f945 - 71ea61f90 [SPARK-9908] [SQL] When spark.sql.tungsten.enabled is false, broadcast join does not work https://issues.apache.org/jira/browse/SPARK-9908 Author: Yin Huai yh...@databricks.com Closes #8149 from yhuai/SPARK-9908. (cherry picked from commit 4413d0855aaba5cb00f737dc6934a0b92d9bc05d) Signed-off-by: Reynold Xin r...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/71ea61f9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/71ea61f9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/71ea61f9 Branch: refs/heads/branch-1.5 Commit: 71ea61f9063a83d4039347fab52255fccada19f1 Parents: eebb3f9 Author: Yin Huai yh...@databricks.com Authored: Wed Aug 12 20:03:55 2015 -0700 Committer: Reynold Xin r...@databricks.com Committed: Wed Aug 12 20:04:04 2015 -0700 -- .../org/apache/spark/sql/execution/joins/HashedRelation.scala | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/71ea61f9/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala index 076afe6..bb333b4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala @@ -66,7 +66,8 @@ private[joins] final class GeneralHashedRelation( private var hashTable: JavaHashMap[InternalRow, CompactBuffer[InternalRow]]) extends HashedRelation with Externalizable { - private def this() = this(null) // Needed for serialization + // Needed for serialization (it is public to make Java serialization work) + def this() = this(null) override def get(key: InternalRow): Seq[InternalRow] = hashTable.get(key) @@ -88,7 +89,8 @@ private[joins] final class UniqueKeyHashedRelation(private var hashTable: JavaHashMap[InternalRow, InternalRow]) extends HashedRelation with Externalizable { - private def this() = this(null) // Needed for serialization + // Needed for serialization (it is public to make Java serialization work) + def this() = this(null) override def get(key: InternalRow): Seq[InternalRow] = { val v = hashTable.get(key) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-9908] [SQL] When spark.sql.tungsten.enabled is false, broadcast join does not work
Repository: spark Updated Branches: refs/heads/master 7c35746c9 - 4413d0855 [SPARK-9908] [SQL] When spark.sql.tungsten.enabled is false, broadcast join does not work https://issues.apache.org/jira/browse/SPARK-9908 Author: Yin Huai yh...@databricks.com Closes #8149 from yhuai/SPARK-9908. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4413d085 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4413d085 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4413d085 Branch: refs/heads/master Commit: 4413d0855aaba5cb00f737dc6934a0b92d9bc05d Parents: 7c35746 Author: Yin Huai yh...@databricks.com Authored: Wed Aug 12 20:03:55 2015 -0700 Committer: Reynold Xin r...@databricks.com Committed: Wed Aug 12 20:03:55 2015 -0700 -- .../org/apache/spark/sql/execution/joins/HashedRelation.scala | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4413d085/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala index 076afe6..bb333b4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala @@ -66,7 +66,8 @@ private[joins] final class GeneralHashedRelation( private var hashTable: JavaHashMap[InternalRow, CompactBuffer[InternalRow]]) extends HashedRelation with Externalizable { - private def this() = this(null) // Needed for serialization + // Needed for serialization (it is public to make Java serialization work) + def this() = this(null) override def get(key: InternalRow): Seq[InternalRow] = hashTable.get(key) @@ -88,7 +89,8 @@ private[joins] final class UniqueKeyHashedRelation(private var hashTable: JavaHashMap[InternalRow, InternalRow]) extends HashedRelation with Externalizable { - private def this() = this(null) // Needed for serialization + // Needed for serialization (it is public to make Java serialization work) + def this() = this(null) override def get(key: InternalRow): Seq[InternalRow] = { val v = hashTable.get(key) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
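The root cause is a java.io serialization rule: deserializing an Externalizable instance goes through its public no-arg constructor, so the private one compiled fine but failed at runtime with java.io.InvalidClassException ("no valid constructor") once these relations were actually Java-serialized for broadcast. A minimal sketch of the rule, using illustrative names:

```
import java.io.{Externalizable, ObjectInput, ObjectOutput}

class Payload(var value: Int) extends Externalizable {
  def this() = this(0) // must stay public for java.io deserialization to work

  override def writeExternal(out: ObjectOutput): Unit = out.writeInt(value)
  override def readExternal(in: ObjectInput): Unit = { value = in.readInt() }
}
```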
spark git commit: [SPARK-9855] [SPARKR] Add expression functions into SparkR whose params are simple
Repository: spark Updated Branches: refs/heads/master 0d1d146c2 - f4bc01f1f [SPARK-9855] [SPARKR] Add expression functions into SparkR whose params are simple I added lots of expression functions for SparkR. This PR includes only functions whose params are only `(Column)` or `(Column, Column)`. And I think we need to improve how to test those functions. However, it would be better to work on another issue. ## Diff Summary - Add lots of functions in `functions.R` and their generic in `generic.R` - Add aliases for `ceiling` and `sign` - Move expression functions from `column.R` to `functions.R` - Modify `rdname` from `column` to `functions` I haven't supported `not` function, because the name has a collesion with `testthat` package. I didn't think of the way to define it. ## New Supported Functions ``` approxCountDistinct ascii base64 bin bitwiseNOT ceil (alias: ceiling) crc32 dayofmonth dayofyear explode factorial hex hour initcap isNaN last_day length log2 ltrim md5 minute month negate quarter reverse round rtrim second sha1 signum (alias: sign) size soundex to_date trim unbase64 unhex weekofyear year datediff levenshtein months_between nanvl pmod ``` ## JIRA [[SPARK-9855] Add expression functions into SparkR whose params are simple - ASF JIRA](https://issues.apache.org/jira/browse/SPARK-9855) Author: Yu ISHIKAWA yuu.ishik...@gmail.com Closes #8123 from yu-iskw/SPARK-9855. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f4bc01f1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f4bc01f1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f4bc01f1 Branch: refs/heads/master Commit: f4bc01f1f33a93e6affe5c8a3e33ffbd92d03f38 Parents: 0d1d146 Author: Yu ISHIKAWA yuu.ishik...@gmail.com Authored: Wed Aug 12 18:33:27 2015 -0700 Committer: Shivaram Venkataraman shiva...@cs.berkeley.edu Committed: Wed Aug 12 18:33:27 2015 -0700 -- R/pkg/DESCRIPTION| 1 + R/pkg/R/column.R | 81 --- R/pkg/R/functions.R | 123 ++ R/pkg/R/generics.R | 185 +++--- R/pkg/inst/tests/test_sparkSQL.R | 21 ++-- 5 files changed, 309 insertions(+), 102 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f4bc01f1/R/pkg/DESCRIPTION -- diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 4949d86..83e6489 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -29,6 +29,7 @@ Collate: 'client.R' 'context.R' 'deserialize.R' +'functions.R' 'mllib.R' 'serialize.R' 'sparkR.R' http://git-wip-us.apache.org/repos/asf/spark/blob/f4bc01f1/R/pkg/R/column.R -- diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R index eeaf9f1..328f595 100644 --- a/R/pkg/R/column.R +++ b/R/pkg/R/column.R @@ -60,12 +60,6 @@ operators - list( ) column_functions1 - c(asc, desc, isNull, isNotNull) column_functions2 - c(like, rlike, startsWith, endsWith, getField, getItem, contains) -functions - c(min, max, sum, avg, mean, count, abs, sqrt, - first, last, lower, upper, sumDistinct, - acos, asin, atan, cbrt, ceiling, cos, cosh, exp, - expm1, floor, log, log10, log1p, rint, sign, - sin, sinh, tan, tanh, toDegrees, toRadians) -binary_mathfunctions - c(atan2, hypot) createOperator - function(op) { setMethod(op, @@ -111,33 +105,6 @@ createColumnFunction2 - function(name) { }) } -createStaticFunction - function(name) { - setMethod(name, -signature(x = Column), -function(x) { - if (name == ceiling) { - name - ceil - } - if (name == sign) { - name - signum - } - jc - callJStatic(org.apache.spark.sql.functions, name, x@jc) - column(jc) -}) -} - -createBinaryMathfunctions - 
function(name) { - setMethod(name, - signature(y = "Column"), - function(y, x) { - if (class(x) == "Column") { - x <- x@jc - } - jc <- callJStatic("org.apache.spark.sql.functions", name, y@jc, x) - column(jc) - }) -} - createMethods <- function() { for (op in names(operators)) { createOperator(op) } @@ -148,12 +115,6 @@ createMethods <- function() { for (name in column_functions2) { createColumnFunction2(name) } - for (x in functions) { - createStaticFunction(x) - } - for (name in binary_mathfunctions) { - createBinaryMathfunctions(name) - } } createMethods() @@ -242,45 +203,3 @@ setMethod("%in%", jc <-
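The new R wrappers are thin shims that delegate via `callJStatic` to the same-named methods on `org.apache.spark.sql.functions`, so their behavior can be sanity-checked from the JVM side. A minimal Scala sketch (the DataFrame `df` and its columns `s` and `d` are hypothetical, not part of the patch):

```
import org.apache.spark.sql.functions._

// Each new SparkR function maps one-to-one onto a method of the same name in
// org.apache.spark.sql.functions; these calls mirror the new R API above.
val strings = df.select(ascii(col("s")), base64(col("s")), md5(col("s")))
val dates = df.select(year(col("d")), month(col("d")), last_day(col("d")))
strings.show()
dates.show()
```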
spark git commit: [SPARK-9855] [SPARKR] Add expression functions into SparkR whose params are simple
Repository: spark Updated Branches: refs/heads/branch-1.5 62ab2a4c6 - ca39c9e91 [SPARK-9855] [SPARKR] Add expression functions into SparkR whose params are simple I added lots of expression functions for SparkR. This PR includes only functions whose params are only `(Column)` or `(Column, Column)`. I also think we need to improve how we test those functions, but it would be better to do that in a separate issue. ## Diff Summary - Add lots of functions in `functions.R` and their generics in `generics.R` - Add aliases for `ceiling` and `sign` - Move expression functions from `column.R` to `functions.R` - Modify `rdname` from `column` to `functions` I haven't supported the `not` function, because the name has a collision with the `testthat` package and I couldn't think of a good way to define it. ## New Supported Functions ``` approxCountDistinct ascii base64 bin bitwiseNOT ceil (alias: ceiling) crc32 dayofmonth dayofyear explode factorial hex hour initcap isNaN last_day length log2 ltrim md5 minute month negate quarter reverse round rtrim second sha1 signum (alias: sign) size soundex to_date trim unbase64 unhex weekofyear year datediff levenshtein months_between nanvl pmod ``` ## JIRA [[SPARK-9855] Add expression functions into SparkR whose params are simple - ASF JIRA](https://issues.apache.org/jira/browse/SPARK-9855) Author: Yu ISHIKAWA yuu.ishik...@gmail.com Closes #8123 from yu-iskw/SPARK-9855. (cherry picked from commit f4bc01f1f33a93e6affe5c8a3e33ffbd92d03f38) Signed-off-by: Shivaram Venkataraman shiva...@cs.berkeley.edu Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ca39c9e9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ca39c9e9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ca39c9e9 Branch: refs/heads/branch-1.5 Commit: ca39c9e91602223f5665ab6942b917c4900bd996 Parents: 62ab2a4 Author: Yu ISHIKAWA yuu.ishik...@gmail.com Authored: Wed Aug 12 18:33:27 2015 -0700 Committer: Shivaram Venkataraman shiva...@cs.berkeley.edu Committed: Wed Aug 12 18:33:35 2015 -0700 -- R/pkg/DESCRIPTION| 1 + R/pkg/R/column.R | 81 --- R/pkg/R/functions.R | 123 ++ R/pkg/R/generics.R | 185 +++--- R/pkg/inst/tests/test_sparkSQL.R | 21 ++-- 5 files changed, 309 insertions(+), 102 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ca39c9e9/R/pkg/DESCRIPTION -- diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 4949d86..83e6489 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -29,6 +29,7 @@ Collate: 'client.R' 'context.R' 'deserialize.R' +'functions.R' 'mllib.R' 'serialize.R' 'sparkR.R' http://git-wip-us.apache.org/repos/asf/spark/blob/ca39c9e9/R/pkg/R/column.R -- diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R index eeaf9f1..328f595 100644 --- a/R/pkg/R/column.R +++ b/R/pkg/R/column.R @@ -60,12 +60,6 @@ operators <- list( ) column_functions1 <- c("asc", "desc", "isNull", "isNotNull") column_functions2 <- c("like", "rlike", "startsWith", "endsWith", "getField", "getItem", "contains") -functions <- c("min", "max", "sum", "avg", "mean", "count", "abs", "sqrt", - "first", "last", "lower", "upper", "sumDistinct", - "acos", "asin", "atan", "cbrt", "ceiling", "cos", "cosh", "exp", - "expm1", "floor", "log", "log10", "log1p", "rint", "sign", - "sin", "sinh", "tan", "tanh", "toDegrees", "toRadians") -binary_mathfunctions <- c("atan2", "hypot") createOperator <- function(op) { setMethod(op, @@ -111,33 +105,6 @@ createColumnFunction2 <- function(name) { }) } -createStaticFunction <- function(name) { - setMethod(name, - signature(x = "Column"), - function(x) { - if (name == "ceiling") { - name <- "ceil" - } - if (name == "sign") { - name
<- "signum" - } - jc <- callJStatic("org.apache.spark.sql.functions", name, x@jc) - column(jc) - }) -} - -createBinaryMathfunctions <- function(name) { - setMethod(name, - signature(y = "Column"), - function(y, x) { - if (class(x) == "Column") { - x <- x@jc - } - jc <- callJStatic("org.apache.spark.sql.functions", name, y@jc, x) - column(jc) - }) -} - createMethods <- function() { for (op in names(operators)) { createOperator(op) } @@ -148,12 +115,6 @@ createMethods <- function() { for (name in column_functions2) { createColumnFunction2(name) } - for (x in functions) { - createStaticFunction(x) - } - for (name in
spark git commit: [SPARK-9832] [SQL] add a thread-safe lookup for BytesToBytesMap
Repository: spark Updated Branches: refs/heads/master 227821905 - a8ab2634c [SPARK-9832] [SQL] add a thread-safe lookup for BytesToBytesMap This patch adds a thread-safe lookup to BytesToBytesMap, and uses it in the broadcast HashedRelation. Author: Davies Liu dav...@databricks.com Closes #8151 from davies/safeLookup. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a8ab2634 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a8ab2634 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a8ab2634 Branch: refs/heads/master Commit: a8ab2634c1eee143a4deaf309204df8add727f9e Parents: 2278219 Author: Davies Liu dav...@databricks.com Authored: Wed Aug 12 21:26:00 2015 -0700 Committer: Reynold Xin r...@databricks.com Committed: Wed Aug 12 21:26:00 2015 -0700 -- .../spark/unsafe/map/BytesToBytesMap.java | 30 ++-- .../sql/execution/joins/HashedRelation.scala| 6 ++-- 2 files changed, 26 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a8ab2634/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java -- diff --git a/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java b/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java index 87ed47e..5f3a4fc 100644 --- a/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java +++ b/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java @@ -17,25 +17,24 @@ package org.apache.spark.unsafe.map; -import java.lang.Override; -import java.lang.UnsupportedOperationException; +import javax.annotation.Nullable; import java.util.Iterator; import java.util.LinkedList; import java.util.List; -import javax.annotation.Nullable; - import com.google.common.annotations.VisibleForTesting; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.spark.shuffle.ShuffleMemoryManager; -import org.apache.spark.unsafe.*; +import org.apache.spark.unsafe.Platform; import org.apache.spark.unsafe.array.ByteArrayMethods; import org.apache.spark.unsafe.array.LongArray; import org.apache.spark.unsafe.bitset.BitSet; import org.apache.spark.unsafe.hash.Murmur3_x86_32; -import org.apache.spark.unsafe.memory.*; +import org.apache.spark.unsafe.memory.MemoryBlock; +import org.apache.spark.unsafe.memory.MemoryLocation; +import org.apache.spark.unsafe.memory.TaskMemoryManager; /** * An append-only hash map where keys and values are contiguous regions of bytes. @@ -328,6 +327,20 @@ public final class BytesToBytesMap { Object keyBaseObject, long keyBaseOffset, int keyRowLengthBytes) { +safeLookup(keyBaseObject, keyBaseOffset, keyRowLengthBytes, loc); +return loc; + } + + /** + * Looks up a key and saves the result in the provided `loc`. + * + * This is a thread-safe version of `lookup`; it can be used by multiple threads. + */ + public void safeLookup( + Object keyBaseObject, + long keyBaseOffset, + int keyRowLengthBytes, + Location loc) { assert(bitset != null); assert(longArray != null); @@ -343,7 +356,8 @@ public final class BytesToBytesMap { } if (!bitset.isSet(pos)) { // This is a new key.
-return loc.with(pos, hashcode, false); +loc.with(pos, hashcode, false); +return; } else { long stored = longArray.get(pos * 2 + 1); if ((int) (stored) == hashcode) { @@ -361,7 +375,7 @@ public final class BytesToBytesMap { keyRowLengthBytes ); if (areEqual) { - return loc; + return; } else { if (enablePerfMetrics) { numHashCollisions++; http://git-wip-us.apache.org/repos/asf/spark/blob/a8ab2634/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala index bb333b4..ea02076 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala @@ -215,8 +215,10 @@ private[joins] final class UnsafeHashedRelation( if (binaryMap != null) { // Used in Broadcast join - val loc = binaryMap.lookup(unsafeKey.getBaseObject, unsafeKey.getBaseOffset, -unsafeKey.getSizeInBytes) + val map = binaryMap // avoid the compiler error + val loc = new map.Location // this could be allocated in stack +
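The HashedRelation hunk above shows the intended calling convention for the new method: each reader thread allocates its own `Location` and passes it to `safeLookup`, so no mutable lookup state is shared across threads. A rough Scala sketch under the same assumptions as that hunk (`binaryMap` is a populated BytesToBytesMap and `key` an UnsafeRow; neither is defined here, so this is not a runnable test):

```
// Sketch only: binaryMap and key are assumed from the surrounding
// UnsafeHashedRelation code.
val map = binaryMap        // pin the path-dependent type for `new map.Location`
val loc = new map.Location // per-thread result holder, cheap to allocate
map.safeLookup(key.getBaseObject, key.getBaseOffset, key.getSizeInBytes, loc)
if (loc.isDefined) {
  // Only this thread ever writes to this Location, so reading the matched
  // value region cannot race other lookups on the same broadcast map.
  val valueAddress = loc.getValueAddress
  val valueLength = loc.getValueLength
}
```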
spark git commit: [SPARK-9832] [SQL] add a thread-safe lookup for BytesToBytesMap
Repository: spark Updated Branches: refs/heads/branch-1.5 3b1b8ea3e - 8229437c3 [SPARK-9832] [SQL] add a thread-safe lookup for BytesToBytesMap This patch adds a thread-safe lookup to BytesToBytesMap, and uses it in the broadcast HashedRelation. Author: Davies Liu dav...@databricks.com Closes #8151 from davies/safeLookup. (cherry picked from commit a8ab2634c1eee143a4deaf309204df8add727f9e) Signed-off-by: Reynold Xin r...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8229437c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8229437c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8229437c Branch: refs/heads/branch-1.5 Commit: 8229437c31db3a059b48ba26633d9f038cac74b8 Parents: 3b1b8ea Author: Davies Liu dav...@databricks.com Authored: Wed Aug 12 21:26:00 2015 -0700 Committer: Reynold Xin r...@databricks.com Committed: Wed Aug 12 21:26:08 2015 -0700 -- .../spark/unsafe/map/BytesToBytesMap.java | 30 ++-- .../sql/execution/joins/HashedRelation.scala| 6 ++-- 2 files changed, 26 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8229437c/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java -- diff --git a/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java b/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java index 87ed47e..5f3a4fc 100644 --- a/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java +++ b/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java @@ -17,25 +17,24 @@ package org.apache.spark.unsafe.map; -import java.lang.Override; -import java.lang.UnsupportedOperationException; +import javax.annotation.Nullable; import java.util.Iterator; import java.util.LinkedList; import java.util.List; -import javax.annotation.Nullable; - import com.google.common.annotations.VisibleForTesting; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.spark.shuffle.ShuffleMemoryManager; -import org.apache.spark.unsafe.*; +import org.apache.spark.unsafe.Platform; import org.apache.spark.unsafe.array.ByteArrayMethods; import org.apache.spark.unsafe.array.LongArray; import org.apache.spark.unsafe.bitset.BitSet; import org.apache.spark.unsafe.hash.Murmur3_x86_32; -import org.apache.spark.unsafe.memory.*; +import org.apache.spark.unsafe.memory.MemoryBlock; +import org.apache.spark.unsafe.memory.MemoryLocation; +import org.apache.spark.unsafe.memory.TaskMemoryManager; /** * An append-only hash map where keys and values are contiguous regions of bytes. @@ -328,6 +327,20 @@ public final class BytesToBytesMap { Object keyBaseObject, long keyBaseOffset, int keyRowLengthBytes) { +safeLookup(keyBaseObject, keyBaseOffset, keyRowLengthBytes, loc); +return loc; + } + + /** + * Looks up a key and saves the result in the provided `loc`. + * + * This is a thread-safe version of `lookup`; it can be used by multiple threads. + */ + public void safeLookup( + Object keyBaseObject, + long keyBaseOffset, + int keyRowLengthBytes, + Location loc) { assert(bitset != null); assert(longArray != null); @@ -343,7 +356,8 @@ public final class BytesToBytesMap { } if (!bitset.isSet(pos)) { // This is a new key.
-return loc.with(pos, hashcode, false); +loc.with(pos, hashcode, false); +return; } else { long stored = longArray.get(pos * 2 + 1); if ((int) (stored) == hashcode) { @@ -361,7 +375,7 @@ public final class BytesToBytesMap { keyRowLengthBytes ); if (areEqual) { - return loc; + return; } else { if (enablePerfMetrics) { numHashCollisions++; http://git-wip-us.apache.org/repos/asf/spark/blob/8229437c/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala index bb333b4..ea02076 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala @@ -215,8 +215,10 @@ private[joins] final class UnsafeHashedRelation( if (binaryMap != null) { // Used in Broadcast join - val loc = binaryMap.lookup(unsafeKey.getBaseObject, unsafeKey.getBaseOffset, -unsafeKey.getSizeInBytes) + val map =
spark git commit: [SPARK-9927] [SQL] Revert 8049 since it's pushing wrong filter down
Repository: spark Updated Branches: refs/heads/master d7eb371eb - d0b18919d [SPARK-9927] [SQL] Revert 8049 since it's pushing wrong filter down I made a mistake in #8049 by casting literal value to attribute's data type, which would cause simply truncate the literal value and push a wrong filter down. JIRA: https://issues.apache.org/jira/browse/SPARK-9927 Author: Yijie Shen henry.yijies...@gmail.com Closes #8157 from yjshen/rever8049. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d0b18919 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d0b18919 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d0b18919 Branch: refs/heads/master Commit: d0b18919d16e6a2f19159516bd2767b60b595279 Parents: d7eb371 Author: Yijie Shen henry.yijies...@gmail.com Authored: Thu Aug 13 13:33:39 2015 +0800 Committer: Cheng Lian l...@databricks.com Committed: Thu Aug 13 13:33:39 2015 +0800 -- .../datasources/DataSourceStrategy.scala| 30 ++--- .../execution/datasources/jdbc/JDBCRDD.scala| 2 +- .../org/apache/spark/sql/jdbc/JDBCSuite.scala | 35 3 files changed, 3 insertions(+), 64 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d0b18919/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala index 9eea2b0..2a4c40d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala @@ -20,13 +20,13 @@ package org.apache.spark.sql.execution.datasources import org.apache.spark.{Logging, TaskContext} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.rdd.{MapPartitionsRDD, RDD, UnionRDD} -import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow, expressions} +import org.apache.spark.sql.catalyst.{InternalRow, expressions} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.planning.PhysicalOperation import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.sources._ -import org.apache.spark.sql.types.{TimestampType, DateType, StringType, StructType} +import org.apache.spark.sql.types.{StringType, StructType} import org.apache.spark.sql.{SaveMode, Strategy, execution, sources, _} import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.util.{SerializableConfiguration, Utils} @@ -343,17 +343,11 @@ private[sql] object DataSourceStrategy extends Strategy with Logging { * and convert them. 
*/ protected[sql] def selectFilters(filters: Seq[Expression]) = { -import CatalystTypeConverters._ - def translate(predicate: Expression): Option[Filter] = predicate match { case expressions.EqualTo(a: Attribute, Literal(v, _)) = Some(sources.EqualTo(a.name, v)) case expressions.EqualTo(Literal(v, _), a: Attribute) = Some(sources.EqualTo(a.name, v)) - case expressions.EqualTo(Cast(a: Attribute, _), l: Literal) = -Some(sources.EqualTo(a.name, convertToScala(Cast(l, a.dataType).eval(), a.dataType))) - case expressions.EqualTo(l: Literal, Cast(a: Attribute, _)) = -Some(sources.EqualTo(a.name, convertToScala(Cast(l, a.dataType).eval(), a.dataType))) case expressions.EqualNullSafe(a: Attribute, Literal(v, _)) = Some(sources.EqualNullSafe(a.name, v)) @@ -364,41 +358,21 @@ private[sql] object DataSourceStrategy extends Strategy with Logging { Some(sources.GreaterThan(a.name, v)) case expressions.GreaterThan(Literal(v, _), a: Attribute) = Some(sources.LessThan(a.name, v)) - case expressions.GreaterThan(Cast(a: Attribute, _), l: Literal) = -Some(sources.GreaterThan(a.name, convertToScala(Cast(l, a.dataType).eval(), a.dataType))) - case expressions.GreaterThan(l: Literal, Cast(a: Attribute, _)) = -Some(sources.LessThan(a.name, convertToScala(Cast(l, a.dataType).eval(), a.dataType))) case expressions.LessThan(a: Attribute, Literal(v, _)) = Some(sources.LessThan(a.name, v)) case expressions.LessThan(Literal(v, _), a: Attribute) = Some(sources.GreaterThan(a.name, v)) - case expressions.LessThan(Cast(a: Attribute, _), l: Literal) = -Some(sources.LessThan(a.name, convertToScala(Cast(l, a.dataType).eval(), a.dataType))) - case expressions.LessThan(l:
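The bug in the reverted rule is easiest to see with a narrowing cast. An illustrative snippet (hypothetical column and values, exercising the Catalyst expressions directly):

```
import org.apache.spark.sql.catalyst.expressions.{Cast, Literal}
import org.apache.spark.sql.types.ByteType

// For a predicate like `CAST(byteCol AS INT) = 300`, the removed cases computed
// Cast(l, a.dataType).eval() and pushed the result down. Casting 300 to a Byte
// silently truncates it to 44, so a pushed filter EqualTo("byteCol", 44) would
// match rows the original predicate never could.
val truncated = Cast(Literal(300), ByteType).eval() // 44, not 300
println(truncated)
```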
spark git commit: [SPARK-9903] [MLLIB] skip local processing in PrefixSpan if there are no small prefixes
Repository: spark Updated Branches: refs/heads/branch-1.5 a06860c2f - af470a757 [SPARK-9903] [MLLIB] skip local processing in PrefixSpan if there are no small prefixes There exists a chance that the prefixes keep growing to the maximum pattern length. Then the final local processing step becomes unnecessary. feynmanliang Author: Xiangrui Meng m...@databricks.com Closes #8136 from mengxr/SPARK-9903. (cherry picked from commit d7053bea985679c514b3add029631ea23e1730ce) Signed-off-by: Xiangrui Meng m...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/af470a75 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/af470a75 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/af470a75 Branch: refs/heads/branch-1.5 Commit: af470a757c7aed81d626634590a0fb395f0241f5 Parents: a06860c Author: Xiangrui Meng m...@databricks.com Authored: Wed Aug 12 20:44:40 2015 -0700 Committer: Xiangrui Meng m...@databricks.com Committed: Wed Aug 12 20:44:49 2015 -0700 -- .../org/apache/spark/mllib/fpm/PrefixSpan.scala | 37 +++- 1 file changed, 21 insertions(+), 16 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/af470a75/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala index ad6715b5..dc4ae1d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala @@ -282,25 +282,30 @@ object PrefixSpan extends Logging { largePrefixes = newLargePrefixes } -// Switch to local processing. -val bcSmallPrefixes = sc.broadcast(smallPrefixes) -val distributedFreqPattern = postfixes.flatMap { postfix = - bcSmallPrefixes.value.values.map { prefix = -(prefix.id, postfix.project(prefix).compressed) - }.filter(_._2.nonEmpty) -}.groupByKey().flatMap { case (id, projPostfixes) = - val prefix = bcSmallPrefixes.value(id) - val localPrefixSpan = new LocalPrefixSpan(minCount, maxPatternLength - prefix.length) - // TODO: We collect projected postfixes into memory. We should also compare the performance - // TODO: of keeping them on shuffle files. - localPrefixSpan.run(projPostfixes.toArray).map { case (pattern, count) = -(prefix.items ++ pattern, count) +var freqPatterns = sc.parallelize(localFreqPatterns, 1) + +val numSmallPrefixes = smallPrefixes.size +logInfo(snumber of small prefixes for local processing: $numSmallPrefixes) +if (numSmallPrefixes 0) { + // Switch to local processing. + val bcSmallPrefixes = sc.broadcast(smallPrefixes) + val distributedFreqPattern = postfixes.flatMap { postfix = +bcSmallPrefixes.value.values.map { prefix = + (prefix.id, postfix.project(prefix).compressed) +}.filter(_._2.nonEmpty) + }.groupByKey().flatMap { case (id, projPostfixes) = +val prefix = bcSmallPrefixes.value(id) +val localPrefixSpan = new LocalPrefixSpan(minCount, maxPatternLength - prefix.length) +// TODO: We collect projected postfixes into memory. We should also compare the performance +// TODO: of keeping them on shuffle files. +localPrefixSpan.run(projPostfixes.toArray).map { case (pattern, count) = + (prefix.items ++ pattern, count) +} } + // Union local frequent patterns and distributed ones. + freqPatterns = freqPatterns ++ distributedFreqPattern } -// Union local frequent patterns and distributed ones. 
-val freqPatterns = (sc.parallelize(localFreqPatterns, 1) ++ distributedFreqPattern) - .persist(StorageLevel.MEMORY_AND_DISK) freqPatterns } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
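For context, a minimal run of the public API (toy data; `sc` is an existing SparkContext). With a low `maxPatternLength`, prefixes can hit the cap while still "large", leaving `smallPrefixes` empty, which is exactly the case where the new guard skips the broadcast and the local step:

```
import org.apache.spark.mllib.fpm.PrefixSpan

val sequences = sc.parallelize(Seq(
  Array(Array(1, 2), Array(3)),
  Array(Array(1), Array(3, 2), Array(1, 2)),
  Array(Array(1, 2), Array(5)),
  Array(Array(6))
), 2).cache()

val model = new PrefixSpan()
  .setMinSupport(0.5)
  .setMaxPatternLength(2) // low cap: prefixes may reach it before becoming small
  .run(sequences)

model.freqSequences.collect().foreach { fs =>
  println(fs.sequence.map(_.mkString("[", ",", "]")).mkString("[", ",", "]") + ", " + fs.freq)
}
```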
spark git commit: [SPARK-9920] [SQL] The simpleString of TungstenAggregate does not show its output
Repository: spark Updated Branches: refs/heads/master 2fb4901b7 - 227821905 [SPARK-9920] [SQL] The simpleString of TungstenAggregate does not show its output https://issues.apache.org/jira/browse/SPARK-9920 Taking `sqlContext.sql(select i, sum(j1) as sum from testAgg group by i).explain()` as an example, the output of our current master is ``` == Physical Plan == TungstenAggregate(key=[i#0], value=[(sum(cast(j1#1 as bigint)),mode=Final,isDistinct=false)] TungstenExchange hashpartitioning(i#0) TungstenAggregate(key=[i#0], value=[(sum(cast(j1#1 as bigint)),mode=Partial,isDistinct=false)] Scan ParquetRelation[file:/user/hive/warehouse/testagg][i#0,j1#1] ``` With this PR, the output will be ``` == Physical Plan == TungstenAggregate(key=[i#0], functions=[(sum(cast(j1#1 as bigint)),mode=Final,isDistinct=false)], output=[i#0,sum#18L]) TungstenExchange hashpartitioning(i#0) TungstenAggregate(key=[i#0], functions=[(sum(cast(j1#1 as bigint)),mode=Partial,isDistinct=false)], output=[i#0,currentSum#22L]) Scan ParquetRelation[file:/user/hive/warehouse/testagg][i#0,j1#1] ``` Author: Yin Huai yh...@databricks.com Closes #8150 from yhuai/SPARK-9920. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/22782190 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/22782190 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/22782190 Branch: refs/heads/master Commit: 2278219054314f1d31ffc358a59aa5067f9f5de9 Parents: 2fb4901 Author: Yin Huai yh...@databricks.com Authored: Wed Aug 12 21:24:15 2015 -0700 Committer: Reynold Xin r...@databricks.com Committed: Wed Aug 12 21:24:15 2015 -0700 -- .../spark/sql/execution/aggregate/SortBasedAggregate.scala| 6 +- .../spark/sql/execution/aggregate/TungstenAggregate.scala | 7 --- 2 files changed, 9 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/22782190/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregate.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregate.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregate.scala index ab26f9c..f4c14a9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregate.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregate.scala @@ -108,6 +108,10 @@ case class SortBasedAggregate( override def simpleString: String = { val allAggregateExpressions = nonCompleteAggregateExpressions ++ completeAggregateExpressions -sSortBasedAggregate ${groupingExpressions} ${allAggregateExpressions} + +val keyString = groupingExpressions.mkString([, ,, ]) +val functionString = allAggregateExpressions.mkString([, ,, ]) +val outputString = output.mkString([, ,, ]) +sSortBasedAggregate(key=$keyString, functions=$functionString, output=$outputString) } } http://git-wip-us.apache.org/repos/asf/spark/blob/22782190/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregate.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregate.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregate.scala index c40ca97..99f51ba 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregate.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregate.scala @@ -127,11 +127,12 @@ case class TungstenAggregate( 
testFallbackStartsAt match { case None = val keyString = groupingExpressions.mkString([, ,, ]) -val valueString = allAggregateExpressions.mkString([, ,, ]) -sTungstenAggregate(key=$keyString, value=$valueString +val functionString = allAggregateExpressions.mkString([, ,, ]) +val outputString = output.mkString([, ,, ]) +sTungstenAggregate(key=$keyString, functions=$functionString, output=$outputString) case Some(fallbackStartsAt) = sTungstenAggregateWithControlledFallback $groupingExpressions + - s$allAggregateExpressions fallbackStartsAt=$fallbackStartsAt + s$allAggregateExpressions $resultExpressions fallbackStartsAt=$fallbackStartsAt } } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
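Note that the hunks above lost their double quotes in transit; reconstructed, the patched `simpleString` reads roughly as follows (a sketch of the new method body, not a verbatim copy of the file):

```
override def simpleString: String = {
  val allAggregateExpressions = nonCompleteAggregateExpressions ++ completeAggregateExpressions
  val keyString = groupingExpressions.mkString("[", ",", "]")
  val functionString = allAggregateExpressions.mkString("[", ",", "]")
  val outputString = output.mkString("[", ",", "]")
  s"TungstenAggregate(key=$keyString, functions=$functionString, output=$outputString)"
}
```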
spark git commit: [SPARK-9917] [ML] add getMin/getMax and doc for originalMin/originalMax in MinMaxScaler
Repository: spark Updated Branches: refs/heads/branch-1.5 8229437c3 - 16f4bf4ca [SPARK-9917] [ML] add getMin/getMax and doc for originalMin/originalMax in MinMaxScaler hhbyyh Author: Xiangrui Meng m...@databricks.com Closes #8145 from mengxr/SPARK-9917. (cherry picked from commit 5fc058a1fc5d83ad53feec936475484aef3800b3) Signed-off-by: Xiangrui Meng m...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/16f4bf4c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/16f4bf4c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/16f4bf4c Branch: refs/heads/branch-1.5 Commit: 16f4bf4caa9c6a1403252485470466266d6b1b65 Parents: 8229437 Author: Xiangrui Meng m...@databricks.com Authored: Wed Aug 12 21:33:38 2015 -0700 Committer: Xiangrui Meng m...@databricks.com Committed: Wed Aug 12 21:33:46 2015 -0700 -- .../scala/org/apache/spark/ml/feature/MinMaxScaler.scala | 10 +- 1 file changed, 9 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/16f4bf4c/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala index b30adf3..9a473dd 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala @@ -41,6 +41,9 @@ private[feature] trait MinMaxScalerParams extends Params with HasInputCol with HasOutputCol val min: DoubleParam = new DoubleParam(this, "min", "lower bound of the output feature range") + /** @group getParam */ + def getMin: Double = $(min) + /** * upper bound after transformation, shared by all features * Default: 1.0 @@ -49,6 +52,9 @@ private[feature] trait MinMaxScalerParams extends Params with HasInputCol with HasOutputCol val max: DoubleParam = new DoubleParam(this, "max", "upper bound of the output feature range") + /** @group getParam */ + def getMax: Double = $(max) + /** Validates and transforms the input schema. */ protected def validateAndTransformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType @@ -115,6 +121,9 @@ class MinMaxScaler(override val uid: String) * :: Experimental :: * Model fitted by [[MinMaxScaler]]. * + * @param originalMin min value for each original column during fitting + * @param originalMax max value for each original column during fitting + * * TODO: The transformer does not yet set the metadata in the output column (SPARK-8529). */ @Experimental @@ -136,7 +145,6 @@ class MinMaxScalerModel private[ml] ( /** @group setParam */ def setMax(value: Double): this.type = set(max, value) - override def transform(dataset: DataFrame): DataFrame = { val originalRange = (originalMax.toBreeze - originalMin.toBreeze).toArray val minArray = originalMin.toArray - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
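A short usage sketch of the new getters (the DataFrame `df` with a Vector column named "features" is assumed):

```
import org.apache.spark.ml.feature.MinMaxScaler

val scaler = new MinMaxScaler()
  .setInputCol("features")
  .setOutputCol("scaledFeatures")
  .setMin(-1.0)
  .setMax(1.0)

// The new getters expose the configured output range.
println(s"output range: [${scaler.getMin}, ${scaler.getMax}]")

val model = scaler.fit(df)
// The newly documented originalMin/originalMax hold the per-column extrema
// observed during fitting.
println(s"fitted extrema: ${model.originalMin} .. ${model.originalMax}")
```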
spark git commit: [SPARK-9917] [ML] add getMin/getMax and doc for originalMin/originalMax in MinMaxScaler
Repository: spark Updated Branches: refs/heads/master a8ab2634c - 5fc058a1f [SPARK-9917] [ML] add getMin/getMax and doc for originalMin/originalMax in MinMaxScaler hhbyyh Author: Xiangrui Meng m...@databricks.com Closes #8145 from mengxr/SPARK-9917. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5fc058a1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5fc058a1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5fc058a1 Branch: refs/heads/master Commit: 5fc058a1fc5d83ad53feec936475484aef3800b3 Parents: a8ab263 Author: Xiangrui Meng m...@databricks.com Authored: Wed Aug 12 21:33:38 2015 -0700 Committer: Xiangrui Meng m...@databricks.com Committed: Wed Aug 12 21:33:38 2015 -0700 -- .../scala/org/apache/spark/ml/feature/MinMaxScaler.scala | 10 +- 1 file changed, 9 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5fc058a1/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala index b30adf3..9a473dd 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala @@ -41,6 +41,9 @@ private[feature] trait MinMaxScalerParams extends Params with HasInputCol with HasOutputCol val min: DoubleParam = new DoubleParam(this, "min", "lower bound of the output feature range") + /** @group getParam */ + def getMin: Double = $(min) + /** * upper bound after transformation, shared by all features * Default: 1.0 @@ -49,6 +52,9 @@ private[feature] trait MinMaxScalerParams extends Params with HasInputCol with HasOutputCol val max: DoubleParam = new DoubleParam(this, "max", "upper bound of the output feature range") + /** @group getParam */ + def getMax: Double = $(max) + /** Validates and transforms the input schema. */ protected def validateAndTransformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType @@ -115,6 +121,9 @@ class MinMaxScaler(override val uid: String) * :: Experimental :: * Model fitted by [[MinMaxScaler]]. * + * @param originalMin min value for each original column during fitting + * @param originalMax max value for each original column during fitting + * * TODO: The transformer does not yet set the metadata in the output column (SPARK-8529). */ @Experimental @@ -136,7 +145,6 @@ class MinMaxScalerModel private[ml] ( /** @group setParam */ def setMax(value: Double): this.type = set(max, value) - override def transform(dataset: DataFrame): DataFrame = { val originalRange = (originalMax.toBreeze - originalMin.toBreeze).toArray val minArray = originalMin.toArray - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-8922] [DOCUMENTATION, MLLIB] Add @since tags to mllib.evaluation
Repository: spark Updated Branches: refs/heads/master 5fc058a1f - df5438921 [SPARK-8922] [DOCUMENTATION, MLLIB] Add @since tags to mllib.evaluation Author: shikai.tang tar.sk...@gmail.com Closes #7429 from mosessky/master. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/df543892 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/df543892 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/df543892 Branch: refs/heads/master Commit: df543892122342b97e5137b266959ba97589b3ef Parents: 5fc058a Author: shikai.tang tar.sk...@gmail.com Authored: Wed Aug 12 21:53:15 2015 -0700 Committer: Xiangrui Meng m...@databricks.com Committed: Wed Aug 12 21:53:15 2015 -0700 -- .../BinaryClassificationMetrics.scala | 32 +--- .../mllib/evaluation/MulticlassMetrics.scala| 9 ++ .../mllib/evaluation/MultilabelMetrics.scala| 4 +++ .../spark/mllib/evaluation/RankingMetrics.scala | 4 +++ .../mllib/evaluation/RegressionMetrics.scala| 6 5 files changed, 50 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/df543892/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala index c1d1a22..486741e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala @@ -41,6 +41,7 @@ import org.apache.spark.sql.DataFrame *of bins may not exactly equal numBins. The last bin in each partition may *be smaller as a result, meaning there may be an extra sample at *partition boundaries. + * @since 1.3.0 */ @Experimental class BinaryClassificationMetrics( @@ -51,6 +52,7 @@ class BinaryClassificationMetrics( /** * Defaults `numBins` to 0. + * @since 1.0.0 */ def this(scoreAndLabels: RDD[(Double, Double)]) = this(scoreAndLabels, 0) @@ -61,12 +63,18 @@ class BinaryClassificationMetrics( private[mllib] def this(scoreAndLabels: DataFrame) = this(scoreAndLabels.map(r = (r.getDouble(0), r.getDouble(1 - /** Unpersist intermediate RDDs used in the computation. */ + /** + * Unpersist intermediate RDDs used in the computation. + * @since 1.0.0 + */ def unpersist() { cumulativeCounts.unpersist() } - /** Returns thresholds in descending order. */ + /** + * Returns thresholds in descending order. + * @since 1.0.0 + */ def thresholds(): RDD[Double] = cumulativeCounts.map(_._1) /** @@ -74,6 +82,7 @@ class BinaryClassificationMetrics( * which is an RDD of (false positive rate, true positive rate) * with (0.0, 0.0) prepended and (1.0, 1.0) appended to it. * @see http://en.wikipedia.org/wiki/Receiver_operating_characteristic + * @since 1.0.0 */ def roc(): RDD[(Double, Double)] = { val rocCurve = createCurve(FalsePositiveRate, Recall) @@ -85,6 +94,7 @@ class BinaryClassificationMetrics( /** * Computes the area under the receiver operating characteristic (ROC) curve. + * @since 1.0.0 */ def areaUnderROC(): Double = AreaUnderCurve.of(roc()) @@ -92,6 +102,7 @@ class BinaryClassificationMetrics( * Returns the precision-recall curve, which is an RDD of (recall, precision), * NOT (precision, recall), with (0.0, 1.0) prepended to it. 
* @see http://en.wikipedia.org/wiki/Precision_and_recall + * @since 1.0.0 */ def pr(): RDD[(Double, Double)] = { val prCurve = createCurve(Recall, Precision) @@ -102,6 +113,7 @@ class BinaryClassificationMetrics( /** * Computes the area under the precision-recall curve. + * @since 1.0.0 */ def areaUnderPR(): Double = AreaUnderCurve.of(pr()) @@ -110,16 +122,26 @@ class BinaryClassificationMetrics( * @param beta the beta factor in F-Measure computation. * @return an RDD of (threshold, F-Measure) pairs. * @see http://en.wikipedia.org/wiki/F1_score + * @since 1.0.0 */ def fMeasureByThreshold(beta: Double): RDD[(Double, Double)] = createCurve(FMeasure(beta)) - /** Returns the (threshold, F-Measure) curve with beta = 1.0. */ + /** + * Returns the (threshold, F-Measure) curve with beta = 1.0. + * @since 1.0.0 + */ def fMeasureByThreshold(): RDD[(Double, Double)] = fMeasureByThreshold(1.0) - /** Returns the (threshold, precision) curve. */ + /** + * Returns the (threshold, precision) curve. + * @since 1.0.0 + */ def precisionByThreshold(): RDD[(Double,
spark git commit: [SPARK-8922] [DOCUMENTATION, MLLIB] Add @since tags to mllib.evaluation
Repository: spark Updated Branches: refs/heads/branch-1.5 8f055e595 - 690284037 [SPARK-8922] [DOCUMENTATION, MLLIB] Add @since tags to mllib.evaluation Author: shikai.tang tar.sk...@gmail.com Closes #7429 from mosessky/master. (cherry picked from commit df543892122342b97e5137b266959ba97589b3ef) Signed-off-by: Xiangrui Meng m...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/69028403 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/69028403 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/69028403 Branch: refs/heads/branch-1.5 Commit: 690284037ecd880d48d5e835b150a2f31feb7c73 Parents: 8f055e5 Author: shikai.tang tar.sk...@gmail.com Authored: Wed Aug 12 21:53:15 2015 -0700 Committer: Xiangrui Meng m...@databricks.com Committed: Wed Aug 12 21:53:24 2015 -0700 -- .../BinaryClassificationMetrics.scala | 32 +--- .../mllib/evaluation/MulticlassMetrics.scala| 9 ++ .../mllib/evaluation/MultilabelMetrics.scala| 4 +++ .../spark/mllib/evaluation/RankingMetrics.scala | 4 +++ .../mllib/evaluation/RegressionMetrics.scala| 6 5 files changed, 50 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/69028403/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala index c1d1a22..486741e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala @@ -41,6 +41,7 @@ import org.apache.spark.sql.DataFrame *of bins may not exactly equal numBins. The last bin in each partition may *be smaller as a result, meaning there may be an extra sample at *partition boundaries. + * @since 1.3.0 */ @Experimental class BinaryClassificationMetrics( @@ -51,6 +52,7 @@ class BinaryClassificationMetrics( /** * Defaults `numBins` to 0. + * @since 1.0.0 */ def this(scoreAndLabels: RDD[(Double, Double)]) = this(scoreAndLabels, 0) @@ -61,12 +63,18 @@ class BinaryClassificationMetrics( private[mllib] def this(scoreAndLabels: DataFrame) = this(scoreAndLabels.map(r = (r.getDouble(0), r.getDouble(1 - /** Unpersist intermediate RDDs used in the computation. */ + /** + * Unpersist intermediate RDDs used in the computation. + * @since 1.0.0 + */ def unpersist() { cumulativeCounts.unpersist() } - /** Returns thresholds in descending order. */ + /** + * Returns thresholds in descending order. + * @since 1.0.0 + */ def thresholds(): RDD[Double] = cumulativeCounts.map(_._1) /** @@ -74,6 +82,7 @@ class BinaryClassificationMetrics( * which is an RDD of (false positive rate, true positive rate) * with (0.0, 0.0) prepended and (1.0, 1.0) appended to it. * @see http://en.wikipedia.org/wiki/Receiver_operating_characteristic + * @since 1.0.0 */ def roc(): RDD[(Double, Double)] = { val rocCurve = createCurve(FalsePositiveRate, Recall) @@ -85,6 +94,7 @@ class BinaryClassificationMetrics( /** * Computes the area under the receiver operating characteristic (ROC) curve. + * @since 1.0.0 */ def areaUnderROC(): Double = AreaUnderCurve.of(roc()) @@ -92,6 +102,7 @@ class BinaryClassificationMetrics( * Returns the precision-recall curve, which is an RDD of (recall, precision), * NOT (precision, recall), with (0.0, 1.0) prepended to it. 
* @see http://en.wikipedia.org/wiki/Precision_and_recall + * @since 1.0.0 */ def pr(): RDD[(Double, Double)] = { val prCurve = createCurve(Recall, Precision) @@ -102,6 +113,7 @@ class BinaryClassificationMetrics( /** * Computes the area under the precision-recall curve. + * @since 1.0.0 */ def areaUnderPR(): Double = AreaUnderCurve.of(pr()) @@ -110,16 +122,26 @@ class BinaryClassificationMetrics( * @param beta the beta factor in F-Measure computation. * @return an RDD of (threshold, F-Measure) pairs. * @see http://en.wikipedia.org/wiki/F1_score + * @since 1.0.0 */ def fMeasureByThreshold(beta: Double): RDD[(Double, Double)] = createCurve(FMeasure(beta)) - /** Returns the (threshold, F-Measure) curve with beta = 1.0. */ + /** + * Returns the (threshold, F-Measure) curve with beta = 1.0. + * @since 1.0.0 + */ def fMeasureByThreshold(): RDD[(Double, Double)] = fMeasureByThreshold(1.0) - /** Returns the (threshold, precision) curve. */
spark git commit: [SPARK-9903] [MLLIB] skip local processing in PrefixSpan if there are no small prefixes
Repository: spark Updated Branches: refs/heads/master d2d5e7fe2 - d7053bea9 [SPARK-9903] [MLLIB] skip local processing in PrefixSpan if there are no small prefixes There exists a chance that the prefixes keep growing to the maximum pattern length. Then the final local processing step becomes unnecessary. feynmanliang Author: Xiangrui Meng m...@databricks.com Closes #8136 from mengxr/SPARK-9903. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d7053bea Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d7053bea Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d7053bea Branch: refs/heads/master Commit: d7053bea985679c514b3add029631ea23e1730ce Parents: d2d5e7f Author: Xiangrui Meng m...@databricks.com Authored: Wed Aug 12 20:44:40 2015 -0700 Committer: Xiangrui Meng m...@databricks.com Committed: Wed Aug 12 20:44:40 2015 -0700 -- .../org/apache/spark/mllib/fpm/PrefixSpan.scala | 37 +++- 1 file changed, 21 insertions(+), 16 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d7053bea/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala index ad6715b5..dc4ae1d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala @@ -282,25 +282,30 @@ object PrefixSpan extends Logging { largePrefixes = newLargePrefixes } -// Switch to local processing. -val bcSmallPrefixes = sc.broadcast(smallPrefixes) -val distributedFreqPattern = postfixes.flatMap { postfix = - bcSmallPrefixes.value.values.map { prefix = -(prefix.id, postfix.project(prefix).compressed) - }.filter(_._2.nonEmpty) -}.groupByKey().flatMap { case (id, projPostfixes) = - val prefix = bcSmallPrefixes.value(id) - val localPrefixSpan = new LocalPrefixSpan(minCount, maxPatternLength - prefix.length) - // TODO: We collect projected postfixes into memory. We should also compare the performance - // TODO: of keeping them on shuffle files. - localPrefixSpan.run(projPostfixes.toArray).map { case (pattern, count) = -(prefix.items ++ pattern, count) +var freqPatterns = sc.parallelize(localFreqPatterns, 1) + +val numSmallPrefixes = smallPrefixes.size +logInfo(snumber of small prefixes for local processing: $numSmallPrefixes) +if (numSmallPrefixes 0) { + // Switch to local processing. + val bcSmallPrefixes = sc.broadcast(smallPrefixes) + val distributedFreqPattern = postfixes.flatMap { postfix = +bcSmallPrefixes.value.values.map { prefix = + (prefix.id, postfix.project(prefix).compressed) +}.filter(_._2.nonEmpty) + }.groupByKey().flatMap { case (id, projPostfixes) = +val prefix = bcSmallPrefixes.value(id) +val localPrefixSpan = new LocalPrefixSpan(minCount, maxPatternLength - prefix.length) +// TODO: We collect projected postfixes into memory. We should also compare the performance +// TODO: of keeping them on shuffle files. +localPrefixSpan.run(projPostfixes.toArray).map { case (pattern, count) = + (prefix.items ++ pattern, count) +} } + // Union local frequent patterns and distributed ones. + freqPatterns = freqPatterns ++ distributedFreqPattern } -// Union local frequent patterns and distributed ones. 
-val freqPatterns = (sc.parallelize(localFreqPatterns, 1) ++ distributedFreqPattern) - .persist(StorageLevel.MEMORY_AND_DISK) freqPatterns } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-9916] [BUILD] [SPARKR] removed left-over sparkr.zip copy/create commands from codebase
Repository: spark Updated Branches: refs/heads/master d7053bea9 - 2fb4901b7 [SPARK-9916] [BUILD] [SPARKR] removed left-over sparkr.zip copy/create commands from codebase sparkr.zip is now built by SparkSubmit on a need-to-build basis. cc shivaram Author: Burak Yavuz brk...@gmail.com Closes #8147 from brkyvz/make-dist-fix. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2fb4901b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2fb4901b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2fb4901b Branch: refs/heads/master Commit: 2fb4901b71cee65d40a43e61e3f4411c30cdefc3 Parents: d7053be Author: Burak Yavuz brk...@gmail.com Authored: Wed Aug 12 20:59:38 2015 -0700 Committer: Shivaram Venkataraman shiva...@cs.berkeley.edu Committed: Wed Aug 12 20:59:38 2015 -0700 -- R/install-dev.bat| 5 - make-distribution.sh | 1 - 2 files changed, 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2fb4901b/R/install-dev.bat -- diff --git a/R/install-dev.bat b/R/install-dev.bat index f32670b..008a5c6 100644 --- a/R/install-dev.bat +++ b/R/install-dev.bat @@ -25,8 +25,3 @@ set SPARK_HOME=%~dp0.. MKDIR %SPARK_HOME%\R\lib R.exe CMD INSTALL --library=%SPARK_HOME%\R\lib %SPARK_HOME%\R\pkg\ - -rem Zip the SparkR package so that it can be distributed to worker nodes on YARN -pushd %SPARK_HOME%\R\lib -%JAVA_HOME%\bin\jar.exe cfM %SPARK_HOME%\R\lib\sparkr.zip SparkR -popd http://git-wip-us.apache.org/repos/asf/spark/blob/2fb4901b/make-distribution.sh -- diff --git a/make-distribution.sh b/make-distribution.sh index 4789b0e..247a813 100755 --- a/make-distribution.sh +++ b/make-distribution.sh @@ -219,7 +219,6 @@ cp -r $SPARK_HOME/ec2 $DISTDIR if [ -d $SPARK_HOME/R/lib/SparkR ]; then mkdir -p $DISTDIR/R/lib cp -r $SPARK_HOME/R/lib/SparkR $DISTDIR/R/lib - cp $SPARK_HOME/R/lib/sparkr.zip $DISTDIR/R/lib fi # Download and copy in tachyon, if requested - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-9916] [BUILD] [SPARKR] removed left-over sparkr.zip copy/create commands from codebase
Repository: spark Updated Branches: refs/heads/branch-1.5 af470a757 - 3d1b9f007 [SPARK-9916] [BUILD] [SPARKR] removed left-over sparkr.zip copy/create commands from codebase sparkr.zip is now built by SparkSubmit on a need-to-build basis. cc shivaram Author: Burak Yavuz brk...@gmail.com Closes #8147 from brkyvz/make-dist-fix. (cherry picked from commit 2fb4901b71cee65d40a43e61e3f4411c30cdefc3) Signed-off-by: Shivaram Venkataraman shiva...@cs.berkeley.edu Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3d1b9f00 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3d1b9f00 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3d1b9f00 Branch: refs/heads/branch-1.5 Commit: 3d1b9f007b9b6a9bb4e146de32bd34affa723e12 Parents: af470a7 Author: Burak Yavuz brk...@gmail.com Authored: Wed Aug 12 20:59:38 2015 -0700 Committer: Shivaram Venkataraman shiva...@cs.berkeley.edu Committed: Wed Aug 12 20:59:47 2015 -0700 -- R/install-dev.bat| 5 - make-distribution.sh | 1 - 2 files changed, 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3d1b9f00/R/install-dev.bat -- diff --git a/R/install-dev.bat b/R/install-dev.bat index f32670b..008a5c6 100644 --- a/R/install-dev.bat +++ b/R/install-dev.bat @@ -25,8 +25,3 @@ set SPARK_HOME=%~dp0.. MKDIR %SPARK_HOME%\R\lib R.exe CMD INSTALL --library=%SPARK_HOME%\R\lib %SPARK_HOME%\R\pkg\ - -rem Zip the SparkR package so that it can be distributed to worker nodes on YARN -pushd %SPARK_HOME%\R\lib -%JAVA_HOME%\bin\jar.exe cfM %SPARK_HOME%\R\lib\sparkr.zip SparkR -popd http://git-wip-us.apache.org/repos/asf/spark/blob/3d1b9f00/make-distribution.sh -- diff --git a/make-distribution.sh b/make-distribution.sh index 8589255..04ad005 100755 --- a/make-distribution.sh +++ b/make-distribution.sh @@ -219,7 +219,6 @@ cp -r $SPARK_HOME/ec2 $DISTDIR if [ -d $SPARK_HOME/R/lib/SparkR ]; then mkdir -p $DISTDIR/R/lib cp -r $SPARK_HOME/R/lib/SparkR $DISTDIR/R/lib - cp $SPARK_HOME/R/lib/sparkr.zip $DISTDIR/R/lib fi # Download and copy in tachyon, if requested - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-9920] [SQL] The simpleString of TungstenAggregate does not show its output
Repository: spark Updated Branches: refs/heads/branch-1.5 3d1b9f007 - 3b1b8ea3e [SPARK-9920] [SQL] The simpleString of TungstenAggregate does not show its output https://issues.apache.org/jira/browse/SPARK-9920 Taking `sqlContext.sql(select i, sum(j1) as sum from testAgg group by i).explain()` as an example, the output of our current master is ``` == Physical Plan == TungstenAggregate(key=[i#0], value=[(sum(cast(j1#1 as bigint)),mode=Final,isDistinct=false)] TungstenExchange hashpartitioning(i#0) TungstenAggregate(key=[i#0], value=[(sum(cast(j1#1 as bigint)),mode=Partial,isDistinct=false)] Scan ParquetRelation[file:/user/hive/warehouse/testagg][i#0,j1#1] ``` With this PR, the output will be ``` == Physical Plan == TungstenAggregate(key=[i#0], functions=[(sum(cast(j1#1 as bigint)),mode=Final,isDistinct=false)], output=[i#0,sum#18L]) TungstenExchange hashpartitioning(i#0) TungstenAggregate(key=[i#0], functions=[(sum(cast(j1#1 as bigint)),mode=Partial,isDistinct=false)], output=[i#0,currentSum#22L]) Scan ParquetRelation[file:/user/hive/warehouse/testagg][i#0,j1#1] ``` Author: Yin Huai yh...@databricks.com Closes #8150 from yhuai/SPARK-9920. (cherry picked from commit 2278219054314f1d31ffc358a59aa5067f9f5de9) Signed-off-by: Reynold Xin r...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3b1b8ea3 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3b1b8ea3 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3b1b8ea3 Branch: refs/heads/branch-1.5 Commit: 3b1b8ea3e7dfb93e7017fbd97bd2794f4815c8f0 Parents: 3d1b9f0 Author: Yin Huai yh...@databricks.com Authored: Wed Aug 12 21:24:15 2015 -0700 Committer: Reynold Xin r...@databricks.com Committed: Wed Aug 12 21:24:23 2015 -0700 -- .../spark/sql/execution/aggregate/SortBasedAggregate.scala| 6 +- .../spark/sql/execution/aggregate/TungstenAggregate.scala | 7 --- 2 files changed, 9 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3b1b8ea3/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregate.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregate.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregate.scala index ab26f9c..f4c14a9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregate.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregate.scala @@ -108,6 +108,10 @@ case class SortBasedAggregate( override def simpleString: String = { val allAggregateExpressions = nonCompleteAggregateExpressions ++ completeAggregateExpressions -sSortBasedAggregate ${groupingExpressions} ${allAggregateExpressions} + +val keyString = groupingExpressions.mkString([, ,, ]) +val functionString = allAggregateExpressions.mkString([, ,, ]) +val outputString = output.mkString([, ,, ]) +sSortBasedAggregate(key=$keyString, functions=$functionString, output=$outputString) } } http://git-wip-us.apache.org/repos/asf/spark/blob/3b1b8ea3/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregate.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregate.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregate.scala index c40ca97..99f51ba 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregate.scala +++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregate.scala @@ -127,11 +127,12 @@ case class TungstenAggregate( testFallbackStartsAt match { case None = val keyString = groupingExpressions.mkString([, ,, ]) -val valueString = allAggregateExpressions.mkString([, ,, ]) -sTungstenAggregate(key=$keyString, value=$valueString +val functionString = allAggregateExpressions.mkString([, ,, ]) +val outputString = output.mkString([, ,, ]) +sTungstenAggregate(key=$keyString, functions=$functionString, output=$outputString) case Some(fallbackStartsAt) = sTungstenAggregateWithControlledFallback $groupingExpressions + - s$allAggregateExpressions fallbackStartsAt=$fallbackStartsAt + s$allAggregateExpressions $resultExpressions fallbackStartsAt=$fallbackStartsAt } } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-9914] [ML] define setters explicitly for Java and use setParam group in RFormula
Repository: spark Updated Branches: refs/heads/master df5438921 - d7eb371eb [SPARK-9914] [ML] define setters explicitly for Java and use setParam group in RFormula The problem with defining setters in the base class is that it doesn't return the correct type in Java. ericl Author: Xiangrui Meng m...@databricks.com Closes #8143 from mengxr/SPARK-9914 and squashes the following commits: d36c887 [Xiangrui Meng] remove setters from model a49021b [Xiangrui Meng] define setters explicitly for Java and use setParam group Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d7eb371e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d7eb371e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d7eb371e Branch: refs/heads/master Commit: d7eb371eb6369a34e58a09179efe058c4101de9e Parents: df54389 Author: Xiangrui Meng m...@databricks.com Authored: Wed Aug 12 22:30:33 2015 -0700 Committer: Xiangrui Meng m...@databricks.com Committed: Wed Aug 12 22:30:33 2015 -0700 -- .../scala/org/apache/spark/ml/feature/RFormula.scala | 11 ++- 1 file changed, 6 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d7eb371e/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala index d5360c9..a752dac 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala @@ -33,11 +33,6 @@ import org.apache.spark.sql.types._ * Base trait for [[RFormula]] and [[RFormulaModel]]. */ private[feature] trait RFormulaBase extends HasFeaturesCol with HasLabelCol { - /** @group getParam */ - def setFeaturesCol(value: String): this.type = set(featuresCol, value) - - /** @group getParam */ - def setLabelCol(value: String): this.type = set(labelCol, value) protected def hasLabelCol(schema: StructType): Boolean = { schema.map(_.name).contains($(labelCol)) @@ -71,6 +66,12 @@ class RFormula(override val uid: String) extends Estimator[RFormulaModel] with R /** @group getParam */ def getFormula: String = $(formula) + /** @group setParam */ + def setFeaturesCol(value: String): this.type = set(featuresCol, value) + + /** @group setParam */ + def setLabelCol(value: String): this.type = set(labelCol, value) + /** Whether the formula specifies fitting an intercept. */ private[ml] def hasIntercept: Boolean = { require(isDefined(formula), Formula must be defined first.) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
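The interop issue being fixed: a setter returning `this.type` declared on the private base trait presents the trait's type to Java callers, which breaks builder-style chaining on `RFormula`. With the setters declared on the class itself, chaining works from both languages. A usage sketch (assumed DataFrame `df` with columns `y`, `a`, `b`):

```
import org.apache.spark.ml.feature.RFormula

val formula = new RFormula()
  .setFormula("y ~ a + b")
  .setFeaturesCol("features") // now returns RFormula, not the base trait
  .setLabelCol("label")

val prepared = formula.fit(df).transform(df)
prepared.select("features", "label").show()
```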
spark git commit: [SPARK-9870] Disable driver UI and Master REST server in SparkSubmitSuite
Repository: spark Updated Branches: refs/heads/master f4bc01f1f - 7b13ed27c [SPARK-9870] Disable driver UI and Master REST server in SparkSubmitSuite I think that we should pass additional configuration flags to disable the driver UI and Master REST server in SparkSubmitSuite and HiveSparkSubmitSuite. This might cut down on port-contention-related flakiness in Jenkins. Author: Josh Rosen joshro...@databricks.com Closes #8124 from JoshRosen/disable-ui-in-sparksubmitsuite. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7b13ed27 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7b13ed27 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7b13ed27 Branch: refs/heads/master Commit: 7b13ed27c1296cf76d0946e400f3449c335c8471 Parents: f4bc01f Author: Josh Rosen joshro...@databricks.com Authored: Wed Aug 12 18:52:11 2015 -0700 Committer: Andrew Or and...@databricks.com Committed: Wed Aug 12 18:52:11 2015 -0700 -- .../scala/org/apache/spark/deploy/SparkSubmitSuite.scala | 7 +++ .../org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala | 10 +- 2 files changed, 16 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7b13ed27/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala index 2456c5d..1110ca6 100644 --- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala @@ -324,6 +324,8 @@ class SparkSubmitSuite --class, SimpleApplicationTest.getClass.getName.stripSuffix($), --name, testApp, --master, local, + --conf, spark.ui.enabled=false, + --conf, spark.master.rest.enabled=false, unusedJar.toString) runSparkSubmit(args) } @@ -337,6 +339,8 @@ class SparkSubmitSuite --class, JarCreationTest.getClass.getName.stripSuffix($), --name, testApp, --master, local-cluster[2,1,1024], + --conf, spark.ui.enabled=false, + --conf, spark.master.rest.enabled=false, --jars, jarsString, unusedJar.toString, SparkSubmitClassA, SparkSubmitClassB) runSparkSubmit(args) @@ -355,6 +359,7 @@ class SparkSubmitSuite --packages, Seq(main, dep).mkString(,), --repositories, repo, --conf, spark.ui.enabled=false, +--conf, spark.master.rest.enabled=false, unusedJar.toString, my.great.lib.MyLib, my.great.dep.MyLib) runSparkSubmit(args) @@ -500,6 +505,8 @@ class SparkSubmitSuite --master, local, --conf, spark.driver.extraClassPath= + systemJar, --conf, spark.driver.userClassPathFirst=true, + --conf, spark.ui.enabled=false, + --conf, spark.master.rest.enabled=false, userJar.toString) runSparkSubmit(args) } http://git-wip-us.apache.org/repos/asf/spark/blob/7b13ed27/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala index b8d4106..1e1972d 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala @@ -57,6 +57,8 @@ class HiveSparkSubmitSuite --class, SparkSubmitClassLoaderTest.getClass.getName.stripSuffix($), --name, SparkSubmitClassLoaderTest, --master, local-cluster[2,1,1024], + --conf, spark.ui.enabled=false, + --conf, spark.master.rest.enabled=false, --jars, jarsString, 
unusedJar.toString, SparkSubmitClassA, SparkSubmitClassB) runSparkSubmit(args) @@ -68,6 +70,8 @@ class HiveSparkSubmitSuite --class, SparkSQLConfTest.getClass.getName.stripSuffix($), --name, SparkSQLConfTest, --master, local-cluster[2,1,1024], + --conf, spark.ui.enabled=false, + --conf, spark.master.rest.enabled=false, unusedJar.toString) runSparkSubmit(args) } @@ -79,7 +83,11 @@ class HiveSparkSubmitSuite // the HiveContext code mistakenly overrides the class loader that contains user classes. // For more detail, see sql/hive/src/test/resources/regression-test-SPARK-8489/*scala. val testJar = sql/hive/src/test/resources/regression-test-SPARK-8489/test.jar -val args = Seq(--class, Main, testJar) +val args = Seq( + --conf, spark.ui.enabled=false, + --conf, spark.master.rest.enabled=false, +
spark git commit: [SPARK-9870] Disable driver UI and Master REST server in SparkSubmitSuite
Repository: spark Updated Branches: refs/heads/branch-1.5 ca39c9e91 - 4b547b91d [SPARK-9870] Disable driver UI and Master REST server in SparkSubmitSuite I think that we should pass additional configuration flags to disable the driver UI and Master REST server in SparkSubmitSuite and HiveSparkSubmitSuite. This might cut down on port-contention-related flakiness in Jenkins. Author: Josh Rosen joshro...@databricks.com Closes #8124 from JoshRosen/disable-ui-in-sparksubmitsuite. (cherry picked from commit 7b13ed27c1296cf76d0946e400f3449c335c8471) Signed-off-by: Andrew Or and...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4b547b91 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4b547b91 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4b547b91 Branch: refs/heads/branch-1.5 Commit: 4b547b91d6786b2478cf9068023217c979372e79 Parents: ca39c9e Author: Josh Rosen joshro...@databricks.com Authored: Wed Aug 12 18:52:11 2015 -0700 Committer: Andrew Or and...@databricks.com Committed: Wed Aug 12 18:52:20 2015 -0700 -- .../scala/org/apache/spark/deploy/SparkSubmitSuite.scala | 7 +++ .../org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala | 10 +- 2 files changed, 16 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4b547b91/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala index 2456c5d..1110ca6 100644 --- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala @@ -324,6 +324,8 @@ class SparkSubmitSuite --class, SimpleApplicationTest.getClass.getName.stripSuffix($), --name, testApp, --master, local, + --conf, spark.ui.enabled=false, + --conf, spark.master.rest.enabled=false, unusedJar.toString) runSparkSubmit(args) } @@ -337,6 +339,8 @@ class SparkSubmitSuite --class, JarCreationTest.getClass.getName.stripSuffix($), --name, testApp, --master, local-cluster[2,1,1024], + --conf, spark.ui.enabled=false, + --conf, spark.master.rest.enabled=false, --jars, jarsString, unusedJar.toString, SparkSubmitClassA, SparkSubmitClassB) runSparkSubmit(args) @@ -355,6 +359,7 @@ class SparkSubmitSuite --packages, Seq(main, dep).mkString(,), --repositories, repo, --conf, spark.ui.enabled=false, +--conf, spark.master.rest.enabled=false, unusedJar.toString, my.great.lib.MyLib, my.great.dep.MyLib) runSparkSubmit(args) @@ -500,6 +505,8 @@ class SparkSubmitSuite --master, local, --conf, spark.driver.extraClassPath= + systemJar, --conf, spark.driver.userClassPathFirst=true, + --conf, spark.ui.enabled=false, + --conf, spark.master.rest.enabled=false, userJar.toString) runSparkSubmit(args) } http://git-wip-us.apache.org/repos/asf/spark/blob/4b547b91/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala index b8d4106..1e1972d 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala @@ -57,6 +57,8 @@ class HiveSparkSubmitSuite --class, SparkSubmitClassLoaderTest.getClass.getName.stripSuffix($), --name, SparkSubmitClassLoaderTest, --master, 
local-cluster[2,1,1024], + --conf, spark.ui.enabled=false, + --conf, spark.master.rest.enabled=false, --jars, jarsString, unusedJar.toString, SparkSubmitClassA, SparkSubmitClassB) runSparkSubmit(args) @@ -68,6 +70,8 @@ class HiveSparkSubmitSuite --class, SparkSQLConfTest.getClass.getName.stripSuffix($), --name, SparkSQLConfTest, --master, local-cluster[2,1,1024], + --conf, spark.ui.enabled=false, + --conf, spark.master.rest.enabled=false, unusedJar.toString) runSparkSubmit(args) } @@ -79,7 +83,11 @@ class HiveSparkSubmitSuite // the HiveContext code mistakenly overrides the class loader that contains user classes. // For more detail, see sql/hive/src/test/resources/regression-test-SPARK-8489/*scala. val testJar = sql/hive/src/test/resources/regression-test-SPARK-8489/test.jar -val args = Seq(--class, Main,
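The same two properties can also be set programmatically; a self-contained sketch (an illustrative app, not part of the patch) of running a job with the driver UI and the standalone Master's REST server disabled, so parallel test JVMs do not contend for the default ports (4040 for the UI, 6066 for the REST server):

import org.apache.spark.{SparkConf, SparkContext}

object PortQuietJob {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local")
      .setAppName("port-quiet-test")
      .set("spark.ui.enabled", "false")          // no driver web UI, nothing binds 4040
      .set("spark.master.rest.enabled", "false") // no standalone-Master REST submission port
    val sc = new SparkContext(conf)
    try {
      assert(sc.parallelize(1 to 10).count() == 10)
    } finally {
      sc.stop()
    }
  }
}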
[2/2] spark git commit: Preparing development version 1.5.0-SNAPSHOT
Preparing development version 1.5.0-SNAPSHOT Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8f055e59 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8f055e59 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8f055e59 Branch: refs/heads/branch-1.5 Commit: 8f055e59587f25d0c24b0447a3d741535c2c2702 Parents: cedce9b Author: Patrick Wendell pwend...@gmail.com Authored: Wed Aug 12 21:43:13 2015 -0700 Committer: Patrick Wendell pwend...@gmail.com Committed: Wed Aug 12 21:43:13 2015 -0700 -- assembly/pom.xml| 2 +- bagel/pom.xml | 2 +- core/pom.xml| 2 +- examples/pom.xml| 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka-assembly/pom.xml | 2 +- external/kafka/pom.xml | 2 +- external/mqtt-assembly/pom.xml | 2 +- external/mqtt/pom.xml | 2 +- external/twitter/pom.xml| 2 +- external/zeromq/pom.xml | 2 +- extras/java8-tests/pom.xml | 2 +- extras/kinesis-asl-assembly/pom.xml | 2 +- extras/kinesis-asl/pom.xml | 2 +- extras/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- launcher/pom.xml| 2 +- mllib/pom.xml | 2 +- network/common/pom.xml | 2 +- network/shuffle/pom.xml | 2 +- network/yarn/pom.xml| 2 +- pom.xml | 2 +- repl/pom.xml| 2 +- sql/catalyst/pom.xml| 2 +- sql/core/pom.xml| 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml| 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- unsafe/pom.xml | 2 +- yarn/pom.xml| 2 +- 33 files changed, 33 insertions(+), 33 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8f055e59/assembly/pom.xml -- diff --git a/assembly/pom.xml b/assembly/pom.xml index 1548c1d..e9c6d26 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ parent groupIdorg.apache.spark/groupId artifactIdspark-parent_2.10/artifactId -version1.5.0-preview/version +version1.5.0-SNAPSHOT/version relativePath../pom.xml/relativePath /parent http://git-wip-us.apache.org/repos/asf/spark/blob/8f055e59/bagel/pom.xml -- diff --git a/bagel/pom.xml b/bagel/pom.xml index 594d27b..ed5c37e 100644 --- a/bagel/pom.xml +++ b/bagel/pom.xml @@ -21,7 +21,7 @@ parent groupIdorg.apache.spark/groupId artifactIdspark-parent_2.10/artifactId -version1.5.0-preview/version +version1.5.0-SNAPSHOT/version relativePath../pom.xml/relativePath /parent http://git-wip-us.apache.org/repos/asf/spark/blob/8f055e59/core/pom.xml -- diff --git a/core/pom.xml b/core/pom.xml index 227ba55..4f79d71 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ parent groupIdorg.apache.spark/groupId artifactIdspark-parent_2.10/artifactId -version1.5.0-preview/version +version1.5.0-SNAPSHOT/version relativePath../pom.xml/relativePath /parent http://git-wip-us.apache.org/repos/asf/spark/blob/8f055e59/examples/pom.xml -- diff --git a/examples/pom.xml b/examples/pom.xml index d3b7cf7..e6884b0 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ parent groupIdorg.apache.spark/groupId artifactIdspark-parent_2.10/artifactId -version1.5.0-preview/version +version1.5.0-SNAPSHOT/version relativePath../pom.xml/relativePath /parent http://git-wip-us.apache.org/repos/asf/spark/blob/8f055e59/external/flume-assembly/pom.xml -- diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index 4653f4e..1318959 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -21,7 +21,7 @@ parent groupIdorg.apache.spark/groupId artifactIdspark-parent_2.10/artifactId -version1.5.0-preview/version 
+version1.5.0-SNAPSHOT/version relativePath../../pom.xml/relativePath /parent http://git-wip-us.apache.org/repos/asf/spark/blob/8f055e59/external/flume-sink/pom.xml -- diff --git a/external/flume-sink/pom.xml
Git Push Summary
Repository: spark Updated Tags: refs/tags/v1.5.0-preview-20150812 [created] cedce9bdb
[1/2] spark git commit: Preparing Spark release v1.5.0-preview-20150812
Repository: spark Updated Branches: refs/heads/branch-1.5 16f4bf4ca - 8f055e595 Preparing Spark release v1.5.0-preview-20150812 Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/cedce9bd Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/cedce9bd Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/cedce9bd Branch: refs/heads/branch-1.5 Commit: cedce9bdb72a00cbcbcc81d57f2a550eaf4416e8 Parents: 16f4bf4 Author: Patrick Wendell pwend...@gmail.com Authored: Wed Aug 12 21:42:59 2015 -0700 Committer: Patrick Wendell pwend...@gmail.com Committed: Wed Aug 12 21:42:59 2015 -0700 -- assembly/pom.xml| 2 +- bagel/pom.xml | 2 +- core/pom.xml| 2 +- examples/pom.xml| 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka-assembly/pom.xml | 2 +- external/kafka/pom.xml | 2 +- external/mqtt-assembly/pom.xml | 2 +- external/mqtt/pom.xml | 2 +- external/twitter/pom.xml| 2 +- external/zeromq/pom.xml | 2 +- extras/java8-tests/pom.xml | 2 +- extras/kinesis-asl-assembly/pom.xml | 2 +- extras/kinesis-asl/pom.xml | 2 +- extras/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- launcher/pom.xml| 2 +- mllib/pom.xml | 2 +- network/common/pom.xml | 2 +- network/shuffle/pom.xml | 2 +- network/yarn/pom.xml| 2 +- pom.xml | 2 +- repl/pom.xml| 2 +- sql/catalyst/pom.xml| 2 +- sql/core/pom.xml| 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml| 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- unsafe/pom.xml | 2 +- yarn/pom.xml| 2 +- 33 files changed, 33 insertions(+), 33 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/cedce9bd/assembly/pom.xml -- diff --git a/assembly/pom.xml b/assembly/pom.xml index e9c6d26..1548c1d 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ parent groupIdorg.apache.spark/groupId artifactIdspark-parent_2.10/artifactId -version1.5.0-SNAPSHOT/version +version1.5.0-preview/version relativePath../pom.xml/relativePath /parent http://git-wip-us.apache.org/repos/asf/spark/blob/cedce9bd/bagel/pom.xml -- diff --git a/bagel/pom.xml b/bagel/pom.xml index ed5c37e..594d27b 100644 --- a/bagel/pom.xml +++ b/bagel/pom.xml @@ -21,7 +21,7 @@ parent groupIdorg.apache.spark/groupId artifactIdspark-parent_2.10/artifactId -version1.5.0-SNAPSHOT/version +version1.5.0-preview/version relativePath../pom.xml/relativePath /parent http://git-wip-us.apache.org/repos/asf/spark/blob/cedce9bd/core/pom.xml -- diff --git a/core/pom.xml b/core/pom.xml index 4f79d71..227ba55 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ parent groupIdorg.apache.spark/groupId artifactIdspark-parent_2.10/artifactId -version1.5.0-SNAPSHOT/version +version1.5.0-preview/version relativePath../pom.xml/relativePath /parent http://git-wip-us.apache.org/repos/asf/spark/blob/cedce9bd/examples/pom.xml -- diff --git a/examples/pom.xml b/examples/pom.xml index e6884b0..d3b7cf7 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ parent groupIdorg.apache.spark/groupId artifactIdspark-parent_2.10/artifactId -version1.5.0-SNAPSHOT/version +version1.5.0-preview/version relativePath../pom.xml/relativePath /parent http://git-wip-us.apache.org/repos/asf/spark/blob/cedce9bd/external/flume-assembly/pom.xml -- diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index 1318959..4653f4e 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -21,7 +21,7 @@ parent 
groupIdorg.apache.spark/groupId artifactIdspark-parent_2.10/artifactId -version1.5.0-SNAPSHOT/version +version1.5.0-preview/version relativePath../../pom.xml/relativePath /parent http://git-wip-us.apache.org/repos/asf/spark/blob/cedce9bd/external/flume-sink/pom.xml
spark git commit: [SPARK-9927] [SQL] Revert 8049 since it's pushing wrong filter down
Repository: spark Updated Branches: refs/heads/branch-1.5 690284037 - 694e7a3c4 [SPARK-9927] [SQL] Revert 8049 since it's pushing wrong filter down I made a mistake in #8049 by casting literal value to attribute's data type, which would cause simply truncate the literal value and push a wrong filter down. JIRA: https://issues.apache.org/jira/browse/SPARK-9927 Author: Yijie Shen henry.yijies...@gmail.com Closes #8157 from yjshen/rever8049. (cherry picked from commit d0b18919d16e6a2f19159516bd2767b60b595279) Signed-off-by: Cheng Lian l...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/694e7a3c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/694e7a3c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/694e7a3c Branch: refs/heads/branch-1.5 Commit: 694e7a3c45eefbe046a5254a9196f532dc979743 Parents: 6902840 Author: Yijie Shen henry.yijies...@gmail.com Authored: Thu Aug 13 13:33:39 2015 +0800 Committer: Cheng Lian l...@databricks.com Committed: Thu Aug 13 13:34:29 2015 +0800 -- .../datasources/DataSourceStrategy.scala| 30 ++--- .../execution/datasources/jdbc/JDBCRDD.scala| 2 +- .../org/apache/spark/sql/jdbc/JDBCSuite.scala | 35 3 files changed, 3 insertions(+), 64 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/694e7a3c/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala index 9eea2b0..2a4c40d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala @@ -20,13 +20,13 @@ package org.apache.spark.sql.execution.datasources import org.apache.spark.{Logging, TaskContext} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.rdd.{MapPartitionsRDD, RDD, UnionRDD} -import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow, expressions} +import org.apache.spark.sql.catalyst.{InternalRow, expressions} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.planning.PhysicalOperation import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.sources._ -import org.apache.spark.sql.types.{TimestampType, DateType, StringType, StructType} +import org.apache.spark.sql.types.{StringType, StructType} import org.apache.spark.sql.{SaveMode, Strategy, execution, sources, _} import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.util.{SerializableConfiguration, Utils} @@ -343,17 +343,11 @@ private[sql] object DataSourceStrategy extends Strategy with Logging { * and convert them. 
*/ protected[sql] def selectFilters(filters: Seq[Expression]) = { -import CatalystTypeConverters._ - def translate(predicate: Expression): Option[Filter] = predicate match { case expressions.EqualTo(a: Attribute, Literal(v, _)) = Some(sources.EqualTo(a.name, v)) case expressions.EqualTo(Literal(v, _), a: Attribute) = Some(sources.EqualTo(a.name, v)) - case expressions.EqualTo(Cast(a: Attribute, _), l: Literal) = -Some(sources.EqualTo(a.name, convertToScala(Cast(l, a.dataType).eval(), a.dataType))) - case expressions.EqualTo(l: Literal, Cast(a: Attribute, _)) = -Some(sources.EqualTo(a.name, convertToScala(Cast(l, a.dataType).eval(), a.dataType))) case expressions.EqualNullSafe(a: Attribute, Literal(v, _)) = Some(sources.EqualNullSafe(a.name, v)) @@ -364,41 +358,21 @@ private[sql] object DataSourceStrategy extends Strategy with Logging { Some(sources.GreaterThan(a.name, v)) case expressions.GreaterThan(Literal(v, _), a: Attribute) = Some(sources.LessThan(a.name, v)) - case expressions.GreaterThan(Cast(a: Attribute, _), l: Literal) = -Some(sources.GreaterThan(a.name, convertToScala(Cast(l, a.dataType).eval(), a.dataType))) - case expressions.GreaterThan(l: Literal, Cast(a: Attribute, _)) = -Some(sources.LessThan(a.name, convertToScala(Cast(l, a.dataType).eval(), a.dataType))) case expressions.LessThan(a: Attribute, Literal(v, _)) = Some(sources.LessThan(a.name, v)) case expressions.LessThan(Literal(v, _), a: Attribute) = Some(sources.GreaterThan(a.name, v)) - case expressions.LessThan(Cast(a: Attribute, _), l: Literal) = -
spark git commit: [SPARK-9806] [WEB UI] Don't share ReplayListenerBus between multiple applications
Repository: spark Updated Branches: refs/heads/master b85f9a242 - a807fcbe5 [SPARK-9806] [WEB UI] Don't share ReplayListenerBus between multiple applications Author: Rohit Agarwal roh...@qubole.com Closes #8088 from mindprince/SPARK-9806. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a807fcbe Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a807fcbe Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a807fcbe Branch: refs/heads/master Commit: a807fcbe50b2ce18751d80d39e9d21842f7da32a Parents: b85f9a2 Author: Rohit Agarwal roh...@qubole.com Authored: Tue Aug 11 23:20:39 2015 -0700 Committer: Andrew Or and...@databricks.com Committed: Tue Aug 11 23:20:39 2015 -0700 -- .../scala/org/apache/spark/deploy/history/FsHistoryProvider.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a807fcbe/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala index e3060ac..53c18ca 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala @@ -272,9 +272,9 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) * Replay the log files in the list and merge the list of old applications with new ones */ private def mergeApplicationListing(logs: Seq[FileStatus]): Unit = { -val bus = new ReplayListenerBus() val newAttempts = logs.flatMap { fileStatus => try { +val bus = new ReplayListenerBus() val res = replay(fileStatus, bus) res match { case Some(r) => logDebug(s"Application log ${r.logPath} loaded successfully.")
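The bug is easiest to see with a toy model: listeners registered on a shared bus while replaying one log stay registered when the next log is replayed, so applications bleed into each other. A simplified sketch with stand-in types, not the real history-server classes:

class ToyBus {
  private val listeners = scala.collection.mutable.Buffer[String => Unit]()
  def addListener(l: String => Unit): Unit = listeners += l
  def replay(events: Seq[String]): Unit = events.foreach(e => listeners.foreach(_(e)))
}

val logs = Seq(Seq("app1-start", "app1-end"), Seq("app2-start", "app2-end"))

// Shared bus (the old code): the listener added for app1 also receives app2's events.
val shared = new ToyBus
logs.foreach { log =>
  shared.addListener(e => println(s"shared bus saw $e"))
  shared.replay(log) // app2's events are delivered to both accumulated listeners
}

// Fresh bus per log (this commit): each replay is isolated.
logs.foreach { log =>
  val bus = new ToyBus
  bus.addListener(e => println(s"fresh bus saw $e"))
  bus.replay(log)
}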
spark git commit: [SPARK-9806] [WEB UI] Don't share ReplayListenerBus between multiple applications
Repository: spark Updated Branches: refs/heads/branch-1.5 2f909184e - 402c0ca9a [SPARK-9806] [WEB UI] Don't share ReplayListenerBus between multiple applications Author: Rohit Agarwal roh...@qubole.com Closes #8088 from mindprince/SPARK-9806. (cherry picked from commit a807fcbe50b2ce18751d80d39e9d21842f7da32a) Signed-off-by: Andrew Or and...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/402c0ca9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/402c0ca9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/402c0ca9 Branch: refs/heads/branch-1.5 Commit: 402c0ca9a36f738fc92d281f69ec5099f19ffbf8 Parents: 2f90918 Author: Rohit Agarwal roh...@qubole.com Authored: Tue Aug 11 23:20:39 2015 -0700 Committer: Andrew Or and...@databricks.com Committed: Tue Aug 11 23:20:55 2015 -0700 -- .../scala/org/apache/spark/deploy/history/FsHistoryProvider.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/402c0ca9/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala index e3060ac..53c18ca 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala @@ -272,9 +272,9 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) * Replay the log files in the list and merge the list of old applications with new ones */ private def mergeApplicationListing(logs: Seq[FileStatus]): Unit = { -val bus = new ReplayListenerBus() val newAttempts = logs.flatMap { fileStatus => try { +val bus = new ReplayListenerBus() val res = replay(fileStatus, bus) res match { case Some(r) => logDebug(s"Application log ${r.logPath} loaded successfully.")
spark git commit: [SPARK-9829] [WEBUI] Display the update value for peak execution memory
Repository: spark Updated Branches: refs/heads/master a807fcbe5 - 4e3f4b934 [SPARK-9829] [WEBUI] Display the update value for peak execution memory The peak execution memory is not correct because it shows the sum of finished tasks' values when a task finishes. This PR fixes it by using the update value rather than the accumulator value. Author: zsxwing zsxw...@gmail.com Closes #8121 from zsxwing/SPARK-9829. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4e3f4b93 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4e3f4b93 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4e3f4b93 Branch: refs/heads/master Commit: 4e3f4b934f74e8c7c06f4940d6381343f9fd4918 Parents: a807fcb Author: zsxwing zsxw...@gmail.com Authored: Tue Aug 11 23:23:17 2015 -0700 Committer: Andrew Or and...@databricks.com Committed: Tue Aug 11 23:23:17 2015 -0700 -- core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4e3f4b93/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala -- diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala index 0c94204..fb4556b 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala @@ -860,7 +860,7 @@ private[ui] class TaskDataSource( } val peakExecutionMemoryUsed = taskInternalAccumulables .find { acc => acc.name == InternalAccumulator.PEAK_EXECUTION_MEMORY } - .map { acc => acc.value.toLong } + .map { acc => acc.update.getOrElse(0).toLong } .getOrElse(0L) val maybeInput = metrics.flatMap(_.inputMetrics)
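The distinction the fix relies on: when a task ends, an accumulator's `update` carries only that task's contribution, while `value` is the running total across all finished tasks, so rendering `value` per task inflates the column. A toy model (the case class and accumulator name are illustrative, not Spark's AccumulableInfo API):

case class AccInfo(name: String, update: Option[Long], value: Long)

// Three tasks, each peaking at 100 bytes; `value` snowballs as tasks finish.
val atTaskEnd = Seq(
  AccInfo("peakExecutionMemory", Some(100L), value = 100L),
  AccInfo("peakExecutionMemory", Some(100L), value = 200L),
  AccInfo("peakExecutionMemory", Some(100L), value = 300L))

val buggyColumn = atTaskEnd.map(_.value)                // Seq(100, 200, 300) -- wrong per-task numbers
val fixedColumn = atTaskEnd.map(_.update.getOrElse(0L)) // Seq(100, 100, 100) -- each task's own peak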
spark git commit: [SPARK-9829] [WEBUI] Display the update value for peak execution memory
Repository: spark Updated Branches: refs/heads/branch-1.5 402c0ca9a - d9d4bdea2 [SPARK-9829] [WEBUI] Display the update value for peak execution memory The peak execution memory is not correct because it shows the sum of finished tasks' values when a task finishes. This PR fixes it by using the update value rather than the accumulator value. Author: zsxwing zsxw...@gmail.com Closes #8121 from zsxwing/SPARK-9829. (cherry picked from commit 4e3f4b934f74e8c7c06f4940d6381343f9fd4918) Signed-off-by: Andrew Or and...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d9d4bdea Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d9d4bdea Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d9d4bdea Branch: refs/heads/branch-1.5 Commit: d9d4bdea2d711c50273012206832c6ee1a8d90d6 Parents: 402c0ca Author: zsxwing zsxw...@gmail.com Authored: Tue Aug 11 23:23:17 2015 -0700 Committer: Andrew Or and...@databricks.com Committed: Tue Aug 11 23:23:31 2015 -0700 -- core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d9d4bdea/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala -- diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala index 0c94204..fb4556b 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala @@ -860,7 +860,7 @@ private[ui] class TaskDataSource( } val peakExecutionMemoryUsed = taskInternalAccumulables .find { acc => acc.name == InternalAccumulator.PEAK_EXECUTION_MEMORY } - .map { acc => acc.value.toLong } + .map { acc => acc.update.getOrElse(0).toLong } .getOrElse(0L) val maybeInput = metrics.flatMap(_.inputMetrics)
spark git commit: [SPARK-9575] [MESOS] Add documentation around Mesos shuffle service.
Repository: spark Updated Branches: refs/heads/master 5c99d8bf9 - 741a29f98 [SPARK-9575] [MESOS] Add documentation around Mesos shuffle service. andrewor14 Author: Timothy Chen tnac...@gmail.com Closes #7907 from tnachen/mesos_shuffle. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/741a29f9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/741a29f9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/741a29f9 Branch: refs/heads/master Commit: 741a29f98945538a475579ccc974cd42c1613be4 Parents: 5c99d8b Author: Timothy Chen tnac...@gmail.com Authored: Tue Aug 11 23:33:22 2015 -0700 Committer: Andrew Or and...@databricks.com Committed: Tue Aug 11 23:33:22 2015 -0700 -- docs/running-on-mesos.md | 14 ++ 1 file changed, 14 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/741a29f9/docs/running-on-mesos.md -- diff --git a/docs/running-on-mesos.md b/docs/running-on-mesos.md index 55e6d4e..cfd219a 100644 --- a/docs/running-on-mesos.md +++ b/docs/running-on-mesos.md @@ -216,6 +216,20 @@ node. Please refer to [Hadoop on Mesos](https://github.com/mesos/hadoop). In either case, HDFS runs separately from Hadoop MapReduce, without being scheduled through Mesos. +# Dynamic Resource Allocation with Mesos + +Mesos supports dynamic allocation only with coarse-grained mode, which can resize the number of executors based on statistics +of the application. While dynamic allocation supports both scaling up and scaling down the number of executors, the coarse-grained scheduler only supports scaling down +since it is already designed to run one executor per slave with the configured amount of resources. However, after scaling down the number of executors, the coarse-grained scheduler +can scale back up to the same number of executors when Spark signals that more executors are needed. + +Users who want to use this feature should launch the Mesos Shuffle Service, which +provides shuffle data cleanup functionality on top of the Shuffle Service, since Mesos doesn't yet support notifying another framework of its +termination. To launch/stop the Mesos Shuffle Service, please use the provided sbin/start-mesos-shuffle-service.sh and sbin/stop-mesos-shuffle-service.sh scripts accordingly. + +The Shuffle Service is expected to be running on each slave node that will run Spark executors. One way to easily achieve this with Mesos +is to launch the Shuffle Service with Marathon with a unique host constraint. # Configuration
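For concreteness, a sketch of the driver-side configuration that typically accompanies this setup; the property names are standard Spark settings, while the master URL and executor bounds are illustrative:

import org.apache.spark.SparkConf

val conf = new SparkConf()
  .setMaster("mesos://zk://zk1:2181/mesos")           // illustrative Mesos master URL
  .set("spark.mesos.coarse", "true")                  // dynamic allocation requires coarse-grained mode
  .set("spark.shuffle.service.enabled", "true")       // executors serve shuffle data via the external service
  .set("spark.dynamicAllocation.enabled", "true")
  .set("spark.dynamicAllocation.minExecutors", "2")   // illustrative bounds
  .set("spark.dynamicAllocation.maxExecutors", "20")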
spark git commit: [SPARK-8366] maxNumExecutorsNeeded should properly handle failed tasks
Repository: spark Updated Branches: refs/heads/branch-1.5 b994f8928 - 2f909184e [SPARK-8366] maxNumExecutorsNeeded should properly handle failed tasks Author: xutingjun xuting...@huawei.com Author: meiyoula 1039320...@qq.com Closes #6817 from XuTingjun/SPARK-8366. (cherry picked from commit b85f9a242a12e8096e331fa77d5ebd16e93c844d) Signed-off-by: Andrew Or and...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2f909184 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2f909184 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2f909184 Branch: refs/heads/branch-1.5 Commit: 2f909184e2b346ba920129619fe8d45b20ae0573 Parents: b994f89 Author: xutingjun xuting...@huawei.com Authored: Tue Aug 11 23:19:35 2015 -0700 Committer: Andrew Or and...@databricks.com Committed: Tue Aug 11 23:19:45 2015 -0700 -- .../spark/ExecutorAllocationManager.scala | 22 +--- .../spark/ExecutorAllocationManagerSuite.scala | 22 ++-- 2 files changed, 34 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2f909184/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala -- diff --git a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala index 1877aaf..b93536e 100644 --- a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala +++ b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala @@ -599,14 +599,8 @@ private[spark] class ExecutorAllocationManager( // If this is the last pending task, mark the scheduler queue as empty stageIdToTaskIndices.getOrElseUpdate(stageId, new mutable.HashSet[Int]) += taskIndex -val numTasksScheduled = stageIdToTaskIndices(stageId).size -val numTasksTotal = stageIdToNumTasks.getOrElse(stageId, -1) -if (numTasksScheduled == numTasksTotal) { - // No more pending tasks for this stage - stageIdToNumTasks -= stageId - if (stageIdToNumTasks.isEmpty) { -allocationManager.onSchedulerQueueEmpty() - } +if (totalPendingTasks() == 0) { + allocationManager.onSchedulerQueueEmpty() } // Mark the executor on which this task is scheduled as busy @@ -618,6 +612,8 @@ private[spark] class ExecutorAllocationManager( override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = { val executorId = taskEnd.taskInfo.executorId val taskId = taskEnd.taskInfo.taskId + val taskIndex = taskEnd.taskInfo.index + val stageId = taskEnd.stageId allocationManager.synchronized { numRunningTasks -= 1 // If the executor is no longer running any scheduled tasks, mark it as idle @@ -628,6 +624,16 @@ private[spark] class ExecutorAllocationManager( allocationManager.onExecutorIdle(executorId) } } + +// If the task failed, we expect it to be resubmitted later. 
To ensure we have +// enough resources to run the resubmitted task, we need to mark the scheduler +// as backlogged again if it's not already marked as such (SPARK-8366) +if (taskEnd.reason != Success) { + if (totalPendingTasks() == 0) { +allocationManager.onSchedulerBacklogged() + } + stageIdToTaskIndices.get(stageId).foreach { _.remove(taskIndex) } +} } } http://git-wip-us.apache.org/repos/asf/spark/blob/2f909184/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala b/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala index 34caca8..f374f97 100644 --- a/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala @@ -206,8 +206,8 @@ class ExecutorAllocationManagerSuite val task2Info = createTaskInfo(1, 0, executor-1) sc.listenerBus.postToAll(SparkListenerTaskStart(2, 0, task2Info)) -sc.listenerBus.postToAll(SparkListenerTaskEnd(2, 0, null, null, task1Info, null)) -sc.listenerBus.postToAll(SparkListenerTaskEnd(2, 0, null, null, task2Info, null)) +sc.listenerBus.postToAll(SparkListenerTaskEnd(2, 0, null, Success, task1Info, null)) +sc.listenerBus.postToAll(SparkListenerTaskEnd(2, 0, null, Success, task2Info, null)) assert(adjustRequestedExecutors(manager) === -1) } @@ -787,6 +787,24 @@ class ExecutorAllocationManagerSuite Map(host2 - 1, host3
spark git commit: [SPARK-8366] maxNumExecutorsNeeded should properly handle failed tasks
Repository: spark Updated Branches: refs/heads/master b1581ac28 - b85f9a242 [SPARK-8366] maxNumExecutorsNeeded should properly handle failed tasks Author: xutingjun xuting...@huawei.com Author: meiyoula 1039320...@qq.com Closes #6817 from XuTingjun/SPARK-8366. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b85f9a24 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b85f9a24 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b85f9a24 Branch: refs/heads/master Commit: b85f9a242a12e8096e331fa77d5ebd16e93c844d Parents: b1581ac Author: xutingjun xuting...@huawei.com Authored: Tue Aug 11 23:19:35 2015 -0700 Committer: Andrew Or and...@databricks.com Committed: Tue Aug 11 23:19:35 2015 -0700 -- .../spark/ExecutorAllocationManager.scala | 22 +--- .../spark/ExecutorAllocationManagerSuite.scala | 22 ++-- 2 files changed, 34 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b85f9a24/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala -- diff --git a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala index 1877aaf..b93536e 100644 --- a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala +++ b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala @@ -599,14 +599,8 @@ private[spark] class ExecutorAllocationManager( // If this is the last pending task, mark the scheduler queue as empty stageIdToTaskIndices.getOrElseUpdate(stageId, new mutable.HashSet[Int]) += taskIndex -val numTasksScheduled = stageIdToTaskIndices(stageId).size -val numTasksTotal = stageIdToNumTasks.getOrElse(stageId, -1) -if (numTasksScheduled == numTasksTotal) { - // No more pending tasks for this stage - stageIdToNumTasks -= stageId - if (stageIdToNumTasks.isEmpty) { -allocationManager.onSchedulerQueueEmpty() - } +if (totalPendingTasks() == 0) { + allocationManager.onSchedulerQueueEmpty() } // Mark the executor on which this task is scheduled as busy @@ -618,6 +612,8 @@ private[spark] class ExecutorAllocationManager( override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = { val executorId = taskEnd.taskInfo.executorId val taskId = taskEnd.taskInfo.taskId + val taskIndex = taskEnd.taskInfo.index + val stageId = taskEnd.stageId allocationManager.synchronized { numRunningTasks -= 1 // If the executor is no longer running any scheduled tasks, mark it as idle @@ -628,6 +624,16 @@ private[spark] class ExecutorAllocationManager( allocationManager.onExecutorIdle(executorId) } } + +// If the task failed, we expect it to be resubmitted later. 
To ensure we have +// enough resources to run the resubmitted task, we need to mark the scheduler +// as backlogged again if it's not already marked as such (SPARK-8366) +if (taskEnd.reason != Success) { + if (totalPendingTasks() == 0) { +allocationManager.onSchedulerBacklogged() + } + stageIdToTaskIndices.get(stageId).foreach { _.remove(taskIndex) } +} } } http://git-wip-us.apache.org/repos/asf/spark/blob/b85f9a24/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala b/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala index 34caca8..f374f97 100644 --- a/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala @@ -206,8 +206,8 @@ class ExecutorAllocationManagerSuite val task2Info = createTaskInfo(1, 0, executor-1) sc.listenerBus.postToAll(SparkListenerTaskStart(2, 0, task2Info)) -sc.listenerBus.postToAll(SparkListenerTaskEnd(2, 0, null, null, task1Info, null)) -sc.listenerBus.postToAll(SparkListenerTaskEnd(2, 0, null, null, task2Info, null)) +sc.listenerBus.postToAll(SparkListenerTaskEnd(2, 0, null, Success, task1Info, null)) +sc.listenerBus.postToAll(SparkListenerTaskEnd(2, 0, null, Success, task2Info, null)) assert(adjustRequestedExecutors(manager) === -1) } @@ -787,6 +787,24 @@ class ExecutorAllocationManagerSuite Map(host2 - 1, host3 - 2, host4 - 1, host5 - 2)) } + test(SPARK-8366: maxNumExecutorsNeeded should properly handle failed tasks) { +sc
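A toy model of the accounting change (illustrative names, not the real ExecutorAllocationManager): a failed task's index is dropped from the started set so it counts as pending again, which is what lets the manager re-request executors for the retry:

import scala.collection.mutable

val totalTasks = 4
val startedTaskIndices = mutable.Set[Int]()
def totalPendingTasks(): Int = totalTasks - startedTaskIndices.size

def onTaskStart(index: Int): Unit = startedTaskIndices += index
def onTaskEnd(index: Int, succeeded: Boolean): Unit = {
  if (!succeeded) {
    startedTaskIndices -= index // the task will be resubmitted, so it is pending again
  }
}

(0 until totalTasks).foreach(onTaskStart)
assert(totalPendingTasks() == 0)  // everything scheduled, the queue looks empty
onTaskEnd(3, succeeded = false)
assert(totalPendingTasks() == 1)  // the retry still needs an executor slot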
spark git commit: [SPARK-8798] [MESOS] Allow additional uris to be fetched with mesos
Repository: spark Updated Branches: refs/heads/branch-1.5 93fc95934 - a2f805729 [SPARK-8798] [MESOS] Allow additional uris to be fetched with mesos Some users like to download additional files in their sandbox that they can refer to from their spark program, or even later mount these files to another directory. Author: Timothy Chen tnac...@gmail.com Closes #7195 from tnachen/mesos_files. (cherry picked from commit 5c99d8bf98cbf7f568345d02a814fc318cbfca75) Signed-off-by: Andrew Or and...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a2f80572 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a2f80572 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a2f80572 Branch: refs/heads/branch-1.5 Commit: a2f805729b401c68b60bd690ad02533b8db57b58 Parents: 93fc959 Author: Timothy Chen tnac...@gmail.com Authored: Tue Aug 11 23:26:33 2015 -0700 Committer: Andrew Or and...@databricks.com Committed: Tue Aug 11 23:26:48 2015 -0700 -- .../cluster/mesos/CoarseMesosSchedulerBackend.scala | 5 + .../scheduler/cluster/mesos/MesosClusterScheduler.scala | 3 +++ .../scheduler/cluster/mesos/MesosSchedulerBackend.scala | 5 + .../spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala | 6 ++ docs/running-on-mesos.md | 8 5 files changed, 27 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a2f80572/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala index 15a0915..d6e1e9e 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala @@ -194,6 +194,11 @@ private[spark] class CoarseMesosSchedulerBackend( s --app-id $appId) command.addUris(CommandInfo.URI.newBuilder().setValue(uri.get)) } + +conf.getOption(spark.mesos.uris).map { uris = + setupUris(uris, command) +} + command.build() } http://git-wip-us.apache.org/repos/asf/spark/blob/a2f80572/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala index f078547..64ec2b8 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala @@ -403,6 +403,9 @@ private[spark] class MesosClusterScheduler( } builder.setValue(s$executable $cmdOptions $jar $appArguments) builder.setEnvironment(envBuilder.build()) +conf.getOption(spark.mesos.uris).map { uris = + setupUris(uris, builder) +} builder.build() } http://git-wip-us.apache.org/repos/asf/spark/blob/a2f80572/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala index 3f63ec1..5c20606 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala +++ 
b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala @@ -133,6 +133,11 @@ private[spark] class MesosSchedulerBackend( builder.addAllResources(usedCpuResources) builder.addAllResources(usedMemResources) + +sc.conf.getOption(spark.mesos.uris).map { uris = + setupUris(uris, command) +} + val executorInfo = builder .setExecutorId(ExecutorID.newBuilder().setValue(execId).build()) .setCommand(command) http://git-wip-us.apache.org/repos/asf/spark/blob/a2f80572/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala index c04920e..5b854aa 100644 ---
spark git commit: [SPARK-8798] [MESOS] Allow additional uris to be fetched with mesos
Repository: spark Updated Branches: refs/heads/master bab892328 - 5c99d8bf9 [SPARK-8798] [MESOS] Allow additional uris to be fetched with mesos Some users like to download additional files in their sandbox that they can refer to from their spark program, or even later mount these files to another directory. Author: Timothy Chen tnac...@gmail.com Closes #7195 from tnachen/mesos_files. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5c99d8bf Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5c99d8bf Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5c99d8bf Branch: refs/heads/master Commit: 5c99d8bf98cbf7f568345d02a814fc318cbfca75 Parents: bab8923 Author: Timothy Chen tnac...@gmail.com Authored: Tue Aug 11 23:26:33 2015 -0700 Committer: Andrew Or and...@databricks.com Committed: Tue Aug 11 23:26:33 2015 -0700 -- .../cluster/mesos/CoarseMesosSchedulerBackend.scala | 5 + .../scheduler/cluster/mesos/MesosClusterScheduler.scala | 3 +++ .../scheduler/cluster/mesos/MesosSchedulerBackend.scala | 5 + .../spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala | 6 ++ docs/running-on-mesos.md | 8 5 files changed, 27 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5c99d8bf/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala index 15a0915..d6e1e9e 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala @@ -194,6 +194,11 @@ private[spark] class CoarseMesosSchedulerBackend( s --app-id $appId) command.addUris(CommandInfo.URI.newBuilder().setValue(uri.get)) } + +conf.getOption(spark.mesos.uris).map { uris = + setupUris(uris, command) +} + command.build() } http://git-wip-us.apache.org/repos/asf/spark/blob/5c99d8bf/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala index f078547..64ec2b8 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala @@ -403,6 +403,9 @@ private[spark] class MesosClusterScheduler( } builder.setValue(s$executable $cmdOptions $jar $appArguments) builder.setEnvironment(envBuilder.build()) +conf.getOption(spark.mesos.uris).map { uris = + setupUris(uris, builder) +} builder.build() } http://git-wip-us.apache.org/repos/asf/spark/blob/5c99d8bf/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala index 3f63ec1..5c20606 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala @@ -133,6 +133,11 @@ private[spark] class MesosSchedulerBackend( 
builder.addAllResources(usedCpuResources) builder.addAllResources(usedMemResources) + +sc.conf.getOption(spark.mesos.uris).map { uris = + setupUris(uris, command) +} + val executorInfo = builder .setExecutorId(ExecutorID.newBuilder().setValue(execId).build()) .setCommand(command) http://git-wip-us.apache.org/repos/asf/spark/blob/5c99d8bf/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala index c04920e..5b854aa 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala +++
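Usage sketch for the new property (the URIs below are placeholders): spark.mesos.uris takes a comma-separated list that the Mesos fetcher downloads into each sandbox before the command is launched, so tasks can open the files by bare name from the working directory:

import org.apache.spark.SparkConf

val conf = new SparkConf()
  .setMaster("mesos://mesos-master:5050")  // illustrative
  .set("spark.mesos.uris",
    "http://example.com/conf/log4j.properties,hdfs://namenode:8020/data/lookup.dat")
// After the fetch, the files sit in the sandbox working directory, e.g.:
//   new java.io.File("lookup.dat")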
spark git commit: [SPARK-9806] [WEB UI] Don't share ReplayListenerBus between multiple applications
Repository: spark Updated Branches: refs/heads/branch-1.4 6dde38026 - 89c8aea94 [SPARK-9806] [WEB UI] Don't share ReplayListenerBus between multiple applications Author: Rohit Agarwal roh...@qubole.com Closes #8088 from mindprince/SPARK-9806. (cherry picked from commit a807fcbe50b2ce18751d80d39e9d21842f7da32a) Signed-off-by: Andrew Or and...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/89c8aea9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/89c8aea9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/89c8aea9 Branch: refs/heads/branch-1.4 Commit: 89c8aea94cfc4b2945d0739397a14333bc64c2ae Parents: 6dde380 Author: Rohit Agarwal roh...@qubole.com Authored: Tue Aug 11 23:20:39 2015 -0700 Committer: Andrew Or and...@databricks.com Committed: Tue Aug 11 23:22:07 2015 -0700 -- .../scala/org/apache/spark/deploy/history/FsHistoryProvider.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/89c8aea9/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala index b1e43fc..4483718 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala @@ -217,9 +217,9 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) * Replay the log files in the list and merge the list of old applications with new ones */ private def mergeApplicationListing(logs: Seq[FileStatus]): Unit = { -val bus = new ReplayListenerBus() val newAttempts = logs.flatMap { fileStatus => try { +val bus = new ReplayListenerBus() val res = replay(fileStatus, bus) res match { case Some(r) => logDebug(s"Application log ${r.logPath} loaded successfully.")
spark git commit: [SPARK-9426] [WEBUI] Job page DAG visualization is not shown
Repository: spark Updated Branches: refs/heads/master 4e3f4b934 - bab892328 [SPARK-9426] [WEBUI] Job page DAG visualization is not shown To reproduce the issue, go to the stage page and click DAG Visualization once, then go to the job page to show the job DAG visualization. You will only see the first stage of the job. Root cause: the java script use local storage to remember your selection. Once you click the stage DAG visualization, the local storage set `expand-dag-viz-arrow-stage` to true. When you go to the job page, the js checks `expand-dag-viz-arrow-stage` in the local storage first and will try to show stage DAG visualization on the job page. To fix this, I set an id to the DAG span to differ job page and stage page. In the js code, we check the id and local storage together to make sure we show the correct DAG visualization. Author: Carson Wang carson.w...@intel.com Closes #8104 from carsonwang/SPARK-9426. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bab89232 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bab89232 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bab89232 Branch: refs/heads/master Commit: bab89232854de7554e88f29cab76f1a1c349edc1 Parents: 4e3f4b9 Author: Carson Wang carson.w...@intel.com Authored: Tue Aug 11 23:25:02 2015 -0700 Committer: Andrew Or and...@databricks.com Committed: Tue Aug 11 23:25:02 2015 -0700 -- .../resources/org/apache/spark/ui/static/spark-dag-viz.js| 8 core/src/main/scala/org/apache/spark/ui/UIUtils.scala| 3 ++- 2 files changed, 6 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/bab89232/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js -- diff --git a/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js b/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js index 4a893bc..83dbea4 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js +++ b/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js @@ -109,13 +109,13 @@ function toggleDagViz(forJob) { } $(function (){ - if (window.localStorage.getItem(expandDagVizArrowKey(false)) == true) { + if ($(#stage-dag-viz).length + window.localStorage.getItem(expandDagVizArrowKey(false)) == true) { // Set it to false so that the click function can revert it window.localStorage.setItem(expandDagVizArrowKey(false), false); toggleDagViz(false); - } - - if (window.localStorage.getItem(expandDagVizArrowKey(true)) == true) { + } else if ($(#job-dag-viz).length + window.localStorage.getItem(expandDagVizArrowKey(true)) == true) { // Set it to false so that the click function can revert it window.localStorage.setItem(expandDagVizArrowKey(true), false); toggleDagViz(true); http://git-wip-us.apache.org/repos/asf/spark/blob/bab89232/core/src/main/scala/org/apache/spark/ui/UIUtils.scala -- diff --git a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala index 718aea7..f2da417 100644 --- a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala +++ b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala @@ -352,7 +352,8 @@ private[spark] object UIUtils extends Logging { */ private def showDagViz(graphs: Seq[RDDOperationGraph], forJob: Boolean): Seq[Node] = { div - span class=expand-dag-viz onclick={stoggleDagViz($forJob);} + span id={if (forJob) job-dag-viz else stage-dag-viz} +class=expand-dag-viz onclick={stoggleDagViz($forJob);} span 
class="expand-dag-viz-arrow arrow-closed"></span> <a data-toggle="tooltip" title={if (forJob) ToolTips.JOB_DAG else ToolTips.STAGE_DAG} data-placement="right">
spark git commit: [SPARK-9426] [WEBUI] Job page DAG visualization is not shown
Repository: spark Updated Branches: refs/heads/branch-1.5 d9d4bdea2 - 93fc95934 [SPARK-9426] [WEBUI] Job page DAG visualization is not shown To reproduce the issue, go to the stage page and click DAG Visualization once, then go to the job page to show the job DAG visualization. You will only see the first stage of the job. Root cause: the JavaScript uses local storage to remember your selection. Once you click the stage DAG visualization, the local storage key `expand-dag-viz-arrow-stage` is set to true. When you go to the job page, the JS checks `expand-dag-viz-arrow-stage` in local storage first and tries to show the stage DAG visualization on the job page. To fix this, I set an id on the DAG span that distinguishes the job page from the stage page. In the JS code, we check the id and local storage together to make sure we show the correct DAG visualization. Author: Carson Wang carson.w...@intel.com Closes #8104 from carsonwang/SPARK-9426. (cherry picked from commit bab89232854de7554e88f29cab76f1a1c349edc1) Signed-off-by: Andrew Or and...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/93fc9593 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/93fc9593 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/93fc9593 Branch: refs/heads/branch-1.5 Commit: 93fc95934f233a3cfdf744160242092dadeaec8b Parents: d9d4bde Author: Carson Wang carson.w...@intel.com Authored: Tue Aug 11 23:25:02 2015 -0700 Committer: Andrew Or and...@databricks.com Committed: Tue Aug 11 23:25:27 2015 -0700 -- .../resources/org/apache/spark/ui/static/spark-dag-viz.js| 8 ++++---- core/src/main/scala/org/apache/spark/ui/UIUtils.scala| 3 ++- 2 files changed, 6 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/93fc9593/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js -- diff --git a/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js b/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js index 4a893bc..83dbea4 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js +++ b/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js @@ -109,13 +109,13 @@ function toggleDagViz(forJob) { }

 $(function (){
-  if (window.localStorage.getItem(expandDagVizArrowKey(false)) == "true") {
+  if ($("#stage-dag-viz").length &&
+      window.localStorage.getItem(expandDagVizArrowKey(false)) == "true") {
     // Set it to false so that the click function can revert it
     window.localStorage.setItem(expandDagVizArrowKey(false), "false");
     toggleDagViz(false);
-  }
-
-  if (window.localStorage.getItem(expandDagVizArrowKey(true)) == "true") {
+  } else if ($("#job-dag-viz").length &&
+      window.localStorage.getItem(expandDagVizArrowKey(true)) == "true") {
     // Set it to false so that the click function can revert it
     window.localStorage.setItem(expandDagVizArrowKey(true), "false");
     toggleDagViz(true);

http://git-wip-us.apache.org/repos/asf/spark/blob/93fc9593/core/src/main/scala/org/apache/spark/ui/UIUtils.scala -- diff --git a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala index 718aea7..f2da417 100644 --- a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala +++ b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala @@ -352,7 +352,8 @@ private[spark] object UIUtils extends Logging { */ private def showDagViz(graphs: Seq[RDDOperationGraph], forJob: Boolean): Seq[Node] = {
     <div>
-      <span class="expand-dag-viz" onclick={s"toggleDagViz($forJob);"}>
+      <span id={if (forJob) "job-dag-viz" else "stage-dag-viz"}
+            class="expand-dag-viz" onclick={s"toggleDagViz($forJob);"}>
         <span class="expand-dag-viz-arrow arrow-closed"></span>
         <a data-toggle="tooltip" title={if (forJob) ToolTips.JOB_DAG else ToolTips.STAGE_DAG}
            data-placement="right">
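The fix is easier to see as a single guard: auto-expand only when the page actually contains the matching toggle element and the stored flag is set. A minimal Scala sketch of that decision (the helper parameters are hypothetical stand-ins for jQuery's $("#id").length test and window.localStorage.getItem in the real spark-dag-viz.js):

    // Minimal sketch, not the real JavaScript: pageHasElement and storedFlag
    // stand in for $("#id").length and window.localStorage.getItem.
    def shouldAutoExpand(
        forJob: Boolean,
        pageHasElement: String => Boolean,
        storedFlag: Boolean => Boolean): Boolean = {
      val pageId = if (forJob) "job-dag-viz" else "stage-dag-viz"
      // Checking the flag alone (the old behavior) expanded the stage DAG on
      // the job page; also requiring the page-specific id fixes SPARK-9426.
      pageHasElement(pageId) && storedFlag(forJob)
    }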
spark git commit: [SPARK-9795] Dynamic allocation: avoid double counting when killing same executor twice
Repository: spark Updated Branches: refs/heads/branch-1.5 0579f28df - bc4ac65d4 [SPARK-9795] Dynamic allocation: avoid double counting when killing same executor twice This is based on KaiXinXiaoLei's changes in #7716. The issue is that when someone calls `sc.killExecutor(1)` on the same executor twice quickly, then the executor target will be adjusted downwards by 2 instead of 1 even though we're only actually killing one executor. In certain cases where we don't adjust the target back upwards quickly, we'll end up with jobs hanging. This is a common danger because there are many places where this is called: - `HeartbeatReceiver` kills an executor that has not been sending heartbeats - `ExecutorAllocationManager` kills an executor that has been idle - The user code might call this, which may interfere with the previous callers While it's not clear whether this fixes SPARK-9745, fixing this potential race condition seems like a strict improvement. I've added a regression test to illustrate the issue. Author: Andrew Or and...@databricks.com Closes #8078 from andrewor14/da-double-kill. (cherry picked from commit be5d1912076c2ffd21ec88611e53d3b3c59b7ecc) Signed-off-by: Andrew Or and...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bc4ac65d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bc4ac65d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bc4ac65d Branch: refs/heads/branch-1.5 Commit: bc4ac65d4c0fed93c70582fc74574c5b70aa842d Parents: 0579f28 Author: Andrew Or and...@databricks.com Authored: Wed Aug 12 09:24:50 2015 -0700 Committer: Andrew Or and...@databricks.com Committed: Wed Aug 12 09:24:58 2015 -0700 -- .../cluster/CoarseGrainedSchedulerBackend.scala | 11 +++ .../StandaloneDynamicAllocationSuite.scala | 20 2 files changed, 27 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/bc4ac65d/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala index 6acf8a9..5730a87 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala @@ -422,16 +422,19 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp logWarning(sExecutor to kill $id does not exist!) } +// If an executor is already pending to be removed, do not kill it again (SPARK-9795) +val executorsToKill = knownExecutors.filter { id = !executorsPendingToRemove.contains(id) } +executorsPendingToRemove ++= executorsToKill + // If we do not wish to replace the executors we kill, sync the target number of executors // with the cluster manager to avoid allocating new ones. When computing the new target, // take into account executors that are pending to be added or removed. 
    if (!replace) {
-      doRequestTotalExecutors(numExistingExecutors + numPendingExecutors
-        - executorsPendingToRemove.size - knownExecutors.size)
+      doRequestTotalExecutors(
+        numExistingExecutors + numPendingExecutors - executorsPendingToRemove.size)
    }

-    executorsPendingToRemove ++= knownExecutors
-    doKillExecutors(knownExecutors)
+    doKillExecutors(executorsToKill)
  }

  /**

http://git-wip-us.apache.org/repos/asf/spark/blob/bc4ac65d/core/src/test/scala/org/apache/spark/deploy/StandaloneDynamicAllocationSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/deploy/StandaloneDynamicAllocationSuite.scala b/core/src/test/scala/org/apache/spark/deploy/StandaloneDynamicAllocationSuite.scala index 08c41a8..1f2a0f0 100644 --- a/core/src/test/scala/org/apache/spark/deploy/StandaloneDynamicAllocationSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/StandaloneDynamicAllocationSuite.scala @@ -283,6 +283,26 @@ class StandaloneDynamicAllocationSuite
     assert(master.apps.head.getExecutorLimit === 1000)
   }

+  test("kill the same executor twice (SPARK-9795)") {
+    sc = new SparkContext(appConf)
+    val appId = sc.applicationId
+    assert(master.apps.size === 1)
+    assert(master.apps.head.id === appId)
+    assert(master.apps.head.executors.size === 2)
+    assert(master.apps.head.getExecutorLimit === Int.MaxValue)
+    // sync executors between the Master and the driver, needed because
+    // the driver refuses to
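The arithmetic behind the fix is easier to follow in isolation. A stripped-down sketch, with a mutable set and a plain counter standing in for CoarseGrainedSchedulerBackend's real state (the real code only adjusts the target when `replace` is false):

    import scala.collection.mutable

    object KillGuardSketch {
      private val executorsPendingToRemove = mutable.HashSet.empty[String]
      private var targetNumExecutors = 10

      def killExecutors(knownExecutors: Seq[String]): Unit = {
        // Skip executors already pending removal, so killing the same id twice
        // lowers the target by one, not two (SPARK-9795).
        val executorsToKill = knownExecutors.filterNot(executorsPendingToRemove.contains)
        executorsPendingToRemove ++= executorsToKill
        targetNumExecutors -= executorsToKill.size
      }
    }

Under this guard, two rapid calls of killExecutors(Seq("1")) leave the target at 9 instead of 8, which is the scenario the new regression test pins down.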
spark git commit: [SPARK-7583] [MLLIB] User guide update for RegexTokenizer
Repository: spark Updated Branches: refs/heads/branch-1.5 bc4ac65d4 - 2d86faddd [SPARK-7583] [MLLIB] User guide update for RegexTokenizer jira: https://issues.apache.org/jira/browse/SPARK-7583 User guide update for RegexTokenizer Author: Yuhao Yang hhb...@gmail.com Closes #7828 from hhbyyh/regexTokenizerDoc. (cherry picked from commit 66d87c1d76bea2b81993156ac1fa7dad6c312ebf) Signed-off-by: Xiangrui Meng m...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2d86fadd Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2d86fadd Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2d86fadd Branch: refs/heads/branch-1.5 Commit: 2d86faddd87b6e61565cbdf18dadaf4aeb2b223e Parents: bc4ac65 Author: Yuhao Yang hhb...@gmail.com Authored: Wed Aug 12 09:35:32 2015 -0700 Committer: Xiangrui Meng m...@databricks.com Committed: Wed Aug 12 09:35:41 2015 -0700 -- docs/ml-features.md | 41 ++--- 1 file changed, 30 insertions(+), 11 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2d86fadd/docs/ml-features.md -- diff --git a/docs/ml-features.md b/docs/ml-features.md index fa0ad1f..cec2cbe 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -217,21 +217,32 @@ for feature in result.select(result).take(3): [Tokenization](http://en.wikipedia.org/wiki/Lexical_analysis#Tokenization) is the process of taking text (such as a sentence) and breaking it into individual terms (usually words). A simple [Tokenizer](api/scala/index.html#org.apache.spark.ml.feature.Tokenizer) class provides this functionality. The example below shows how to split sentences into sequences of words. -Note: A more advanced tokenizer is provided via [RegexTokenizer](api/scala/index.html#org.apache.spark.ml.feature.RegexTokenizer). +[RegexTokenizer](api/scala/index.html#org.apache.spark.ml.feature.RegexTokenizer) allows more + advanced tokenization based on regular expression (regex) matching. + By default, the parameter pattern (regex, default: \\s+) is used as delimiters to split the input text. + Alternatively, users can set parameter gaps to false indicating the regex pattern denotes + tokens rather than splitting gaps, and find all matching occurrences as the tokenization result. 
div class=codetabs div data-lang=scala markdown=1 {% highlight scala %} -import org.apache.spark.ml.feature.Tokenizer +import org.apache.spark.ml.feature.{Tokenizer, RegexTokenizer} val sentenceDataFrame = sqlContext.createDataFrame(Seq( (0, Hi I heard about Spark), - (0, I wish Java could use case classes), - (1, Logistic regression models are neat) + (1, I wish Java could use case classes), + (2, Logistic,regression,models,are,neat) )).toDF(label, sentence) val tokenizer = new Tokenizer().setInputCol(sentence).setOutputCol(words) -val wordsDataFrame = tokenizer.transform(sentenceDataFrame) -wordsDataFrame.select(words, label).take(3).foreach(println) +val regexTokenizer = new RegexTokenizer() + .setInputCol(sentence) + .setOutputCol(words) + .setPattern(\\W) // alternatively .setPattern(\\w+).setGaps(false) + +val tokenized = tokenizer.transform(sentenceDataFrame) +tokenized.select(words, label).take(3).foreach(println) +val regexTokenized = regexTokenizer.transform(sentenceDataFrame) +regexTokenized.select(words, label).take(3).foreach(println) {% endhighlight %} /div @@ -240,6 +251,7 @@ wordsDataFrame.select(words, label).take(3).foreach(println) import com.google.common.collect.Lists; import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.ml.feature.RegexTokenizer; import org.apache.spark.ml.feature.Tokenizer; import org.apache.spark.mllib.linalg.Vector; import org.apache.spark.sql.DataFrame; @@ -252,8 +264,8 @@ import org.apache.spark.sql.types.StructType; JavaRDDRow jrdd = jsc.parallelize(Lists.newArrayList( RowFactory.create(0, Hi I heard about Spark), - RowFactory.create(0, I wish Java could use case classes), - RowFactory.create(1, Logistic regression models are neat) + RowFactory.create(1, I wish Java could use case classes), + RowFactory.create(2, Logistic,regression,models,are,neat) )); StructType schema = new StructType(new StructField[]{ new StructField(label, DataTypes.DoubleType, false, Metadata.empty()), @@ -267,22 +279,29 @@ for (Row r : wordsDataFrame.select(words, label).take(3)) { for (String word : words) System.out.print(word + ); System.out.println(); } + +RegexTokenizer regexTokenizer = new RegexTokenizer() + .setInputCol(sentence) + .setOutputCol(words) + .setPattern(\\W); // alternatively .setPattern(\\w+).setGaps(false); {% endhighlight %} /div div data-lang=python markdown=1 {% highlight
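Condensing the two RegexTokenizer modes the updated guide describes, in the same 1.5-era API as the snippets above:

    import org.apache.spark.ml.feature.RegexTokenizer

    // gaps = true (the default): the pattern matches the separators between tokens.
    val splitOnNonWord = new RegexTokenizer()
      .setInputCol("sentence")
      .setOutputCol("words")
      .setPattern("\\W")

    // gaps = false: the pattern matches the tokens themselves, and every match
    // becomes one output token.
    val matchWordRuns = new RegexTokenizer()
      .setInputCol("sentence")
      .setOutputCol("words")
      .setPattern("\\w+")
      .setGaps(false)

On the example row Logistic,regression,models,are,neat, the whitespace-based Tokenizer produces a single token, while either configuration above yields five.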
spark git commit: [SPARK-7583] [MLLIB] User guide update for RegexTokenizer
Repository: spark Updated Branches: refs/heads/master be5d19120 - 66d87c1d7 [SPARK-7583] [MLLIB] User guide update for RegexTokenizer jira: https://issues.apache.org/jira/browse/SPARK-7583 User guide update for RegexTokenizer Author: Yuhao Yang hhb...@gmail.com Closes #7828 from hhbyyh/regexTokenizerDoc. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/66d87c1d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/66d87c1d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/66d87c1d Branch: refs/heads/master Commit: 66d87c1d76bea2b81993156ac1fa7dad6c312ebf Parents: be5d191 Author: Yuhao Yang hhb...@gmail.com Authored: Wed Aug 12 09:35:32 2015 -0700 Committer: Xiangrui Meng m...@databricks.com Committed: Wed Aug 12 09:35:32 2015 -0700 -- docs/ml-features.md | 41 ++--- 1 file changed, 30 insertions(+), 11 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/66d87c1d/docs/ml-features.md -- diff --git a/docs/ml-features.md b/docs/ml-features.md index fa0ad1f..cec2cbe 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -217,21 +217,32 @@ for feature in result.select(result).take(3): [Tokenization](http://en.wikipedia.org/wiki/Lexical_analysis#Tokenization) is the process of taking text (such as a sentence) and breaking it into individual terms (usually words). A simple [Tokenizer](api/scala/index.html#org.apache.spark.ml.feature.Tokenizer) class provides this functionality. The example below shows how to split sentences into sequences of words. -Note: A more advanced tokenizer is provided via [RegexTokenizer](api/scala/index.html#org.apache.spark.ml.feature.RegexTokenizer). +[RegexTokenizer](api/scala/index.html#org.apache.spark.ml.feature.RegexTokenizer) allows more + advanced tokenization based on regular expression (regex) matching. + By default, the parameter pattern (regex, default: \\s+) is used as delimiters to split the input text. + Alternatively, users can set parameter gaps to false indicating the regex pattern denotes + tokens rather than splitting gaps, and find all matching occurrences as the tokenization result. 
div class=codetabs div data-lang=scala markdown=1 {% highlight scala %} -import org.apache.spark.ml.feature.Tokenizer +import org.apache.spark.ml.feature.{Tokenizer, RegexTokenizer} val sentenceDataFrame = sqlContext.createDataFrame(Seq( (0, Hi I heard about Spark), - (0, I wish Java could use case classes), - (1, Logistic regression models are neat) + (1, I wish Java could use case classes), + (2, Logistic,regression,models,are,neat) )).toDF(label, sentence) val tokenizer = new Tokenizer().setInputCol(sentence).setOutputCol(words) -val wordsDataFrame = tokenizer.transform(sentenceDataFrame) -wordsDataFrame.select(words, label).take(3).foreach(println) +val regexTokenizer = new RegexTokenizer() + .setInputCol(sentence) + .setOutputCol(words) + .setPattern(\\W) // alternatively .setPattern(\\w+).setGaps(false) + +val tokenized = tokenizer.transform(sentenceDataFrame) +tokenized.select(words, label).take(3).foreach(println) +val regexTokenized = regexTokenizer.transform(sentenceDataFrame) +regexTokenized.select(words, label).take(3).foreach(println) {% endhighlight %} /div @@ -240,6 +251,7 @@ wordsDataFrame.select(words, label).take(3).foreach(println) import com.google.common.collect.Lists; import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.ml.feature.RegexTokenizer; import org.apache.spark.ml.feature.Tokenizer; import org.apache.spark.mllib.linalg.Vector; import org.apache.spark.sql.DataFrame; @@ -252,8 +264,8 @@ import org.apache.spark.sql.types.StructType; JavaRDDRow jrdd = jsc.parallelize(Lists.newArrayList( RowFactory.create(0, Hi I heard about Spark), - RowFactory.create(0, I wish Java could use case classes), - RowFactory.create(1, Logistic regression models are neat) + RowFactory.create(1, I wish Java could use case classes), + RowFactory.create(2, Logistic,regression,models,are,neat) )); StructType schema = new StructType(new StructField[]{ new StructField(label, DataTypes.DoubleType, false, Metadata.empty()), @@ -267,22 +279,29 @@ for (Row r : wordsDataFrame.select(words, label).take(3)) { for (String word : words) System.out.print(word + ); System.out.println(); } + +RegexTokenizer regexTokenizer = new RegexTokenizer() + .setInputCol(sentence) + .setOutputCol(words) + .setPattern(\\W); // alternatively .setPattern(\\w+).setGaps(false); {% endhighlight %} /div div data-lang=python markdown=1 {% highlight python %} -from pyspark.ml.feature import Tokenizer +from pyspark.ml.feature import Tokenizer, RegexTokenizer sentenceDataFrame
spark git commit: [SPARK-9747] [SQL] Avoid starving an unsafe operator in aggregation
Repository: spark Updated Branches: refs/heads/branch-1.5 2d86faddd - 4c6b1296d [SPARK-9747] [SQL] Avoid starving an unsafe operator in aggregation This is the sister patch to #8011, but for aggregation. In a nutshell: create the `TungstenAggregationIterator` before computing the parent partition. Internally this creates a `BytesToBytesMap` which acquires a page in the constructor as of this patch. This ensures that the aggregation operator is not starved since we reserve at least 1 page in advance. rxin yhuai Author: Andrew Or and...@databricks.com Closes #8038 from andrewor14/unsafe-starve-memory-agg. (cherry picked from commit e0110792ef71ebfd3727b970346a2e13695990a4) Signed-off-by: Reynold Xin r...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4c6b1296 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4c6b1296 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4c6b1296 Branch: refs/heads/branch-1.5 Commit: 4c6b1296d20f594f71e63b0772b5290ef21ddd21 Parents: 2d86fad Author: Andrew Or and...@databricks.com Authored: Wed Aug 12 10:08:35 2015 -0700 Committer: Reynold Xin r...@databricks.com Committed: Wed Aug 12 10:08:47 2015 -0700 -- .../spark/unsafe/map/BytesToBytesMap.java | 34 ++-- .../unsafe/sort/UnsafeExternalSorter.java | 9 +- .../map/AbstractBytesToBytesMapSuite.java | 11 ++- .../UnsafeFixedWidthAggregationMap.java | 7 ++ .../execution/aggregate/TungstenAggregate.scala | 72 ++-- .../aggregate/TungstenAggregationIterator.scala | 88 .../TungstenAggregationIteratorSuite.scala | 56 + 7 files changed, 201 insertions(+), 76 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4c6b1296/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java -- diff --git a/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java b/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java index 85b46ec..87ed47e 100644 --- a/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java +++ b/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java @@ -193,6 +193,11 @@ public final class BytesToBytesMap { TaskMemoryManager.MAXIMUM_PAGE_SIZE_BYTES); } allocate(initialCapacity); + +// Acquire a new page as soon as we construct the map to ensure that we have at least +// one page to work with. Otherwise, other operators in the same task may starve this +// map (SPARK-9747). +acquireNewPage(); } public BytesToBytesMap( @@ -574,16 +579,9 @@ public final class BytesToBytesMap { final long lengthOffsetInPage = currentDataPage.getBaseOffset() + pageCursor; Platform.putInt(pageBaseObject, lengthOffsetInPage, END_OF_PAGE_MARKER); } -final long memoryGranted = shuffleMemoryManager.tryToAcquire(pageSizeBytes); -if (memoryGranted != pageSizeBytes) { - shuffleMemoryManager.release(memoryGranted); - logger.debug(Failed to acquire {} bytes of memory, pageSizeBytes); +if (!acquireNewPage()) { return false; } -MemoryBlock newPage = taskMemoryManager.allocatePage(pageSizeBytes); -dataPages.add(newPage); -pageCursor = 0; -currentDataPage = newPage; dataPage = currentDataPage; dataPageBaseObject = currentDataPage.getBaseObject(); dataPageInsertOffset = currentDataPage.getBaseOffset(); @@ -643,6 +641,24 @@ public final class BytesToBytesMap { } /** + * Acquire a new page from the {@link ShuffleMemoryManager}. + * @return whether there is enough space to allocate the new page. 
+   */
+  private boolean acquireNewPage() {
+    final long memoryGranted = shuffleMemoryManager.tryToAcquire(pageSizeBytes);
+    if (memoryGranted != pageSizeBytes) {
+      shuffleMemoryManager.release(memoryGranted);
+      logger.debug("Failed to acquire {} bytes of memory", pageSizeBytes);
+      return false;
+    }
+    MemoryBlock newPage = taskMemoryManager.allocatePage(pageSizeBytes);
+    dataPages.add(newPage);
+    pageCursor = 0;
+    currentDataPage = newPage;
+    return true;
+  }
+
+  /**
    * Allocate new data structures for this map. When calling this outside of the constructor,
    * make sure to keep references to the old data structures so that you can free them.
    *
@@ -748,7 +764,7 @@ public final class BytesToBytesMap {
   }

   @VisibleForTesting
-  int getNumDataPages() {
+  public int getNumDataPages() {
     return dataPages.size();
   }
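The pattern is more general than BytesToBytesMap. A sketch under simplified assumptions (PageMemoryManager is a stand-in trait, not Spark's ShuffleMemoryManager, and pages are merely counted rather than allocated):

    // Stand-in for ShuffleMemoryManager: may grant fewer bytes than requested.
    trait PageMemoryManager {
      def tryToAcquire(bytes: Long): Long
      def release(bytes: Long): Unit
    }

    // Reserve one page at construction time so sibling operators in the same
    // task cannot starve this one later (the SPARK-9747 pattern).
    class StarvationSafeMap(mm: PageMemoryManager, pageSizeBytes: Long) {
      private var pagesHeld = 0

      require(acquireNewPage(), "could not reserve the initial page")

      private def acquireNewPage(): Boolean = {
        val granted = mm.tryToAcquire(pageSizeBytes)
        if (granted != pageSizeBytes) {
          mm.release(granted) // give back any partial grant
          false
        } else {
          pagesHeld += 1
          true
        }
      }
    }

Failing in the constructor surfaces the shortage before any partial aggregation state is built, rather than midway through consuming the parent partition.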
spark git commit: [SPARK-9747] [SQL] Avoid starving an unsafe operator in aggregation
Repository: spark Updated Branches: refs/heads/master 66d87c1d7 - e0110792e [SPARK-9747] [SQL] Avoid starving an unsafe operator in aggregation This is the sister patch to #8011, but for aggregation. In a nutshell: create the `TungstenAggregationIterator` before computing the parent partition. Internally this creates a `BytesToBytesMap` which acquires a page in the constructor as of this patch. This ensures that the aggregation operator is not starved since we reserve at least 1 page in advance. rxin yhuai Author: Andrew Or and...@databricks.com Closes #8038 from andrewor14/unsafe-starve-memory-agg. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e0110792 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e0110792 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e0110792 Branch: refs/heads/master Commit: e0110792ef71ebfd3727b970346a2e13695990a4 Parents: 66d87c1 Author: Andrew Or and...@databricks.com Authored: Wed Aug 12 10:08:35 2015 -0700 Committer: Reynold Xin r...@databricks.com Committed: Wed Aug 12 10:08:35 2015 -0700 -- .../spark/unsafe/map/BytesToBytesMap.java | 34 ++-- .../unsafe/sort/UnsafeExternalSorter.java | 9 +- .../map/AbstractBytesToBytesMapSuite.java | 11 ++- .../UnsafeFixedWidthAggregationMap.java | 7 ++ .../execution/aggregate/TungstenAggregate.scala | 72 ++-- .../aggregate/TungstenAggregationIterator.scala | 88 .../TungstenAggregationIteratorSuite.scala | 56 + 7 files changed, 201 insertions(+), 76 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e0110792/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java -- diff --git a/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java b/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java index 85b46ec..87ed47e 100644 --- a/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java +++ b/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java @@ -193,6 +193,11 @@ public final class BytesToBytesMap { TaskMemoryManager.MAXIMUM_PAGE_SIZE_BYTES); } allocate(initialCapacity); + +// Acquire a new page as soon as we construct the map to ensure that we have at least +// one page to work with. Otherwise, other operators in the same task may starve this +// map (SPARK-9747). +acquireNewPage(); } public BytesToBytesMap( @@ -574,16 +579,9 @@ public final class BytesToBytesMap { final long lengthOffsetInPage = currentDataPage.getBaseOffset() + pageCursor; Platform.putInt(pageBaseObject, lengthOffsetInPage, END_OF_PAGE_MARKER); } -final long memoryGranted = shuffleMemoryManager.tryToAcquire(pageSizeBytes); -if (memoryGranted != pageSizeBytes) { - shuffleMemoryManager.release(memoryGranted); - logger.debug(Failed to acquire {} bytes of memory, pageSizeBytes); +if (!acquireNewPage()) { return false; } -MemoryBlock newPage = taskMemoryManager.allocatePage(pageSizeBytes); -dataPages.add(newPage); -pageCursor = 0; -currentDataPage = newPage; dataPage = currentDataPage; dataPageBaseObject = currentDataPage.getBaseObject(); dataPageInsertOffset = currentDataPage.getBaseOffset(); @@ -643,6 +641,24 @@ public final class BytesToBytesMap { } /** + * Acquire a new page from the {@link ShuffleMemoryManager}. + * @return whether there is enough space to allocate the new page. 
+   */
+  private boolean acquireNewPage() {
+    final long memoryGranted = shuffleMemoryManager.tryToAcquire(pageSizeBytes);
+    if (memoryGranted != pageSizeBytes) {
+      shuffleMemoryManager.release(memoryGranted);
+      logger.debug("Failed to acquire {} bytes of memory", pageSizeBytes);
+      return false;
+    }
+    MemoryBlock newPage = taskMemoryManager.allocatePage(pageSizeBytes);
+    dataPages.add(newPage);
+    pageCursor = 0;
+    currentDataPage = newPage;
+    return true;
+  }
+
+  /**
    * Allocate new data structures for this map. When calling this outside of the constructor,
    * make sure to keep references to the old data structures so that you can free them.
    *
@@ -748,7 +764,7 @@ public final class BytesToBytesMap {
   }

   @VisibleForTesting
-  int getNumDataPages() {
+  public int getNumDataPages() {
     return dataPages.size();
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/e0110792/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java -- diff --git
spark git commit: [SPARK-9407] [SQL] Relaxes Parquet ValidTypeMap to allow ENUM predicates to be pushed down
Repository: spark Updated Branches: refs/heads/master 9d0822455 - 3ecb37943 [SPARK-9407] [SQL] Relaxes Parquet ValidTypeMap to allow ENUM predicates to be pushed down This PR adds a hacky workaround for PARQUET-201, and should be removed once we upgrade to parquet-mr 1.8.1 or higher versions. In Parquet, not all types of columns can be used for filter push-down optimization. The set of valid column types is controlled by `ValidTypeMap`. Unfortunately, in parquet-mr 1.7.0 and prior versions, this limitation is too strict, and doesn't allow `BINARY (ENUM)` columns to be pushed down. On the other hand, `BINARY (ENUM)` is commonly seen in Parquet files written by libraries like `parquet-avro`. This restriction is problematic for Spark SQL, because Spark SQL doesn't have a type that maps to Parquet `BINARY (ENUM)` directly, and always converts `BINARY (ENUM)` to Catalyst `StringType`. Thus, a predicate involving a `BINARY (ENUM)` is recognized as one involving a string field instead and can be pushed down by the query optimizer. Such predicates are actually perfectly legal except that it fails the `ValidTypeMap` check. The workaround added here is relaxing `ValidTypeMap` to include `BINARY (ENUM)`. I also took the chance to simplify `ParquetCompatibilityTest` a little bit when adding regression test. Author: Cheng Lian l...@databricks.com Closes #8107 from liancheng/spark-9407/parquet-enum-filter-push-down. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3ecb3794 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3ecb3794 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3ecb3794 Branch: refs/heads/master Commit: 3ecb3794302dc12d0989f8d725483b2cc37762cf Parents: 9d08224 Author: Cheng Lian l...@databricks.com Authored: Wed Aug 12 20:01:34 2015 +0800 Committer: Cheng Lian l...@databricks.com Committed: Wed Aug 12 20:01:34 2015 +0800 -- .../datasources/parquet/ParquetFilters.scala| 38 - .../datasources/parquet/ParquetRelation.scala | 2 +- sql/core/src/test/README.md | 16 +-- sql/core/src/test/avro/parquet-compat.avdl | 13 +- sql/core/src/test/avro/parquet-compat.avpr | 13 +- .../parquet/test/avro/CompatibilityTest.java| 2 +- .../datasources/parquet/test/avro/Nested.java | 4 +- .../parquet/test/avro/ParquetAvroCompat.java| 4 +- .../parquet/test/avro/ParquetEnum.java | 142 +++ .../datasources/parquet/test/avro/Suit.java | 13 ++ .../parquet/ParquetAvroCompatibilitySuite.scala | 105 -- .../parquet/ParquetCompatibilityTest.scala | 33 ++--- sql/core/src/test/scripts/gen-avro.sh | 30 sql/core/src/test/scripts/gen-code.sh | 31 sql/core/src/test/scripts/gen-thrift.sh | 27 sql/core/src/test/thrift/parquet-compat.thrift | 2 +- .../hive/ParquetHiveCompatibilitySuite.scala| 83 +-- 17 files changed, 398 insertions(+), 160 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3ecb3794/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala index 9e2e232..63915e0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala @@ -25,9 +25,10 @@ import org.apache.hadoop.conf.Configuration import 
org.apache.parquet.filter2.compat.FilterCompat
 import org.apache.parquet.filter2.compat.FilterCompat._
 import org.apache.parquet.filter2.predicate.FilterApi._
-import org.apache.parquet.filter2.predicate.{FilterApi, FilterPredicate, Statistics}
-import org.apache.parquet.filter2.predicate.UserDefinedPredicate
+import org.apache.parquet.filter2.predicate._
 import org.apache.parquet.io.api.Binary
+import org.apache.parquet.schema.OriginalType
+import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName

 import org.apache.spark.SparkEnv
 import org.apache.spark.sql.catalyst.expressions._
@@ -197,6 +198,8 @@ private[sql] object ParquetFilters {
   def createFilter(schema: StructType, predicate: sources.Filter): Option[FilterPredicate] = {
     val dataTypeOf = schema.map(f => f.name -> f.dataType).toMap

+    relaxParquetValidTypeMap
+
     // NOTE:
     //
     // For any comparison operator `cmp`, both `a cmp NULL` and `NULL cmp a` evaluate to `NULL`,
@@ -239,6 +242,37 @@ private[sql] object ParquetFilters {
       }
     }
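To make the effect concrete: suppose a Parquet file written by parquet-avro has an Avro enum column suit, stored as BINARY (ENUM) but read back by Spark SQL as a string. A hypothetical 1.5-era session (sqlContext as predefined in spark-shell; the path and column names are illustrative):

    // Hypothetical data written by parquet-avro; "suit" is an Avro enum column.
    val cards = sqlContext.read.parquet("/tmp/cards.parquet")

    // Before this patch the predicate failed Parquet's ValidTypeMap check for
    // BINARY (ENUM) columns and ran after the scan; now it can be pushed down.
    val spades = cards.filter(cards("suit") === "SPADES")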
spark git commit: [SPARK-9182] [SQL] Filters are not passed through to jdbc source
Repository: spark Updated Branches: refs/heads/master 741a29f98 - 9d0822455 [SPARK-9182] [SQL] Filters are not passed through to jdbc source This PR fixes unable to push filter down to JDBC source caused by `Cast` during pattern matching. While we are comparing columns of different type, there's a big chance we need a cast on the column, therefore not match the pattern directly on Attribute and would fail to push down. Author: Yijie Shen henry.yijies...@gmail.com Closes #8049 from yjshen/jdbc_pushdown. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9d082245 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9d082245 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9d082245 Branch: refs/heads/master Commit: 9d0822455ddc8d765440d58c463367a4d67ef456 Parents: 741a29f Author: Yijie Shen henry.yijies...@gmail.com Authored: Wed Aug 12 19:54:00 2015 +0800 Committer: Cheng Lian l...@databricks.com Committed: Wed Aug 12 19:54:00 2015 +0800 -- .../datasources/DataSourceStrategy.scala| 30 +++-- .../execution/datasources/jdbc/JDBCRDD.scala| 2 +- .../org/apache/spark/sql/jdbc/JDBCSuite.scala | 34 3 files changed, 63 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9d082245/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala index 2a4c40d..9eea2b0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala @@ -20,13 +20,13 @@ package org.apache.spark.sql.execution.datasources import org.apache.spark.{Logging, TaskContext} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.rdd.{MapPartitionsRDD, RDD, UnionRDD} -import org.apache.spark.sql.catalyst.{InternalRow, expressions} +import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow, expressions} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.planning.PhysicalOperation import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.sources._ -import org.apache.spark.sql.types.{StringType, StructType} +import org.apache.spark.sql.types.{TimestampType, DateType, StringType, StructType} import org.apache.spark.sql.{SaveMode, Strategy, execution, sources, _} import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.util.{SerializableConfiguration, Utils} @@ -343,11 +343,17 @@ private[sql] object DataSourceStrategy extends Strategy with Logging { * and convert them. 
   */
  protected[sql] def selectFilters(filters: Seq[Expression]) = {
+    import CatalystTypeConverters._
+
    def translate(predicate: Expression): Option[Filter] = predicate match {
      case expressions.EqualTo(a: Attribute, Literal(v, _)) =>
        Some(sources.EqualTo(a.name, v))
      case expressions.EqualTo(Literal(v, _), a: Attribute) =>
        Some(sources.EqualTo(a.name, v))
+      case expressions.EqualTo(Cast(a: Attribute, _), l: Literal) =>
+        Some(sources.EqualTo(a.name, convertToScala(Cast(l, a.dataType).eval(), a.dataType)))
+      case expressions.EqualTo(l: Literal, Cast(a: Attribute, _)) =>
+        Some(sources.EqualTo(a.name, convertToScala(Cast(l, a.dataType).eval(), a.dataType)))

      case expressions.EqualNullSafe(a: Attribute, Literal(v, _)) =>
        Some(sources.EqualNullSafe(a.name, v))
@@ -358,21 +364,41 @@ private[sql] object DataSourceStrategy extends Strategy with Logging {
        Some(sources.GreaterThan(a.name, v))
      case expressions.GreaterThan(Literal(v, _), a: Attribute) =>
        Some(sources.LessThan(a.name, v))
+      case expressions.GreaterThan(Cast(a: Attribute, _), l: Literal) =>
+        Some(sources.GreaterThan(a.name, convertToScala(Cast(l, a.dataType).eval(), a.dataType)))
+      case expressions.GreaterThan(l: Literal, Cast(a: Attribute, _)) =>
+        Some(sources.LessThan(a.name, convertToScala(Cast(l, a.dataType).eval(), a.dataType)))

      case expressions.LessThan(a: Attribute, Literal(v, _)) =>
        Some(sources.LessThan(a.name, v))
      case expressions.LessThan(Literal(v, _), a: Attribute) =>
        Some(sources.GreaterThan(a.name, v))
+      case expressions.LessThan(Cast(a: Attribute, _), l: Literal) =>
+        Some(sources.LessThan(a.name, convertToScala(Cast(l,
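A concrete query that used to lose its pushdown (the URL, table, and schema here are hypothetical): comparing an INT column with a Long literal makes the analyzer wrap the attribute in a Cast, which the old Attribute-only patterns could not see past:

    import java.util.Properties

    // Hypothetical JDBC source in a 1.5-era session (sqlContext as in spark-shell).
    val people = sqlContext.read.jdbc("jdbc:h2:mem:testdb", "PEOPLE", new Properties())

    // With "age" stored as INT, this plans as GreaterThan(Cast(age, LongType), 21).
    // The Cast-aware patterns above translate it to sources.GreaterThan("age", 21)
    // and hand it to the JDBC source instead of filtering after the fetch.
    val adults = people.filter(people("age") > 21L)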
spark git commit: [SPARK-9182] [SQL] Filters are not passed through to jdbc source
Repository: spark Updated Branches: refs/heads/branch-1.5 5dd0c5cd6 - 8e32db9a5 [SPARK-9182] [SQL] Filters are not passed through to jdbc source This PR fixes unable to push filter down to JDBC source caused by `Cast` during pattern matching. While we are comparing columns of different type, there's a big chance we need a cast on the column, therefore not match the pattern directly on Attribute and would fail to push down. Author: Yijie Shen henry.yijies...@gmail.com Closes #8049 from yjshen/jdbc_pushdown. (cherry picked from commit 9d0822455ddc8d765440d58c463367a4d67ef456) Signed-off-by: Cheng Lian l...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8e32db9a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8e32db9a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8e32db9a Branch: refs/heads/branch-1.5 Commit: 8e32db9a5b082f45d161ba1fc88732a6ba166ac1 Parents: 5dd0c5c Author: Yijie Shen henry.yijies...@gmail.com Authored: Wed Aug 12 19:54:00 2015 +0800 Committer: Cheng Lian l...@databricks.com Committed: Wed Aug 12 19:54:31 2015 +0800 -- .../datasources/DataSourceStrategy.scala| 30 +++-- .../execution/datasources/jdbc/JDBCRDD.scala| 2 +- .../org/apache/spark/sql/jdbc/JDBCSuite.scala | 34 3 files changed, 63 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8e32db9a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala index 2a4c40d..9eea2b0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala @@ -20,13 +20,13 @@ package org.apache.spark.sql.execution.datasources import org.apache.spark.{Logging, TaskContext} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.rdd.{MapPartitionsRDD, RDD, UnionRDD} -import org.apache.spark.sql.catalyst.{InternalRow, expressions} +import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow, expressions} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.planning.PhysicalOperation import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.sources._ -import org.apache.spark.sql.types.{StringType, StructType} +import org.apache.spark.sql.types.{TimestampType, DateType, StringType, StructType} import org.apache.spark.sql.{SaveMode, Strategy, execution, sources, _} import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.util.{SerializableConfiguration, Utils} @@ -343,11 +343,17 @@ private[sql] object DataSourceStrategy extends Strategy with Logging { * and convert them. 
   */
  protected[sql] def selectFilters(filters: Seq[Expression]) = {
+    import CatalystTypeConverters._
+
    def translate(predicate: Expression): Option[Filter] = predicate match {
      case expressions.EqualTo(a: Attribute, Literal(v, _)) =>
        Some(sources.EqualTo(a.name, v))
      case expressions.EqualTo(Literal(v, _), a: Attribute) =>
        Some(sources.EqualTo(a.name, v))
+      case expressions.EqualTo(Cast(a: Attribute, _), l: Literal) =>
+        Some(sources.EqualTo(a.name, convertToScala(Cast(l, a.dataType).eval(), a.dataType)))
+      case expressions.EqualTo(l: Literal, Cast(a: Attribute, _)) =>
+        Some(sources.EqualTo(a.name, convertToScala(Cast(l, a.dataType).eval(), a.dataType)))

      case expressions.EqualNullSafe(a: Attribute, Literal(v, _)) =>
        Some(sources.EqualNullSafe(a.name, v))
@@ -358,21 +364,41 @@ private[sql] object DataSourceStrategy extends Strategy with Logging {
        Some(sources.GreaterThan(a.name, v))
      case expressions.GreaterThan(Literal(v, _), a: Attribute) =>
        Some(sources.LessThan(a.name, v))
+      case expressions.GreaterThan(Cast(a: Attribute, _), l: Literal) =>
+        Some(sources.GreaterThan(a.name, convertToScala(Cast(l, a.dataType).eval(), a.dataType)))
+      case expressions.GreaterThan(l: Literal, Cast(a: Attribute, _)) =>
+        Some(sources.LessThan(a.name, convertToScala(Cast(l, a.dataType).eval(), a.dataType)))

      case expressions.LessThan(a: Attribute, Literal(v, _)) =>
        Some(sources.LessThan(a.name, v))
      case expressions.LessThan(Literal(v, _), a: Attribute) =>
        Some(sources.GreaterThan(a.name, v))
+      case
spark git commit: [SPARK-9407] [SQL] Relaxes Parquet ValidTypeMap to allow ENUM predicates to be pushed down
Repository: spark Updated Branches: refs/heads/branch-1.5 8e32db9a5 - 5e6fdc659 [SPARK-9407] [SQL] Relaxes Parquet ValidTypeMap to allow ENUM predicates to be pushed down This PR adds a hacky workaround for PARQUET-201, and should be removed once we upgrade to parquet-mr 1.8.1 or higher versions. In Parquet, not all types of columns can be used for filter push-down optimization. The set of valid column types is controlled by `ValidTypeMap`. Unfortunately, in parquet-mr 1.7.0 and prior versions, this limitation is too strict, and doesn't allow `BINARY (ENUM)` columns to be pushed down. On the other hand, `BINARY (ENUM)` is commonly seen in Parquet files written by libraries like `parquet-avro`. This restriction is problematic for Spark SQL, because Spark SQL doesn't have a type that maps to Parquet `BINARY (ENUM)` directly, and always converts `BINARY (ENUM)` to Catalyst `StringType`. Thus, a predicate involving a `BINARY (ENUM)` is recognized as one involving a string field instead and can be pushed down by the query optimizer. Such predicates are actually perfectly legal except that it fails the `ValidTypeMap` check. The workaround added here is relaxing `ValidTypeMap` to include `BINARY (ENUM)`. I also took the chance to simplify `ParquetCompatibilityTest` a little bit when adding regression test. Author: Cheng Lian l...@databricks.com Closes #8107 from liancheng/spark-9407/parquet-enum-filter-push-down. (cherry picked from commit 3ecb3794302dc12d0989f8d725483b2cc37762cf) Signed-off-by: Cheng Lian l...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5e6fdc65 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5e6fdc65 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5e6fdc65 Branch: refs/heads/branch-1.5 Commit: 5e6fdc659c634bd6496be117802c99f152a2b373 Parents: 8e32db9 Author: Cheng Lian l...@databricks.com Authored: Wed Aug 12 20:01:34 2015 +0800 Committer: Cheng Lian l...@databricks.com Committed: Wed Aug 12 20:05:45 2015 +0800 -- .../datasources/parquet/ParquetFilters.scala| 38 - .../datasources/parquet/ParquetRelation.scala | 2 +- sql/core/src/test/README.md | 16 +-- sql/core/src/test/avro/parquet-compat.avdl | 13 +- sql/core/src/test/avro/parquet-compat.avpr | 13 +- .../parquet/test/avro/CompatibilityTest.java| 2 +- .../datasources/parquet/test/avro/Nested.java | 4 +- .../parquet/test/avro/ParquetAvroCompat.java| 4 +- .../parquet/test/avro/ParquetEnum.java | 142 +++ .../datasources/parquet/test/avro/Suit.java | 13 ++ .../parquet/ParquetAvroCompatibilitySuite.scala | 105 -- .../parquet/ParquetCompatibilityTest.scala | 33 ++--- sql/core/src/test/scripts/gen-avro.sh | 30 sql/core/src/test/scripts/gen-code.sh | 31 sql/core/src/test/scripts/gen-thrift.sh | 27 sql/core/src/test/thrift/parquet-compat.thrift | 2 +- .../hive/ParquetHiveCompatibilitySuite.scala| 83 +-- 17 files changed, 398 insertions(+), 160 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5e6fdc65/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala index 9e2e232..63915e0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala +++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala @@ -25,9 +25,10 @@ import org.apache.hadoop.conf.Configuration import org.apache.parquet.filter2.compat.FilterCompat import org.apache.parquet.filter2.compat.FilterCompat._ import org.apache.parquet.filter2.predicate.FilterApi._ -import org.apache.parquet.filter2.predicate.{FilterApi, FilterPredicate, Statistics} -import org.apache.parquet.filter2.predicate.UserDefinedPredicate +import org.apache.parquet.filter2.predicate._ import org.apache.parquet.io.api.Binary +import org.apache.parquet.schema.OriginalType +import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName import org.apache.spark.SparkEnv import org.apache.spark.sql.catalyst.expressions._ @@ -197,6 +198,8 @@ private[sql] object ParquetFilters { def createFilter(schema: StructType, predicate: sources.Filter): Option[FilterPredicate] = { val dataTypeOf = schema.map(f = f.name - f.dataType).toMap +relaxParquetValidTypeMap + // NOTE: // // For any comparison operator `cmp`,
spark git commit: [SPARK-9847] [ML] Modified copyValues to distinguish between default, explicit param values
Repository: spark Updated Branches: refs/heads/master 57ec27dd7 - 70fe55886 [SPARK-9847] [ML] Modified copyValues to distinguish between default, explicit param values From JIRA: Currently, Params.copyValues copies default parameter values to the paramMap of the target instance, rather than the defaultParamMap. It should copy to the defaultParamMap because explicitly setting a parameter can change the semantics. This issue arose in SPARK-9789, where 2 params threshold and thresholds for LogisticRegression can have mutually exclusive values. If thresholds is set, then fit() will copy the default value of threshold as well, easily resulting in inconsistent settings for the 2 params. CC: mengxr Author: Joseph K. Bradley jos...@databricks.com Closes #8115 from jkbradley/copyvalues-fix. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/70fe5588 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/70fe5588 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/70fe5588 Branch: refs/heads/master Commit: 70fe558867ccb4bcff6ec673438b03608bb02252 Parents: 57ec27d Author: Joseph K. Bradley jos...@databricks.com Authored: Wed Aug 12 10:48:52 2015 -0700 Committer: Xiangrui Meng m...@databricks.com Committed: Wed Aug 12 10:48:52 2015 -0700 -- .../scala/org/apache/spark/ml/param/params.scala | 19 --- .../org/apache/spark/ml/param/ParamsSuite.scala | 8 2 files changed, 24 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/70fe5588/mllib/src/main/scala/org/apache/spark/ml/param/params.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala index d68f5ff..91c0a56 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala @@ -559,13 +559,26 @@ trait Params extends Identifiable with Serializable { /** * Copies param values from this instance to another instance for params shared by them. - * @param to the target instance - * @param extra extra params to be copied + * + * This handles default Params and explicitly set Params separately. + * Default Params are copied from and to [[defaultParamMap]], and explicitly set Params are + * copied from and to [[paramMap]]. + * Warning: This implicitly assumes that this [[Params]] instance and the target instance + * share the same set of default Params. 
+   *
+   * @param to the target instance, which should work with the same set of default Params as this
+   *           source instance
+   * @param extra extra params to be copied to the target's [[paramMap]]
    * @return the target instance with param values copied
    */
   protected def copyValues[T <: Params](to: T, extra: ParamMap = ParamMap.empty): T = {
-    val map = extractParamMap(extra)
+    val map = paramMap ++ extra
     params.foreach { param =>
+      // copy default Params
+      if (defaultParamMap.contains(param) && to.hasParam(param.name)) {
+        to.defaultParamMap.put(to.getParam(param.name), defaultParamMap(param))
+      }
+      // copy explicitly set Params
       if (map.contains(param) && to.hasParam(param.name)) {
         to.set(param.name, map(param))
       }

http://git-wip-us.apache.org/repos/asf/spark/blob/70fe5588/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala index 050d417..be95638 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala @@ -200,6 +200,14 @@ class ParamsSuite extends SparkFunSuite {
     val inArray = ParamValidators.inArray[Int](Array(1, 2))
     assert(inArray(1) && inArray(2) && !inArray(0))
   }
+
+  test("Params.copyValues") {
+    val t = new TestParams()
+    val t2 = t.copy(ParamMap.empty)
+    assert(!t2.isSet(t2.maxIter))
+    val t3 = t.copy(ParamMap(t.maxIter -> 20))
+    assert(t3.isSet(t3.maxIter))
+  }
 }

 object ParamsSuite extends SparkFunSuite {
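The new test doubles as the clearest statement of the contract; restated on its own (TestParams and maxIter are the suite's helpers, so this runs in ParamsSuite's scope):

    val t = new TestParams()

    // A default value must remain a default after copy...
    val t2 = t.copy(ParamMap.empty)
    assert(!t2.isSet(t2.maxIter))

    // ...and an explicitly set value must remain explicitly set.
    val t3 = t.copy(ParamMap(t.maxIter -> 20))
    assert(t3.isSet(t3.maxIter))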
spark git commit: [SPARK-9726] [PYTHON] PySpark DF join no longer accepts on=None
Repository: spark Updated Branches: refs/heads/branch-1.5 b515f890d - 8629c33b6 [SPARK-9726] [PYTHON] PySpark DF join no longer accepts on=None rxin First pull request for Spark, so let me know if I am missing anything. The contribution is my original work and I license the work to the project under the project's open source license. Author: Brennan Ashton bash...@brennanashton.com Closes #8016 from btashton/patch-1. (cherry picked from commit 60103ecd3d9c92709a5878be7ebd57012813ab48) Signed-off-by: Michael Armbrust mich...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8629c33b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8629c33b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8629c33b Branch: refs/heads/branch-1.5 Commit: 8629c33b616d4e3548b64f7681ebcd9c9c4195fd Parents: b515f89 Author: Brennan Ashton bash...@brennanashton.com Authored: Wed Aug 12 11:57:30 2015 -0700 Committer: Michael Armbrust mich...@databricks.com Committed: Wed Aug 12 11:57:40 2015 -0700 -- python/pyspark/sql/dataframe.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8629c33b/python/pyspark/sql/dataframe.py -- diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 47d5a6a..09647ff 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -566,8 +566,7 @@ class DataFrame(object):

         if on is None or len(on) == 0:
             jdf = self._jdf.join(other._jdf)
-
-        if isinstance(on[0], basestring):
+        elif isinstance(on[0], basestring):
             jdf = self._jdf.join(other._jdf, self._jseq(on))
         else:
             assert isinstance(on[0], Column), "on should be Column or list of Column"
spark git commit: [SPARK-9726] [PYTHON] PySpark DF join no longer accepts on=None
Repository: spark Updated Branches: refs/heads/master 70fe55886 - 60103ecd3 [SPARK-9726] [PYTHON] PySpark DF join no longer accepts on=None rxin First pull request for Spark, so let me know if I am missing anything. The contribution is my original work and I license the work to the project under the project's open source license. Author: Brennan Ashton bash...@brennanashton.com Closes #8016 from btashton/patch-1. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/60103ecd Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/60103ecd Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/60103ecd Branch: refs/heads/master Commit: 60103ecd3d9c92709a5878be7ebd57012813ab48 Parents: 70fe558 Author: Brennan Ashton bash...@brennanashton.com Authored: Wed Aug 12 11:57:30 2015 -0700 Committer: Michael Armbrust mich...@databricks.com Committed: Wed Aug 12 11:57:30 2015 -0700 -- python/pyspark/sql/dataframe.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/60103ecd/python/pyspark/sql/dataframe.py -- diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 47d5a6a..09647ff 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -566,8 +566,7 @@ class DataFrame(object):

         if on is None or len(on) == 0:
             jdf = self._jdf.join(other._jdf)
-
-        if isinstance(on[0], basestring):
+        elif isinstance(on[0], basestring):
            jdf = self._jdf.join(other._jdf, self._jseq(on))
         else:
             assert isinstance(on[0], Column), "on should be Column or list of Column"
spark git commit: [SPARK-9804] [HIVE] Use correct value for isSrcLocal parameter.
Repository: spark Updated Branches: refs/heads/branch-1.5 4c6b1296d - e9641f192 [SPARK-9804] [HIVE] Use correct value for isSrcLocal parameter. If the correct parameter is not provided, Hive will run into an error because it calls methods that are specific to the local filesystem to copy the data. Author: Marcelo Vanzin van...@cloudera.com Closes #8086 from vanzin/SPARK-9804. (cherry picked from commit 57ec27dd7784ce15a2ece8a6c8ac7bd5fd25aea2) Signed-off-by: Michael Armbrust mich...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e9641f19 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e9641f19 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e9641f19 Branch: refs/heads/branch-1.5 Commit: e9641f192dc6a949cfb8fa1614d446026c7bf4b3 Parents: 4c6b129 Author: Marcelo Vanzin van...@cloudera.com Authored: Wed Aug 12 10:38:30 2015 -0700 Committer: Michael Armbrust mich...@databricks.com Committed: Wed Aug 12 10:38:41 2015 -0700 -- .../org/apache/spark/sql/hive/client/HiveShim.scala | 13 ++--- 1 file changed, 10 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e9641f19/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala index 6e826ce..8fc8935 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala @@ -25,7 +25,7 @@ import java.util.concurrent.TimeUnit

 import scala.collection.JavaConversions._

-import org.apache.hadoop.fs.Path
+import org.apache.hadoop.fs.{FileSystem, Path}
 import org.apache.hadoop.hive.conf.HiveConf
 import org.apache.hadoop.hive.ql.Driver
 import org.apache.hadoop.hive.ql.metadata.{Hive, Partition, Table}
@@ -429,7 +429,7 @@ private[client] class Shim_v0_14 extends Shim_v0_13 {
       isSkewedStoreAsSubdir: Boolean): Unit = {
     loadPartitionMethod.invoke(hive, loadPath, tableName, partSpec, replace: JBoolean,
       holdDDLTime: JBoolean, inheritTableSpecs: JBoolean, isSkewedStoreAsSubdir: JBoolean,
-      JBoolean.TRUE, JBoolean.FALSE)
+      isSrcLocal(loadPath, hive.getConf()): JBoolean, JBoolean.FALSE)
   }

   override def loadTable(
@@ -439,7 +439,7 @@ private[client] class Shim_v0_14 extends Shim_v0_13 {
       replace: Boolean,
       holdDDLTime: Boolean): Unit = {
     loadTableMethod.invoke(hive, loadPath, tableName, replace: JBoolean, holdDDLTime: JBoolean,
-      JBoolean.TRUE, JBoolean.FALSE, JBoolean.FALSE)
+      isSrcLocal(loadPath, hive.getConf()): JBoolean, JBoolean.FALSE, JBoolean.FALSE)
   }

   override def loadDynamicPartitions(
@@ -461,6 +461,13 @@ private[client] class Shim_v0_14 extends Shim_v0_13 {
       HiveConf.ConfVars.METASTORE_CLIENT_CONNECT_RETRY_DELAY, TimeUnit.MILLISECONDS).asInstanceOf[Long]
   }
+
+  protected def isSrcLocal(path: Path, conf: HiveConf): Boolean = {
+    val localFs = FileSystem.getLocal(conf)
+    val pathFs = FileSystem.get(path.toUri(), conf)
+    localFs.getUri() == pathFs.getUri()
+  }
+
 }

 private[client] class Shim_v1_0 extends Shim_v0_14 {
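The heart of the patch is the URI comparison in isSrcLocal. A self-contained restatement, using plain Hadoop Configuration (which HiveConf extends) so it can be tried outside the shim:

    import org.apache.hadoop.conf.Configuration
    import org.apache.hadoop.fs.{FileSystem, Path}

    // True only when the load path lives on the local filesystem; Hive then
    // uses local-filesystem copy routines instead of cluster-filesystem ones.
    def isSrcLocal(path: Path, conf: Configuration): Boolean = {
      val localFs = FileSystem.getLocal(conf)
      val pathFs = FileSystem.get(path.toUri(), conf)
      localFs.getUri() == pathFs.getUri()
    }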