git commit: SPARK-2294: fix locality inversion bug in TaskManager
Repository: spark
Updated Branches: refs/heads/master 5a826c00c -> 63bdb1f41

SPARK-2294: fix locality inversion bug in TaskManager

Copied from the original JIRA (https://issues.apache.org/jira/browse/SPARK-2294): If an executor E is free, a task may be speculatively assigned to E when there are other tasks in the job that have not been launched (at all) yet. Similarly, a task without any locality preferences may be assigned to E when there was another NODE_LOCAL task that could have been scheduled. This happens because TaskSchedulerImpl calls TaskSetManager.resourceOffer (which in turn calls TaskSetManager.findTask) with increasing locality levels, beginning with PROCESS_LOCAL, followed by NODE_LOCAL, and so on until the highest currently allowed level. Now, suppose NODE_LOCAL is the highest currently allowed locality level. The first time findTask is called, it will be called with max level PROCESS_LOCAL; if it cannot find any PROCESS_LOCAL tasks, it will try to schedule tasks with no locality preferences or speculative tasks. As a result, speculative tasks or tasks with no preferences may be scheduled instead of NODE_LOCAL tasks.

I added an additional parameter, maxLocality, to resourceOffer and findTask, indicating when we should consider tasks without a locality preference.

Author: CodingCat zhunans...@gmail.com

Closes #1313 from CodingCat/SPARK-2294 and squashes the following commits: bf3f13b [CodingCat] rollback some forgotten changes 89f9bc0 [CodingCat] address matei's comments 18cae02 [CodingCat] add test case for node-local tasks 2ba6195 [CodingCat] fix failed test cases 87dd09e [CodingCat] fix style 9b9432f [CodingCat] remove hasNodeLocalOnlyTasks fdd1573 [CodingCat] fix failed test cases 941a4fd [CodingCat] see my shocked face.. f600085 [CodingCat] remove hasNodeLocalOnlyTasks checking 0b8a46b [CodingCat] test whether hasNodeLocalOnlyTasks affect the results 73ceda8 [CodingCat] style fix b3a430b [CodingCat] remove fine granularity tracking for node-local only tasks f9a2ad8 [CodingCat] simplify the logic in TaskSchedulerImpl c8c1de4 [CodingCat] simplify the patch be652ed [CodingCat] avoid unnecessary delay when we only have nopref tasks dee9e22 [CodingCat] fix locality inversion bug in TaskManager by moving nopref branch

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/63bdb1f4
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/63bdb1f4
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/63bdb1f4
Branch: refs/heads/master
Commit: 63bdb1f41b4895e3a9444f7938094438a94d3007
Parents: 5a826c0
Author: CodingCat zhunans...@gmail.com
Authored: Tue Aug 5 23:02:58 2014 -0700
Committer: Matei Zaharia ma...@databricks.com
Committed: Tue Aug 5 23:02:58 2014 -0700

.../apache/spark/scheduler/TaskLocality.scala | 2 +-
.../spark/scheduler/TaskSchedulerImpl.scala | 7 +-
.../apache/spark/scheduler/TaskSetManager.scala | 109 +-
.../spark/scheduler/TaskSetManagerSuite.scala | 205 +--
4 files changed, 203 insertions(+), 120 deletions(-)

http://git-wip-us.apache.org/repos/asf/spark/blob/63bdb1f4/core/src/main/scala/org/apache/spark/scheduler/TaskLocality.scala

diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskLocality.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskLocality.scala index eb920ab..f176d09 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskLocality.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskLocality.scala @@ -22,7 +22,7 @@ import org.apache.spark.annotation.DeveloperApi @DeveloperApi object TaskLocality extends Enumeration { // Process local is expected to be used ONLY within TaskSetManager for now. - val PROCESS_LOCAL, NODE_LOCAL, RACK_LOCAL, ANY = Value + val PROCESS_LOCAL, NODE_LOCAL, NO_PREF, RACK_LOCAL, ANY = Value type TaskLocality = Value

http://git-wip-us.apache.org/repos/asf/spark/blob/63bdb1f4/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala

diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala index d2f764f..6c0d1b2 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala @@ -89,11 +89,11 @@ private[spark] class TaskSchedulerImpl( // The set of executors we have on each host; this is used to compute hostsAlive, which // in turn is used to decide when we can attain data locality on a given host - private val executorsByHost = new HashMap[String,
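As a concrete illustration, here is a minimal, hypothetical sketch of the scheme — not the actual TaskSetManager code; the queue representation and names are assumptions made for the example. Because NO_PREF now sits between NODE_LOCAL and RACK_LOCAL as its own level, tasks with no preference are only considered once the caller's maxLocality has risen past NODE_LOCAL, so they can no longer be scheduled ahead of pending NODE_LOCAL tasks:

```scala
object TaskLocality extends Enumeration {
  // NO_PREF is a first-class level between NODE_LOCAL and RACK_LOCAL.
  val PROCESS_LOCAL, NODE_LOCAL, NO_PREF, RACK_LOCAL, ANY = Value
}

// Illustrative pending-task queues, one per locality level (names assumed).
case class Pending(processLocal: List[Int], nodeLocal: List[Int],
                   noPref: List[Int], rackLocal: List[Int], any: List[Int])

def findTask(pending: Pending, maxLocality: TaskLocality.Value): Option[Int] = {
  import TaskLocality._
  // Each queue is consulted only once maxLocality has reached its level,
  // so no-preference work cannot jump ahead of NODE_LOCAL work.
  if (pending.processLocal.nonEmpty) pending.processLocal.headOption
  else if (maxLocality >= NODE_LOCAL && pending.nodeLocal.nonEmpty) pending.nodeLocal.headOption
  else if (maxLocality >= NO_PREF && pending.noPref.nonEmpty) pending.noPref.headOption
  else if (maxLocality >= RACK_LOCAL && pending.rackLocal.nonEmpty) pending.rackLocal.headOption
  else if (maxLocality >= ANY && pending.any.nonEmpty) pending.any.headOption
  else None
}
```

Enumeration values are ordered by declaration, which is what makes the `>=` comparisons above meaningful.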
git commit: SPARK-2294: fix locality inversion bug in TaskManager
Repository: spark
Updated Branches: refs/heads/branch-1.1 048205549 -> 1da2fdfab

SPARK-2294: fix locality inversion bug in TaskManager

Copied from the original JIRA (https://issues.apache.org/jira/browse/SPARK-2294): If an executor E is free, a task may be speculatively assigned to E when there are other tasks in the job that have not been launched (at all) yet. Similarly, a task without any locality preferences may be assigned to E when there was another NODE_LOCAL task that could have been scheduled. This happens because TaskSchedulerImpl calls TaskSetManager.resourceOffer (which in turn calls TaskSetManager.findTask) with increasing locality levels, beginning with PROCESS_LOCAL, followed by NODE_LOCAL, and so on until the highest currently allowed level. Now, suppose NODE_LOCAL is the highest currently allowed locality level. The first time findTask is called, it will be called with max level PROCESS_LOCAL; if it cannot find any PROCESS_LOCAL tasks, it will try to schedule tasks with no locality preferences or speculative tasks. As a result, speculative tasks or tasks with no preferences may be scheduled instead of NODE_LOCAL tasks.

I added an additional parameter, maxLocality, to resourceOffer and findTask, indicating when we should consider tasks without a locality preference.

Author: CodingCat zhunans...@gmail.com

Closes #1313 from CodingCat/SPARK-2294 and squashes the following commits: bf3f13b [CodingCat] rollback some forgotten changes 89f9bc0 [CodingCat] address matei's comments 18cae02 [CodingCat] add test case for node-local tasks 2ba6195 [CodingCat] fix failed test cases 87dd09e [CodingCat] fix style 9b9432f [CodingCat] remove hasNodeLocalOnlyTasks fdd1573 [CodingCat] fix failed test cases 941a4fd [CodingCat] see my shocked face.. f600085 [CodingCat] remove hasNodeLocalOnlyTasks checking 0b8a46b [CodingCat] test whether hasNodeLocalOnlyTasks affect the results 73ceda8 [CodingCat] style fix b3a430b [CodingCat] remove fine granularity tracking for node-local only tasks f9a2ad8 [CodingCat] simplify the logic in TaskSchedulerImpl c8c1de4 [CodingCat] simplify the patch be652ed [CodingCat] avoid unnecessary delay when we only have nopref tasks dee9e22 [CodingCat] fix locality inversion bug in TaskManager by moving nopref branch

(cherry picked from commit 63bdb1f41b4895e3a9444f7938094438a94d3007)
Signed-off-by: Matei Zaharia ma...@databricks.com

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1da2fdfa
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1da2fdfa
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1da2fdfa
Branch: refs/heads/branch-1.1
Commit: 1da2fdfab66c0dac67f413d479876bed43a9df03
Parents: 0482055
Author: CodingCat zhunans...@gmail.com
Authored: Tue Aug 5 23:02:58 2014 -0700
Committer: Matei Zaharia ma...@databricks.com
Committed: Tue Aug 5 23:03:13 2014 -0700

.../apache/spark/scheduler/TaskLocality.scala | 2 +-
.../spark/scheduler/TaskSchedulerImpl.scala | 7 +-
.../apache/spark/scheduler/TaskSetManager.scala | 109 +-
.../spark/scheduler/TaskSetManagerSuite.scala | 205 +--
4 files changed, 203 insertions(+), 120 deletions(-)

http://git-wip-us.apache.org/repos/asf/spark/blob/1da2fdfa/core/src/main/scala/org/apache/spark/scheduler/TaskLocality.scala

diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskLocality.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskLocality.scala index eb920ab..f176d09 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskLocality.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskLocality.scala @@ -22,7 +22,7 @@ import org.apache.spark.annotation.DeveloperApi @DeveloperApi object TaskLocality extends Enumeration { // Process local is expected to be used ONLY within TaskSetManager for now. - val PROCESS_LOCAL, NODE_LOCAL, RACK_LOCAL, ANY = Value + val PROCESS_LOCAL, NODE_LOCAL, NO_PREF, RACK_LOCAL, ANY = Value type TaskLocality = Value

http://git-wip-us.apache.org/repos/asf/spark/blob/1da2fdfa/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala

diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala index d2f764f..6c0d1b2 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala @@ -89,11 +89,11 @@ private[spark] class TaskSchedulerImpl( // The set of executors we have on each host; this is used to compute hostsAlive, which // in turn
git commit: [MLlib] Use this.type as return type in k-means' builder pattern
Repository: spark
Updated Branches: refs/heads/master 63bdb1f41 -> c7b52010d

[MLlib] Use this.type as return type in k-means' builder pattern

to ensure that the returned object is the instance itself.

Author: DB Tsai dbt...@alpinenow.com

Closes #1796 from dbtsai/dbtsai-kmeans and squashes the following commits: 658989e [DB Tsai] Alpine Data Labs

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c7b52010
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c7b52010
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c7b52010
Branch: refs/heads/master
Commit: c7b52010dfd0a765376464ebc43d5cdd3b80a460
Parents: 63bdb1f
Author: DB Tsai dbt...@alpinenow.com
Authored: Tue Aug 5 23:32:29 2014 -0700
Committer: Xiangrui Meng m...@databricks.com
Committed: Tue Aug 5 23:32:29 2014 -0700

.../org/apache/spark/mllib/clustering/KMeans.scala | 12 ++--
1 file changed, 6 insertions(+), 6 deletions(-)

http://git-wip-us.apache.org/repos/asf/spark/blob/c7b52010/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala index db425d8..fce8fe2 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala @@ -52,13 +52,13 @@ class KMeans private ( def this() = this(2, 20, 1, KMeans.K_MEANS_PARALLEL, 5, 1e-4) /** Set the number of clusters to create (k). Default: 2. */ - def setK(k: Int): KMeans = { + def setK(k: Int): this.type = { this.k = k this } /** Set maximum number of iterations to run. Default: 20. */ - def setMaxIterations(maxIterations: Int): KMeans = { + def setMaxIterations(maxIterations: Int): this.type = { this.maxIterations = maxIterations this } @@ -68,7 +68,7 @@ class KMeans private ( * initial cluster centers, or "k-means||" to use a parallel variant of k-means++ * (Bahmani et al., Scalable K-Means++, VLDB 2012). Default: "k-means||". */ - def setInitializationMode(initializationMode: String): KMeans = { + def setInitializationMode(initializationMode: String): this.type = { if (initializationMode != KMeans.RANDOM && initializationMode != KMeans.K_MEANS_PARALLEL) { throw new IllegalArgumentException("Invalid initialization mode: " + initializationMode) } @@ -83,7 +83,7 @@ class KMeans private ( * return the best clustering found over any run. Default: 1. */ @Experimental - def setRuns(runs: Int): KMeans = { + def setRuns(runs: Int): this.type = { if (runs <= 0) { throw new IllegalArgumentException("Number of runs must be positive") } @@ -95,7 +95,7 @@ class KMeans private ( * Set the number of steps for the k-means|| initialization mode. This is an advanced * setting -- the default of 5 is almost always enough. Default: 5. */ - def setInitializationSteps(initializationSteps: Int): KMeans = { + def setInitializationSteps(initializationSteps: Int): this.type = { if (initializationSteps <= 0) { throw new IllegalArgumentException("Number of initialization steps must be positive") } @@ -107,7 +107,7 @@ class KMeans private ( * Set the distance threshold within which we've consider centers to have converged. * If all centers move less than this Euclidean distance, we stop iterating one run. */ - def setEpsilon(epsilon: Double): KMeans = { + def setEpsilon(epsilon: Double): this.type = { this.epsilon = epsilon this }
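Some context on why `this.type` matters here: with a concrete return type such as `KMeans`, a chained call on a hypothetical subclass would be typed as the parent after the first setter, breaking the chain. A minimal sketch (the class names are invented for illustration, not from MLlib):

```scala
class Base {
  private var k = 2
  // `this.type` is the singleton type of the receiver, so calling setK
  // on a Derived yields a Derived, not a Base.
  def setK(k: Int): this.type = { this.k = k; this }
}

class Derived extends Base {
  def setExtra(x: Int): this.type = this
}

// Chaining type-checks in either order; if setK returned `Base` instead,
// the .setExtra call below would not compile.
val d: Derived = new Derived().setK(3).setExtra(1)
```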
git commit: [MLlib] Use this.type as return type in k-means' builder pattern
Repository: spark
Updated Branches: refs/heads/branch-1.1 1da2fdfab -> aec217a58

[MLlib] Use this.type as return type in k-means' builder pattern

to ensure that the returned object is the instance itself.

Author: DB Tsai dbt...@alpinenow.com

Closes #1796 from dbtsai/dbtsai-kmeans and squashes the following commits: 658989e [DB Tsai] Alpine Data Labs

(cherry picked from commit c7b52010dfd0a765376464ebc43d5cdd3b80a460)
Signed-off-by: Xiangrui Meng m...@databricks.com

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/aec217a5
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/aec217a5
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/aec217a5
Branch: refs/heads/branch-1.1
Commit: aec217a5831f421c20ee2227282d2ba0bd8b6d41
Parents: 1da2fdf
Author: DB Tsai dbt...@alpinenow.com
Authored: Tue Aug 5 23:32:29 2014 -0700
Committer: Xiangrui Meng m...@databricks.com
Committed: Tue Aug 5 23:33:02 2014 -0700

.../org/apache/spark/mllib/clustering/KMeans.scala | 12 ++--
1 file changed, 6 insertions(+), 6 deletions(-)

http://git-wip-us.apache.org/repos/asf/spark/blob/aec217a5/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala index db425d8..fce8fe2 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala @@ -52,13 +52,13 @@ class KMeans private ( def this() = this(2, 20, 1, KMeans.K_MEANS_PARALLEL, 5, 1e-4) /** Set the number of clusters to create (k). Default: 2. */ - def setK(k: Int): KMeans = { + def setK(k: Int): this.type = { this.k = k this } /** Set maximum number of iterations to run. Default: 20. */ - def setMaxIterations(maxIterations: Int): KMeans = { + def setMaxIterations(maxIterations: Int): this.type = { this.maxIterations = maxIterations this } @@ -68,7 +68,7 @@ class KMeans private ( * initial cluster centers, or "k-means||" to use a parallel variant of k-means++ * (Bahmani et al., Scalable K-Means++, VLDB 2012). Default: "k-means||". */ - def setInitializationMode(initializationMode: String): KMeans = { + def setInitializationMode(initializationMode: String): this.type = { if (initializationMode != KMeans.RANDOM && initializationMode != KMeans.K_MEANS_PARALLEL) { throw new IllegalArgumentException("Invalid initialization mode: " + initializationMode) } @@ -83,7 +83,7 @@ class KMeans private ( * return the best clustering found over any run. Default: 1. */ @Experimental - def setRuns(runs: Int): KMeans = { + def setRuns(runs: Int): this.type = { if (runs <= 0) { throw new IllegalArgumentException("Number of runs must be positive") } @@ -95,7 +95,7 @@ class KMeans private ( * Set the number of steps for the k-means|| initialization mode. This is an advanced * setting -- the default of 5 is almost always enough. Default: 5. */ - def setInitializationSteps(initializationSteps: Int): KMeans = { + def setInitializationSteps(initializationSteps: Int): this.type = { if (initializationSteps <= 0) { throw new IllegalArgumentException("Number of initialization steps must be positive") } @@ -107,7 +107,7 @@ class KMeans private ( * Set the distance threshold within which we've consider centers to have converged. * If all centers move less than this Euclidean distance, we stop iterating one run. */ - def setEpsilon(epsilon: Double): KMeans = { + def setEpsilon(epsilon: Double): this.type = { this.epsilon = epsilon this }
git commit: [SPARK-1022][Streaming][HOTFIX] Fixed zookeeper dependency of Kafka
Repository: spark
Updated Branches: refs/heads/master c7b52010d -> ee7f30856

[SPARK-1022][Streaming][HOTFIX] Fixed zookeeper dependency of Kafka

https://github.com/apache/spark/pull/1751 caused maven builds to fail.

```
~/Apache/spark (branch-1.1) $ mvn -U -DskipTests clean install
...
[error] Apache/spark/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/KafkaStreamSuite.scala:36: object NIOServerCnxnFactory is not a member of package org.apache.zookeeper.server
[error] import org.apache.zookeeper.server.NIOServerCnxnFactory
[error]        ^
[error] Apache/spark/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/KafkaStreamSuite.scala:199: not found: type NIOServerCnxnFactory
[error] val factory = new NIOServerCnxnFactory()
[error]               ^
[error] two errors found
[error] Compile failed at Aug 5, 2014 1:42:36 PM [0.503s]
```

The problem is how SBT and Maven resolve multiple versions of the same library, which in this case is Zookeeper. Observing and comparing the dependency trees from Maven and SBT showed this. Spark depends on ZK 3.4.5 whereas Apache Kafka transitively depends upon ZK 3.3.4. SBT decides to evict 3.3.4 and use the higher version, 3.4.5. But Maven decides to stick to the closest (in the tree) dependent version, 3.3.4. And 3.3.4 does not have NIOServerCnxnFactory. The solution in this patch excludes zookeeper from the apache-kafka dependency in the streaming-kafka module so that it just inherits zookeeper from Spark core.

Author: Tathagata Das tathagata.das1...@gmail.com

Closes #1797 from tdas/kafka-zk-fix and squashes the following commits: 94b3931 [Tathagata Das] Fixed zookeeper dependency of Kafka

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ee7f3085
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ee7f3085
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ee7f3085
Branch: refs/heads/master
Commit: ee7f30856bf3f7b9a4f1d3641b6bc2cc4e842b0e
Parents: c7b5201
Author: Tathagata Das tathagata.das1...@gmail.com
Authored: Tue Aug 5 23:41:34 2014 -0700
Committer: Patrick Wendell pwend...@gmail.com
Committed: Tue Aug 5 23:41:34 2014 -0700

external/kafka/pom.xml | 4
1 file changed, 4 insertions(+)

http://git-wip-us.apache.org/repos/asf/spark/blob/ee7f3085/external/kafka/pom.xml

diff --git a/external/kafka/pom.xml b/external/kafka/pom.xml index 2aee999..4e2275a 100644 --- a/external/kafka/pom.xml +++ b/external/kafka/pom.xml @@ -68,6 +68,10 @@ <groupId>org.slf4j</groupId> <artifactId>slf4j-simple</artifactId> </exclusion> +<exclusion> + <groupId>org.apache.zookeeper</groupId> + <artifactId>zookeeper</artifactId> +</exclusion> </exclusions> </dependency> <dependency>
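For readers on SBT, the equivalent exclusion can be expressed in build.sbt, which is itself Scala. The fragment below is a sketch under assumed coordinates and versions — it is not the actual Spark build change, which was made in the Maven POM above. Running mvn dependency:tree is a handy way to confirm which ZooKeeper version each build resolves.

```scala
// Hypothetical build.sbt fragment: drop Kafka's transitive ZooKeeper 3.3.4 so
// only the version Spark core pins (3.4.5 at the time) lands on the classpath.
libraryDependencies += ("org.apache.kafka" %% "kafka" % "0.8.0")
  .exclude("org.apache.zookeeper", "zookeeper")

libraryDependencies += "org.apache.zookeeper" % "zookeeper" % "3.4.5"
```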
git commit: [SPARK-2875] [PySpark] [SQL] handle null in schemaRDD()
Repository: spark
Updated Branches: refs/heads/master 09f7e4587 -> 48789117c

[SPARK-2875] [PySpark] [SQL] handle null in schemaRDD()

Handle null in schemaRDD when converting it into Python.

Author: Davies Liu davies@gmail.com

Closes #1802 from davies/json and squashes the following commits: 88e6b1f [Davies Liu] handle null in schemaRDD()

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/48789117
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/48789117
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/48789117
Branch: refs/heads/master
Commit: 48789117c2dd6d38e0bd8d21cdbcb989913205a6
Parents: 09f7e45
Author: Davies Liu davies@gmail.com
Authored: Wed Aug 6 11:08:12 2014 -0700
Committer: Michael Armbrust mich...@databricks.com
Committed: Wed Aug 6 11:08:12 2014 -0700

python/pyspark/sql.py | 7 +
.../scala/org/apache/spark/sql/SchemaRDD.scala | 27
2 files changed, 23 insertions(+), 11 deletions(-)

http://git-wip-us.apache.org/repos/asf/spark/blob/48789117/python/pyspark/sql.py

diff --git a/python/pyspark/sql.py b/python/pyspark/sql.py index f109370..adc56e7 100644 --- a/python/pyspark/sql.py +++ b/python/pyspark/sql.py @@ -1231,6 +1231,13 @@ class SQLContext: ... field3.field5[0] as f3 from table3) >>> srdd6.collect() [Row(f1=u'row1', f2=None,...Row(f1=u'row3', f2=[], f3=None)] + + >>> sqlCtx.jsonRDD(sc.parallelize(['{}', +... '{"key0": {"key1": "value1"}}'])).collect() +[Row(key0=None), Row(key0=Row(key1=u'value1'))] + >>> sqlCtx.jsonRDD(sc.parallelize(['{"key0": null}', +... '{"key0": {"key1": "value1"}}'])).collect() +[Row(key0=None), Row(key0=Row(key1=u'value1'))] def func(iterator):

http://git-wip-us.apache.org/repos/asf/spark/blob/48789117/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala index 57df793..33b2ed1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala @@ -382,21 +382,26 @@ class SchemaRDD( private[sql] def javaToPython: JavaRDD[Array[Byte]] = { import scala.collection.Map -def toJava(obj: Any, dataType: DataType): Any = dataType match { - case struct: StructType => rowToArray(obj.asInstanceOf[Row], struct) - case array: ArrayType => obj match { -case seq: Seq[Any] => seq.map(x => toJava(x, array.elementType)).asJava -case list: JList[_] => list.map(x => toJava(x, array.elementType)).asJava -case arr if arr != null && arr.getClass.isArray => - arr.asInstanceOf[Array[Any]].map(x => toJava(x, array.elementType)) -case other => other - } - case mt: MapType => obj.asInstanceOf[Map[_, _]].map { +def toJava(obj: Any, dataType: DataType): Any = (obj, dataType) match { + case (null, _) => null + + case (obj: Row, struct: StructType) => rowToArray(obj, struct) + + case (seq: Seq[Any], array: ArrayType) => +seq.map(x => toJava(x, array.elementType)).asJava + case (list: JList[_], array: ArrayType) => +list.map(x => toJava(x, array.elementType)).asJava + case (arr, array: ArrayType) if arr.getClass.isArray => +arr.asInstanceOf[Array[Any]].map(x => toJava(x, array.elementType)) + + case (obj: Map[_, _], mt: MapType) => obj.map { case (k, v) => (k, toJava(v, mt.valueType)) // key should be primitive type }.asJava + // Pyrolite can handle Timestamp - case other => obj + case (other, _) => other } + def rowToArray(row: Row, structType: StructType): Array[Any] = { val fields = structType.fields.map(field => field.dataType) row.zip(fields).map {
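The heart of the fix above is matching on the (value, type) pair rather than on the type alone, so a single `case (null, _)` covers null at any nesting depth. A self-contained sketch of that pattern, with a simplified stand-in for Spark's DataType hierarchy:

```scala
// Simplified stand-ins for the SQL type tree (illustration only).
sealed trait DataType
case object StringType extends DataType
case class ArrayType(elementType: DataType) extends DataType

def toJava(obj: Any, dataType: DataType): Any = (obj, dataType) match {
  case (null, _) => null // one case handles null at every level of nesting
  case (seq: Seq[_], ArrayType(et)) => seq.map(x => toJava(x, et))
  case (other, _) => other // primitives pass through unchanged
}

// toJava(Seq("a", null), ArrayType(StringType)) == Seq("a", null)
```

Matching on the type alone, as the old code did, forces each branch to remember to null-check before casting; the pair match centralizes that in one place.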
git commit: [SPARK-2875] [PySpark] [SQL] handle null in schemaRDD()
Repository: spark
Updated Branches: refs/heads/branch-1.1 31090e43c -> 27a8d4ce3

[SPARK-2875] [PySpark] [SQL] handle null in schemaRDD()

Handle null in schemaRDD when converting it into Python.

Author: Davies Liu davies@gmail.com

Closes #1802 from davies/json and squashes the following commits: 88e6b1f [Davies Liu] handle null in schemaRDD()

(cherry picked from commit 48789117c2dd6d38e0bd8d21cdbcb989913205a6)
Signed-off-by: Michael Armbrust mich...@databricks.com

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/27a8d4ce
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/27a8d4ce
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/27a8d4ce
Branch: refs/heads/branch-1.1
Commit: 27a8d4ce39aa620a5926b33371fcf03bbcb18698
Parents: 31090e4
Author: Davies Liu davies@gmail.com
Authored: Wed Aug 6 11:08:12 2014 -0700
Committer: Michael Armbrust mich...@databricks.com
Committed: Wed Aug 6 11:08:23 2014 -0700

python/pyspark/sql.py | 7 +
.../scala/org/apache/spark/sql/SchemaRDD.scala | 27
2 files changed, 23 insertions(+), 11 deletions(-)

http://git-wip-us.apache.org/repos/asf/spark/blob/27a8d4ce/python/pyspark/sql.py

diff --git a/python/pyspark/sql.py b/python/pyspark/sql.py index f109370..adc56e7 100644 --- a/python/pyspark/sql.py +++ b/python/pyspark/sql.py @@ -1231,6 +1231,13 @@ class SQLContext: ... field3.field5[0] as f3 from table3) >>> srdd6.collect() [Row(f1=u'row1', f2=None,...Row(f1=u'row3', f2=[], f3=None)] + + >>> sqlCtx.jsonRDD(sc.parallelize(['{}', +... '{"key0": {"key1": "value1"}}'])).collect() +[Row(key0=None), Row(key0=Row(key1=u'value1'))] + >>> sqlCtx.jsonRDD(sc.parallelize(['{"key0": null}', +... '{"key0": {"key1": "value1"}}'])).collect() +[Row(key0=None), Row(key0=Row(key1=u'value1'))] def func(iterator):

http://git-wip-us.apache.org/repos/asf/spark/blob/27a8d4ce/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala index 57df793..33b2ed1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala @@ -382,21 +382,26 @@ class SchemaRDD( private[sql] def javaToPython: JavaRDD[Array[Byte]] = { import scala.collection.Map -def toJava(obj: Any, dataType: DataType): Any = dataType match { - case struct: StructType => rowToArray(obj.asInstanceOf[Row], struct) - case array: ArrayType => obj match { -case seq: Seq[Any] => seq.map(x => toJava(x, array.elementType)).asJava -case list: JList[_] => list.map(x => toJava(x, array.elementType)).asJava -case arr if arr != null && arr.getClass.isArray => - arr.asInstanceOf[Array[Any]].map(x => toJava(x, array.elementType)) -case other => other - } - case mt: MapType => obj.asInstanceOf[Map[_, _]].map { +def toJava(obj: Any, dataType: DataType): Any = (obj, dataType) match { + case (null, _) => null + + case (obj: Row, struct: StructType) => rowToArray(obj, struct) + + case (seq: Seq[Any], array: ArrayType) => +seq.map(x => toJava(x, array.elementType)).asJava + case (list: JList[_], array: ArrayType) => +list.map(x => toJava(x, array.elementType)).asJava + case (arr, array: ArrayType) if arr.getClass.isArray => +arr.asInstanceOf[Array[Any]].map(x => toJava(x, array.elementType)) + + case (obj: Map[_, _], mt: MapType) => obj.map { case (k, v) => (k, toJava(v, mt.valueType)) // key should be primitive type }.asJava + // Pyrolite can handle Timestamp - case other => obj + case (other, _) => other } + def rowToArray(row: Row, structType: StructType): Array[Any] = { val fields = structType.fields.map(field => field.dataType) row.zip(fields).map {
git commit: [SPARK-2678][Core][SQL] A workaround for SPARK-2678
Repository: spark
Updated Branches: refs/heads/branch-1.1 27a8d4ce3 -> cf8e7fd5e

[SPARK-2678][Core][SQL] A workaround for SPARK-2678

JIRA issues:
- Main: [SPARK-2678](https://issues.apache.org/jira/browse/SPARK-2678)
- Related: [SPARK-2874](https://issues.apache.org/jira/browse/SPARK-2874)

Related PR:
- #1715

This PR is both a fix for SPARK-2874 and a workaround for SPARK-2678. Fixing SPARK-2678 completely requires some API-level changes that need further discussion, and we decided not to include it in the Spark 1.1 release. As SPARK-2678 currently only affects Spark SQL scripts, this workaround is enough for Spark 1.1. The command line option handling logic in the bash scripts looks somewhat dirty and duplicated, but it helps to provide a cleaner user interface as well as retain full downward compatibility for now.

Author: Cheng Lian lian.cs@gmail.com

Closes #1801 from liancheng/spark-2874 and squashes the following commits: 8045d7a [Cheng Lian] Make sure test suites pass 8493a9e [Cheng Lian] Using eval to retain quoted arguments aed523f [Cheng Lian] Fixed typo in bin/spark-sql f12a0b1 [Cheng Lian] Worked arount SPARK-2678 daee105 [Cheng Lian] Fixed usage messages of all Spark SQL related scripts

(cherry picked from commit a6cd31108f0d73ce6823daafe8447677e03cfd13)
Signed-off-by: Patrick Wendell pwend...@gmail.com

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/cf8e7fd5
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/cf8e7fd5
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/cf8e7fd5
Branch: refs/heads/branch-1.1
Commit: cf8e7fd5e18509531dc1ab04384d18a2f11330c2
Parents: 27a8d4c
Author: Cheng Lian lian.cs@gmail.com
Authored: Wed Aug 6 12:28:35 2014 -0700
Committer: Patrick Wendell pwend...@gmail.com
Committed: Wed Aug 6 12:28:49 2014 -0700

bin/beeline | 29 +++--
bin/spark-sql | 66 ++--
.../spark/deploy/SparkSubmitArguments.scala | 39 +---
.../apache/spark/deploy/SparkSubmitSuite.scala | 12
sbin/start-thriftserver.sh | 50 +--
.../hive/thriftserver/HiveThriftServer2.scala | 1 -
.../spark/sql/hive/thriftserver/CliSuite.scala | 19 +++---
.../thriftserver/HiveThriftServer2Suite.scala | 23 ---
8 files changed, 164 insertions(+), 75 deletions(-)

http://git-wip-us.apache.org/repos/asf/spark/blob/cf8e7fd5/bin/beeline

diff --git a/bin/beeline b/bin/beeline index 09fe366..1bda4db 100755 --- a/bin/beeline +++ b/bin/beeline @@ -17,29 +17,14 @@ # limitations under the License. # -# Figure out where Spark is installed -FWDIR="$(cd `dirname $0`/..; pwd)" +# +# Shell script for starting BeeLine -# Find the java binary -if [ -n "${JAVA_HOME}" ]; then - RUNNER="${JAVA_HOME}/bin/java" -else - if [ `command -v java` ]; then -RUNNER=java - else -echo "JAVA_HOME is not set" >&2 -exit 1 - fi -fi +# Enter posix mode for bash +set -o posix -# Compute classpath using external script -classpath_output=$($FWDIR/bin/compute-classpath.sh) -if [[ $? != 0 ]]; then - echo "$classpath_output" - exit 1 -else - CLASSPATH="$classpath_output" -fi +# Figure out where Spark is installed +FWDIR="$(cd `dirname $0`/..; pwd)" CLASS="org.apache.hive.beeline.BeeLine" -exec $RUNNER -cp $CLASSPATH $CLASS "$@" +exec "$FWDIR/bin/spark-class" $CLASS "$@"

http://git-wip-us.apache.org/repos/asf/spark/blob/cf8e7fd5/bin/spark-sql

diff --git a/bin/spark-sql b/bin/spark-sql index bba7f89..61ebd8a 100755 --- a/bin/spark-sql +++ b/bin/spark-sql @@ -23,14 +23,72 @@ # Enter posix mode for bash set -o posix +CLASS="org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver" + # Figure out where Spark is installed FWDIR="$(cd `dirname $0`/..; pwd)" -if [[ "$@" = *--help ]] || [[ "$@" = *-h ]]; then - echo "Usage: ./sbin/spark-sql [options]" +function usage { + echo "Usage: ./sbin/spark-sql [options] [cli option]" + pattern="usage" + pattern+="\|Spark assembly has been built with Hive" + pattern+="\|NOTE: SPARK_PREPEND_CLASSES is set" + pattern+="\|Spark Command: " + pattern+="\|--help" + pattern+="\|===" + "$FWDIR"/bin/spark-submit --help 2>&1 | grep -v Usage 1>&2 + echo + echo "CLI options:" + "$FWDIR"/bin/spark-class $CLASS --help 2>&1 | grep -v "$pattern" 1>&2 +} + +function ensure_arg_number { + arg_number=$1 + at_least=$2 + + if [[ $arg_number -lt $at_least ]]; then +usage +exit 1 + fi +} + +if [[ "$@" = --help ]] || [[ "$@" = -h ]]; then + usage exit 0 fi -CLASS="org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver" -exec "$FWDIR"/bin/spark-submit --class $CLASS spark-internal "$@" +CLI_ARGS=() +SUBMISSION_ARGS=() +
git commit: [SPARK-2678][Core][SQL] A workaround for SPARK-2678
Repository: spark
Updated Branches: refs/heads/master 48789117c -> a6cd31108

[SPARK-2678][Core][SQL] A workaround for SPARK-2678

JIRA issues:
- Main: [SPARK-2678](https://issues.apache.org/jira/browse/SPARK-2678)
- Related: [SPARK-2874](https://issues.apache.org/jira/browse/SPARK-2874)

Related PR:
- #1715

This PR is both a fix for SPARK-2874 and a workaround for SPARK-2678. Fixing SPARK-2678 completely requires some API-level changes that need further discussion, and we decided not to include it in the Spark 1.1 release. As SPARK-2678 currently only affects Spark SQL scripts, this workaround is enough for Spark 1.1. The command line option handling logic in the bash scripts looks somewhat dirty and duplicated, but it helps to provide a cleaner user interface as well as retain full downward compatibility for now.

Author: Cheng Lian lian.cs@gmail.com

Closes #1801 from liancheng/spark-2874 and squashes the following commits: 8045d7a [Cheng Lian] Make sure test suites pass 8493a9e [Cheng Lian] Using eval to retain quoted arguments aed523f [Cheng Lian] Fixed typo in bin/spark-sql f12a0b1 [Cheng Lian] Worked arount SPARK-2678 daee105 [Cheng Lian] Fixed usage messages of all Spark SQL related scripts

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a6cd3110
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a6cd3110
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a6cd3110
Branch: refs/heads/master
Commit: a6cd31108f0d73ce6823daafe8447677e03cfd13
Parents: 4878911
Author: Cheng Lian lian.cs@gmail.com
Authored: Wed Aug 6 12:28:35 2014 -0700
Committer: Patrick Wendell pwend...@gmail.com
Committed: Wed Aug 6 12:28:35 2014 -0700

bin/beeline | 29 +++--
bin/spark-sql | 66 ++--
.../spark/deploy/SparkSubmitArguments.scala | 39 +---
.../apache/spark/deploy/SparkSubmitSuite.scala | 12
sbin/start-thriftserver.sh | 50 +--
.../hive/thriftserver/HiveThriftServer2.scala | 1 -
.../spark/sql/hive/thriftserver/CliSuite.scala | 19 +++---
.../thriftserver/HiveThriftServer2Suite.scala | 23 ---
8 files changed, 164 insertions(+), 75 deletions(-)

http://git-wip-us.apache.org/repos/asf/spark/blob/a6cd3110/bin/beeline

diff --git a/bin/beeline b/bin/beeline index 09fe366..1bda4db 100755 --- a/bin/beeline +++ b/bin/beeline @@ -17,29 +17,14 @@ # limitations under the License. # -# Figure out where Spark is installed -FWDIR="$(cd `dirname $0`/..; pwd)" +# +# Shell script for starting BeeLine -# Find the java binary -if [ -n "${JAVA_HOME}" ]; then - RUNNER="${JAVA_HOME}/bin/java" -else - if [ `command -v java` ]; then -RUNNER=java - else -echo "JAVA_HOME is not set" >&2 -exit 1 - fi -fi +# Enter posix mode for bash +set -o posix -# Compute classpath using external script -classpath_output=$($FWDIR/bin/compute-classpath.sh) -if [[ $? != 0 ]]; then - echo "$classpath_output" - exit 1 -else - CLASSPATH="$classpath_output" -fi +# Figure out where Spark is installed +FWDIR="$(cd `dirname $0`/..; pwd)" CLASS="org.apache.hive.beeline.BeeLine" -exec $RUNNER -cp $CLASSPATH $CLASS "$@" +exec "$FWDIR/bin/spark-class" $CLASS "$@"

http://git-wip-us.apache.org/repos/asf/spark/blob/a6cd3110/bin/spark-sql

diff --git a/bin/spark-sql b/bin/spark-sql index bba7f89..61ebd8a 100755 --- a/bin/spark-sql +++ b/bin/spark-sql @@ -23,14 +23,72 @@ # Enter posix mode for bash set -o posix +CLASS="org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver" + # Figure out where Spark is installed FWDIR="$(cd `dirname $0`/..; pwd)" -if [[ "$@" = *--help ]] || [[ "$@" = *-h ]]; then - echo "Usage: ./sbin/spark-sql [options]" +function usage { + echo "Usage: ./sbin/spark-sql [options] [cli option]" + pattern="usage" + pattern+="\|Spark assembly has been built with Hive" + pattern+="\|NOTE: SPARK_PREPEND_CLASSES is set" + pattern+="\|Spark Command: " + pattern+="\|--help" + pattern+="\|===" + "$FWDIR"/bin/spark-submit --help 2>&1 | grep -v Usage 1>&2 + echo + echo "CLI options:" + "$FWDIR"/bin/spark-class $CLASS --help 2>&1 | grep -v "$pattern" 1>&2 +} + +function ensure_arg_number { + arg_number=$1 + at_least=$2 + + if [[ $arg_number -lt $at_least ]]; then +usage +exit 1 + fi +} + +if [[ "$@" = --help ]] || [[ "$@" = -h ]]; then + usage exit 0 fi -CLASS="org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver" -exec "$FWDIR"/bin/spark-submit --class $CLASS spark-internal "$@" +CLI_ARGS=() +SUBMISSION_ARGS=() + +while (($#)); do + case $1 in +-d | --define | --database | -f | -h | --hiveconf | --hivevar | -i | -p)
git commit: [SPARK-2627] [PySpark] have the build enforce PEP 8 automatically
Repository: spark
Updated Branches: refs/heads/branch-1.1 cf8e7fd5e -> 4c19614e9

[SPARK-2627] [PySpark] have the build enforce PEP 8 automatically

As described in [SPARK-2627](https://issues.apache.org/jira/browse/SPARK-2627), we'd like Python code to automatically be checked for PEP 8 compliance by Jenkins. This pull request aims to do that.

Notes:
* We may need to install [`pep8`](https://pypi.python.org/pypi/pep8) on the build server.
* I'm expecting tests to fail now that PEP 8 compliance is being checked as part of the build. I'm fine with cleaning up any remaining PEP 8 violations as part of this pull request.
* I did not understand why the RAT and scalastyle reports are saved to text files. I did the same for the PEP 8 check, but only so that the console output style can match those for the RAT and scalastyle checks. The PEP 8 report is removed right after the check is complete.
* Updates to the [Contributing to Spark](https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark) guide will be submitted elsewhere, as I don't believe that text is part of the Spark repo.

Author: Nicholas Chammas nicholas.cham...@gmail.com
Author: nchammas nicholas.cham...@gmail.com

Closes #1744 from nchammas/master and squashes the following commits: 274b238 [Nicholas Chammas] [SPARK-2627] [PySpark] minor indentation changes 983d963 [nchammas] Merge pull request #5 from apache/master 1db5314 [nchammas] Merge pull request #4 from apache/master 0e0245f [Nicholas Chammas] [SPARK-2627] undo erroneous whitespace fixes bf30942 [Nicholas Chammas] [SPARK-2627] PEP8: comment spacing 6db9a44 [nchammas] Merge pull request #3 from apache/master 7b4750e [Nicholas Chammas] merge upstream changes 91b7584 [Nicholas Chammas] [SPARK-2627] undo unnecessary line breaks 44e3e56 [Nicholas Chammas] [SPARK-2627] use tox.ini to exclude files b09fae2 [Nicholas Chammas] don't wrap comments unnecessarily bfb9f9f [Nicholas Chammas] [SPARK-2627] keep up with the PEP 8 fixes 9da347f [nchammas] Merge pull request #2 from apache/master aa5b4b5 [Nicholas Chammas] [SPARK-2627] follow Spark bash style for if blocks d0a83b9 [Nicholas Chammas] [SPARK-2627] check that pep8 downloaded fine dffb5dd [Nicholas Chammas] [SPARK-2627] download pep8 at runtime a1ce7ae [Nicholas Chammas] [SPARK-2627] space out test report sections 21da538 [Nicholas Chammas] [SPARK-2627] it's PEP 8, not PEP8 6f4900b [Nicholas Chammas] [SPARK-2627] more misc PEP 8 fixes fe57ed0 [Nicholas Chammas] removing merge conflict backups 9c01d4c [nchammas] Merge pull request #1 from apache/master 9a66cb0 [Nicholas Chammas] resolving merge conflicts a31ccc4 [Nicholas Chammas] [SPARK-2627] miscellaneous PEP 8 fixes beaa9ac [Nicholas Chammas] [SPARK-2627] fail check on non-zero status 723ed39 [Nicholas Chammas] always delete the report file 0541ebb [Nicholas Chammas] [SPARK-2627] call Python linter from run-tests 12440fa [Nicholas Chammas] [SPARK-2627] add Scala linter 61c07b9 [Nicholas Chammas] [SPARK-2627] add Python linter 75ad552 [Nicholas Chammas] make check output style consistent

(cherry picked from commit d614967b0bad1e6c5277d612602ec0a653a00258)
Signed-off-by: Reynold Xin r...@apache.org

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4c19614e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4c19614e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4c19614e
Branch: refs/heads/branch-1.1
Commit: 4c19614e94d9c26109e5ffc6cf83665fab0bad84
Parents: cf8e7fd
Author: Nicholas Chammas nicholas.cham...@gmail.com
Authored: Wed Aug 6 12:58:24 2014 -0700
Committer: Reynold Xin r...@apache.org
Committed: Wed Aug 6 12:59:05 2014 -0700

dev/lint-python | 60
dev/lint-scala | 23 +
dev/run-tests | 13 ++-
dev/scalastyle | 2 +-
python/pyspark/accumulators.py | 7 ++
python/pyspark/broadcast.py | 1 +
python/pyspark/conf.py | 1 +
python/pyspark/context.py | 25 ++---
python/pyspark/daemon.py | 5 +-
python/pyspark/files.py | 1 +
python/pyspark/java_gateway.py | 1 +
python/pyspark/mllib/_common.py | 5 +-
python/pyspark/mllib/classification.py | 8 ++
python/pyspark/mllib/clustering.py | 3 +
python/pyspark/mllib/linalg.py | 2 +
python/pyspark/mllib/random.py | 14 +--
python/pyspark/mllib/recommendation.py | 2 +
python/pyspark/mllib/regression.py | 12 +++
python/pyspark/mllib/stat.py | 1 +
python/pyspark/mllib/tests.py | 11 ++-
python/pyspark/mllib/tree.py | 4 +-
python/pyspark/mllib/util.py | 1 +
python/pyspark/rdd.py | 22 +++--
python/pyspark/rddsampler.py | 4 +
git commit: SPARK-2566. Update ShuffleWriteMetrics incrementally
Repository: spark
Updated Branches: refs/heads/master d614967b0 -> 4e9823644

SPARK-2566. Update ShuffleWriteMetrics incrementally

I haven't tested this out on a cluster yet, but wanted to make sure the approach (passing ShuffleWriteMetrics down to DiskBlockObjectWriter) was ok

Author: Sandy Ryza sa...@cloudera.com

Closes #1481 from sryza/sandy-spark-2566 and squashes the following commits: 8090d88 [Sandy Ryza] Fix ExternalSorter b2a62ed [Sandy Ryza] Fix more test failures 8be6218 [Sandy Ryza] Fix test failures and mark a couple variables private c5e68e5 [Sandy Ryza] SPARK-2566. Update ShuffleWriteMetrics incrementally

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4e982364
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4e982364
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4e982364
Branch: refs/heads/master
Commit: 4e982364426c7d65032e8006c63ca4f9a0d40470
Parents: d614967
Author: Sandy Ryza sa...@cloudera.com
Authored: Wed Aug 6 13:10:33 2014 -0700
Committer: Patrick Wendell pwend...@gmail.com
Committed: Wed Aug 6 13:10:33 2014 -0700

.../org/apache/spark/executor/TaskMetrics.scala | 4 +-
.../spark/shuffle/hash/HashShuffleWriter.scala | 16 ++--
.../spark/shuffle/sort/SortShuffleWriter.scala | 16 ++--
.../org/apache/spark/storage/BlockManager.scala | 12 +--
.../spark/storage/BlockObjectWriter.scala | 77 +++-
.../spark/storage/ShuffleBlockManager.scala | 9 ++-
.../util/collection/ExternalAppendOnlyMap.scala | 18 +++--
.../spark/util/collection/ExternalSorter.scala | 17 +++--
.../spark/storage/BlockObjectWriterSuite.scala | 65 +
.../spark/storage/DiskBlockManagerSuite.scala | 9 ++-
.../apache/spark/tools/StoragePerfTester.scala | 3 +-
11 files changed, 164 insertions(+), 82 deletions(-)

http://git-wip-us.apache.org/repos/asf/spark/blob/4e982364/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala

diff --git a/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala b/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala index 56cd872..11a6e10 100644 --- a/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala +++ b/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala @@ -190,10 +190,10 @@ class ShuffleWriteMetrics extends Serializable { /** * Number of bytes written for the shuffle by this task */ - var shuffleBytesWritten: Long = _ + @volatile var shuffleBytesWritten: Long = _ /** * Time the task spent blocking on writes to disk or buffer cache, in nanoseconds */ - var shuffleWriteTime: Long = _ + @volatile var shuffleWriteTime: Long = _ }

http://git-wip-us.apache.org/repos/asf/spark/blob/4e982364/core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleWriter.scala

diff --git a/core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleWriter.scala b/core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleWriter.scala index 45d3b8b..51e454d 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleWriter.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleWriter.scala @@ -39,10 +39,14 @@ private[spark] class HashShuffleWriter[K, V]( // we don't try deleting files, etc twice. private var stopping = false + private val writeMetrics = new ShuffleWriteMetrics() + metrics.shuffleWriteMetrics = Some(writeMetrics) + private val blockManager = SparkEnv.get.blockManager private val shuffleBlockManager = blockManager.shuffleBlockManager private val ser = Serializer.getSerializer(dep.serializer.getOrElse(null)) - private val shuffle = shuffleBlockManager.forMapTask(dep.shuffleId, mapId, numOutputSplits, ser) + private val shuffle = shuffleBlockManager.forMapTask(dep.shuffleId, mapId, numOutputSplits, ser, +writeMetrics) /** Write a bunch of records to this task's output */ override def write(records: Iterator[_ <: Product2[K, V]]): Unit = { @@ -99,22 +103,12 @@ private[spark] class HashShuffleWriter[K, V]( private def commitWritesAndBuildStatus(): MapStatus = { // Commit the writes. Get the size of each bucket block (total block size). -var totalBytes = 0L -var totalTime = 0L val compressedSizes = shuffle.writers.map { writer: BlockObjectWriter => writer.commitAndClose() val size = writer.fileSegment().length - totalBytes += size - totalTime += writer.timeWriting() MapOutputTracker.compressSize(size) } -// Update shuffle metrics. -val shuffleMetrics = new ShuffleWriteMetrics -shuffleMetrics.shuffleBytesWritten = totalBytes -
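The shape of the change, reduced to its essentials: a mutable metrics object is created up front and threaded into each writer, which updates it on every write instead of totals being summed once at commit time. A simplified sketch — the class and method names below are stand-ins, not Spark's actual implementation beyond the two @volatile fields shown in the diff:

```scala
class ShuffleWriteMetrics {
  // @volatile gives reader threads (e.g. a metrics reporter) visibility of
  // in-progress values; only the single writer thread mutates them, so the
  // non-atomic += below is safe in this one-writer/many-readers setup.
  @volatile var shuffleBytesWritten: Long = 0L
  @volatile var shuffleWriteTime: Long = 0L
}

// Hypothetical writer that accumulates metrics incrementally per write.
class DiskWriter(metrics: ShuffleWriteMetrics) {
  def write(bytes: Array[Byte]): Unit = {
    val start = System.nanoTime()
    // ... actually write `bytes` to disk here ...
    metrics.shuffleWriteTime += System.nanoTime() - start
    metrics.shuffleBytesWritten += bytes.length
  }
}
```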
git commit: SPARK-2566. Update ShuffleWriteMetrics incrementally
Repository: spark
Updated Branches: refs/heads/branch-1.1 4c19614e9 -> a65c9ac11

SPARK-2566. Update ShuffleWriteMetrics incrementally

I haven't tested this out on a cluster yet, but wanted to make sure the approach (passing ShuffleWriteMetrics down to DiskBlockObjectWriter) was ok

Author: Sandy Ryza sa...@cloudera.com

Closes #1481 from sryza/sandy-spark-2566 and squashes the following commits: 8090d88 [Sandy Ryza] Fix ExternalSorter b2a62ed [Sandy Ryza] Fix more test failures 8be6218 [Sandy Ryza] Fix test failures and mark a couple variables private c5e68e5 [Sandy Ryza] SPARK-2566. Update ShuffleWriteMetrics incrementally

(cherry picked from commit 4e982364426c7d65032e8006c63ca4f9a0d40470)
Signed-off-by: Patrick Wendell pwend...@gmail.com

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a65c9ac1
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a65c9ac1
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a65c9ac1
Branch: refs/heads/branch-1.1
Commit: a65c9ac11e7075c2d7a925772273b9b7cf9586d6
Parents: 4c19614
Author: Sandy Ryza sa...@cloudera.com
Authored: Wed Aug 6 13:10:33 2014 -0700
Committer: Patrick Wendell pwend...@gmail.com
Committed: Wed Aug 6 13:10:43 2014 -0700

.../org/apache/spark/executor/TaskMetrics.scala | 4 +-
.../spark/shuffle/hash/HashShuffleWriter.scala | 16 ++--
.../spark/shuffle/sort/SortShuffleWriter.scala | 16 ++--
.../org/apache/spark/storage/BlockManager.scala | 12 +--
.../spark/storage/BlockObjectWriter.scala | 77 +++-
.../spark/storage/ShuffleBlockManager.scala | 9 ++-
.../util/collection/ExternalAppendOnlyMap.scala | 18 +++--
.../spark/util/collection/ExternalSorter.scala | 17 +++--
.../spark/storage/BlockObjectWriterSuite.scala | 65 +
.../spark/storage/DiskBlockManagerSuite.scala | 9 ++-
.../apache/spark/tools/StoragePerfTester.scala | 3 +-
11 files changed, 164 insertions(+), 82 deletions(-)

http://git-wip-us.apache.org/repos/asf/spark/blob/a65c9ac1/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala

diff --git a/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala b/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala index 56cd872..11a6e10 100644 --- a/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala +++ b/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala @@ -190,10 +190,10 @@ class ShuffleWriteMetrics extends Serializable { /** * Number of bytes written for the shuffle by this task */ - var shuffleBytesWritten: Long = _ + @volatile var shuffleBytesWritten: Long = _ /** * Time the task spent blocking on writes to disk or buffer cache, in nanoseconds */ - var shuffleWriteTime: Long = _ + @volatile var shuffleWriteTime: Long = _ }

http://git-wip-us.apache.org/repos/asf/spark/blob/a65c9ac1/core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleWriter.scala

diff --git a/core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleWriter.scala b/core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleWriter.scala index 45d3b8b..51e454d 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleWriter.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/hash/HashShuffleWriter.scala @@ -39,10 +39,14 @@ private[spark] class HashShuffleWriter[K, V]( // we don't try deleting files, etc twice. private var stopping = false + private val writeMetrics = new ShuffleWriteMetrics() + metrics.shuffleWriteMetrics = Some(writeMetrics) + private val blockManager = SparkEnv.get.blockManager private val shuffleBlockManager = blockManager.shuffleBlockManager private val ser = Serializer.getSerializer(dep.serializer.getOrElse(null)) - private val shuffle = shuffleBlockManager.forMapTask(dep.shuffleId, mapId, numOutputSplits, ser) + private val shuffle = shuffleBlockManager.forMapTask(dep.shuffleId, mapId, numOutputSplits, ser, +writeMetrics) /** Write a bunch of records to this task's output */ override def write(records: Iterator[_ <: Product2[K, V]]): Unit = { @@ -99,22 +103,12 @@ private[spark] class HashShuffleWriter[K, V]( private def commitWritesAndBuildStatus(): MapStatus = { // Commit the writes. Get the size of each bucket block (total block size). -var totalBytes = 0L -var totalTime = 0L val compressedSizes = shuffle.writers.map { writer: BlockObjectWriter => writer.commitAndClose() val size = writer.fileSegment().length - totalBytes += size - totalTime += writer.timeWriting() MapOutputTracker.compressSize(size) } -// Update shuffle
git commit: [SPARK-2852][MLLIB] API consistency for `mllib.feature`
Repository: spark
Updated Branches: refs/heads/master 4e9823644 -> 25cff1019

[SPARK-2852][MLLIB] API consistency for `mllib.feature`

This is part of SPARK-2828:

1. added a Java-friendly fit method to Word2Vec with tests
2. change DeveloperApi to Experimental for Normalizer and StandardScaler
3. change default feature dimension to 2^20 in HashingTF

Author: Xiangrui Meng m...@databricks.com

Closes #1807 from mengxr/feature-api-check and squashes the following commits: 773c1a9 [Xiangrui Meng] change default numFeatures to 2^20 in HashingTF change annotation from DeveloperApi to Experimental in Normalizer and StandardScaler 883e122 [Xiangrui Meng] add @Experimental to Word2VecModel add a Java-friendly method to Word2Vec.fit with tests

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/25cff101
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/25cff101
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/25cff101
Branch: refs/heads/master
Commit: 25cff1019da9d6cfc486a31d035b372ea5fbdfd2
Parents: 4e98236
Author: Xiangrui Meng m...@databricks.com
Authored: Wed Aug 6 14:07:51 2014 -0700
Committer: Xiangrui Meng m...@databricks.com
Committed: Wed Aug 6 14:07:51 2014 -0700

.../apache/spark/mllib/feature/HashingTF.scala | 4 +-
.../apache/spark/mllib/feature/Normalizer.scala | 6 +-
.../spark/mllib/feature/StandardScaler.scala | 6 +-
.../apache/spark/mllib/feature/Word2Vec.scala | 19 +-
.../spark/mllib/feature/JavaWord2VecSuite.java | 66
5 files changed, 91 insertions(+), 10 deletions(-)

http://git-wip-us.apache.org/repos/asf/spark/blob/25cff101/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala index 0f6d580..c534758 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala @@ -32,12 +32,12 @@ import org.apache.spark.util.Utils * :: Experimental :: * Maps a sequence of terms to their term frequencies using the hashing trick. * - * @param numFeatures number of features (default: 100) + * @param numFeatures number of features (default: 2^20^) */ @Experimental class HashingTF(val numFeatures: Int) extends Serializable { - def this() = this(100) + def this() = this(1 << 20) /** * Returns the index of the input term.

http://git-wip-us.apache.org/repos/asf/spark/blob/25cff101/mllib/src/main/scala/org/apache/spark/mllib/feature/Normalizer.scala

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Normalizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Normalizer.scala index ea9fd0a..3afb477 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Normalizer.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Normalizer.scala @@ -19,11 +19,11 @@ package org.apache.spark.mllib.feature import breeze.linalg.{DenseVector => BDV, SparseVector => BSV} -import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.annotation.Experimental import org.apache.spark.mllib.linalg.{Vector, Vectors} /** - * :: DeveloperApi :: + * :: Experimental :: * Normalizes samples individually to unit L^p^ norm * * For any 1 <= p < Double.PositiveInfinity, normalizes samples using @@ -33,7 +33,7 @@ import org.apache.spark.mllib.linalg.{Vector, Vectors} * * @param p Normalization in L^p^ space, p = 2 by default. */ -@DeveloperApi +@Experimental class Normalizer(p: Double) extends VectorTransformer { def this() = this(2)

http://git-wip-us.apache.org/repos/asf/spark/blob/25cff101/mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala index cc2d757..e6c9f8f 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala @@ -19,14 +19,14 @@ package org.apache.spark.mllib.feature import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV} -import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.annotation.Experimental import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.rdd.RDDFunctions._ import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer import
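A short usage sketch of the HashingTF change: the default dimensionality moves from 100 to 2^20 buckets, which makes hash collisions between distinct terms far less likely at negligible cost, since the output vectors are sparse. The call below matches the mllib.feature API of this era; the example document is invented:

```scala
import org.apache.spark.mllib.feature.HashingTF

val tf = new HashingTF()            // numFeatures = 1 << 20 after this change
val doc = "the quick brown fox".split(" ").toSeq
val vector = tf.transform(doc)      // sparse term-frequency vector
// vector.size == 1 << 20, but only a handful of entries are non-zero
```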
git commit: [SPARK-2852][MLLIB] API consistency for `mllib.feature`
Repository: spark Updated Branches: refs/heads/branch-1.1 a65c9ac11 - e654cfdd0 [SPARK-2852][MLLIB] API consistency for `mllib.feature` This is part of SPARK-2828: 1. added a Java-friendly fit method to Word2Vec with tests 2. change DeveloperApi to Experimental for Normalizer StandardScaler 3. change default feature dimension to 2^20 in HashingTF Author: Xiangrui Meng m...@databricks.com Closes #1807 from mengxr/feature-api-check and squashes the following commits: 773c1a9 [Xiangrui Meng] change default numFeatures to 2^20 in HashingTF change annotation from DeveloperApi to Experimental in Normalizer and StandardScaler 883e122 [Xiangrui Meng] add @Experimental to Word2VecModel add a Java-friendly method to Word2Vec.fit with tests (cherry picked from commit 25cff1019da9d6cfc486a31d035b372ea5fbdfd2) Signed-off-by: Xiangrui Meng m...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e654cfdd Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e654cfdd Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e654cfdd Branch: refs/heads/branch-1.1 Commit: e654cfdd02e56fd3aaf6b784dcd25cb9ec35aece Parents: a65c9ac Author: Xiangrui Meng m...@databricks.com Authored: Wed Aug 6 14:07:51 2014 -0700 Committer: Xiangrui Meng m...@databricks.com Committed: Wed Aug 6 14:08:03 2014 -0700 -- .../apache/spark/mllib/feature/HashingTF.scala | 4 +- .../apache/spark/mllib/feature/Normalizer.scala | 6 +- .../spark/mllib/feature/StandardScaler.scala| 6 +- .../apache/spark/mllib/feature/Word2Vec.scala | 19 +- .../spark/mllib/feature/JavaWord2VecSuite.java | 66 5 files changed, 91 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e654cfdd/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala index 0f6d580..c534758 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala @@ -32,12 +32,12 @@ import org.apache.spark.util.Utils * :: Experimental :: * Maps a sequence of terms to their term frequencies using the hashing trick. * - * @param numFeatures number of features (default: 100) + * @param numFeatures number of features (default: 2^20^) */ @Experimental class HashingTF(val numFeatures: Int) extends Serializable { - def this() = this(100) + def this() = this(1 20) /** * Returns the index of the input term. 
http://git-wip-us.apache.org/repos/asf/spark/blob/e654cfdd/mllib/src/main/scala/org/apache/spark/mllib/feature/Normalizer.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Normalizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Normalizer.scala index ea9fd0a..3afb477 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Normalizer.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Normalizer.scala @@ -19,11 +19,11 @@ package org.apache.spark.mllib.feature import breeze.linalg.{DenseVector => BDV, SparseVector => BSV} -import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.annotation.Experimental import org.apache.spark.mllib.linalg.{Vector, Vectors} /** - * :: DeveloperApi :: + * :: Experimental :: * Normalizes samples individually to unit L^p^ norm * * For any 1 <= p < Double.PositiveInfinity, normalizes samples using @@ -33,7 +33,7 @@ import org.apache.spark.mllib.linalg.{Vector, Vectors} * * @param p Normalization in L^p^ space, p = 2 by default. */ -@DeveloperApi +@Experimental class Normalizer(p: Double) extends VectorTransformer { def this() = this(2) http://git-wip-us.apache.org/repos/asf/spark/blob/e654cfdd/mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala index cc2d757..e6c9f8f 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala @@ -19,14 +19,14 @@ package org.apache.spark.mllib.feature import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV} -import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.annotation.Experimental import org.apache.spark.mllib.linalg.{Vector,
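A short sketch of what the HashingTF default change means in practice (illustrative only; assumes a SparkContext named sc): with 1 << 20 = 1048576 buckets instead of 100, hash collisions between distinct terms become far less likely for realistic vocabularies, at the cost of higher-dimensional (but sparse) output vectors.

  import org.apache.spark.mllib.feature.HashingTF

  val docs = sc.parallelize(Seq(
    Seq("spark", "mllib", "feature"),
    Seq("hashing", "trick")))

  // The default constructor now hashes into 2^20 buckets; pass an Int to override.
  val tf = new HashingTF()                        // numFeatures = 1 << 20
  val smallTf = new HashingTF(numFeatures = 1000) // explicit, more collision-prone
  val vectors = tf.transform(docs)                // RDD of sparse term-frequency vectors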
git commit: [PySpark] Add blanklines to Python docstrings so example code renders correctly
Repository: spark Updated Branches: refs/heads/master 25cff1019 - e537b33c6 [PySpark] Add blanklines to Python docstrings so example code renders correctly Author: RJ Nowling rnowl...@gmail.com Closes #1808 from rnowling/pyspark_docs and squashes the following commits: c06d774 [RJ Nowling] Add blanklines to Python docstrings so example code renders correctly Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e537b33c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e537b33c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e537b33c Branch: refs/heads/master Commit: e537b33c63d3fb373fe41deaa607d72e76e3906b Parents: 25cff10 Author: RJ Nowling rnowl...@gmail.com Authored: Wed Aug 6 14:12:21 2014 -0700 Committer: Xiangrui Meng m...@databricks.com Committed: Wed Aug 6 14:12:21 2014 -0700 -- python/pyspark/rdd.py | 9 + 1 file changed, 9 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e537b33c/python/pyspark/rdd.py -- diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index 30b834d..756e8f3 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -134,6 +134,7 @@ class MaxHeapQ(object): An implementation of MaxHeap. + >>> import pyspark.rdd >>> heap = pyspark.rdd.MaxHeapQ(5) >>> [heap.insert(i) for i in range(10)] @@ -381,6 +382,7 @@ class RDD(object): def getNumPartitions(self): Returns the number of partitions in RDD + >>> rdd = sc.parallelize([1, 2, 3, 4], 2) >>> rdd.getNumPartitions() 2 @@ -570,6 +572,7 @@ class RDD(object): Sorts this RDD, which is assumed to consist of (key, value) pairs. # noqa + >>> tmp = [('a', 1), ('b', 2), ('1', 3), ('d', 4), ('2', 5)] >>> sc.parallelize(tmp).sortByKey(True, 2).collect() [('1', 3), ('2', 5), ('a', 1), ('b', 2), ('d', 4)] @@ -1209,6 +1212,7 @@ class RDD(object): def keys(self): Return an RDD with the keys of each tuple. + >>> m = sc.parallelize([(1, 2), (3, 4)]).keys() >>> m.collect() [1, 3] @@ -1218,6 +1222,7 @@ class RDD(object): def values(self): Return an RDD with the values of each tuple. + >>> m = sc.parallelize([(1, 2), (3, 4)]).values() >>> m.collect() [2, 4] @@ -1642,6 +1647,7 @@ class RDD(object): Internally, this uses a shuffle to redistribute data. If you are decreasing the number of partitions in this RDD, consider using `coalesce`, which can avoid performing a shuffle. + >>> rdd = sc.parallelize([1,2,3,4,5,6,7], 4) >>> sorted(rdd.glom().collect()) [[1], [2, 3], [4, 5], [6, 7]] @@ -1656,6 +1662,7 @@ class RDD(object): def coalesce(self, numPartitions, shuffle=False): Return a new RDD that is reduced into `numPartitions` partitions. + >>> sc.parallelize([1, 2, 3, 4, 5], 3).glom().collect() [[1], [2, 3], [4, 5]] >>> sc.parallelize([1, 2, 3, 4, 5], 3).coalesce(1).glom().collect() @@ -1694,6 +1701,7 @@ class RDD(object): def setName(self, name): Assign a name to this RDD. + >>> rdd1 = sc.parallelize([1,2]) >>> rdd1.setName('RDD1') >>> rdd1.name() @@ -1753,6 +1761,7 @@ class PipelinedRDD(RDD): Pipelined maps: + >>> rdd = sc.parallelize([1, 2, 3, 4]) >>> rdd.map(lambda x: 2 * x).cache().map(lambda x: 2 * x).collect() [4, 8, 12, 16]
git commit: [HOTFIX][Streaming] Handle port collisions in flume polling test
Repository: spark Updated Branches: refs/heads/master e537b33c6 - c6889d2cb [HOTFIX][Streaming] Handle port collisions in flume polling test This is failing my tests in #1777. @tdas Author: Andrew Or andrewo...@gmail.com Closes #1803 from andrewor14/fix-flaky-streaming-test and squashes the following commits: ea11a03 [Andrew Or] Catch all exceptions caused by BindExceptions 54a0ca0 [Andrew Or] Merge branch 'master' of github.com:apache/spark into fix-flaky-streaming-test 664095c [Andrew Or] Tone down bind exception message af3ddc9 [Andrew Or] Handle port collisions in flume polling test Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c6889d2c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c6889d2c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c6889d2c Branch: refs/heads/master Commit: c6889d2cb9cd99f7e3e0ee14a4fdf301f1f9810e Parents: e537b33 Author: Andrew Or andrewo...@gmail.com Authored: Wed Aug 6 16:34:53 2014 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Wed Aug 6 16:34:53 2014 -0700 -- .../flume/FlumePollingStreamSuite.scala | 32 +++- 1 file changed, 31 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c6889d2c/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumePollingStreamSuite.scala -- diff --git a/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumePollingStreamSuite.scala b/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumePollingStreamSuite.scala index 27bf2ac..a69baa1 100644 --- a/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumePollingStreamSuite.scala +++ b/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumePollingStreamSuite.scala @@ -35,6 +35,7 @@ import org.apache.spark.streaming.dstream.ReceiverInputDStream import org.apache.spark.streaming.util.ManualClock import org.apache.spark.streaming.{TestSuiteBase, TestOutputStream, StreamingContext} import org.apache.spark.streaming.flume.sink._ +import org.apache.spark.util.Utils class FlumePollingStreamSuite extends TestSuiteBase { @@ -45,8 +46,37 @@ class FlumePollingStreamSuite extends TestSuiteBase { val eventsPerBatch = 100 val totalEventsPerChannel = batchCount * eventsPerBatch val channelCapacity = 5000 + val maxAttempts = 5 test("flume polling test") { +testMultipleTimes(testFlumePolling) + } + + test("flume polling test multiple hosts") { +testMultipleTimes(testFlumePollingMultipleHost) + } + + /** + * Run the given test until no more java.net.BindException's are thrown. + * Do this only up to a certain attempt limit. + */ + private def testMultipleTimes(test: () => Unit): Unit = { +var testPassed = false +var attempt = 0 +while (!testPassed && attempt < maxAttempts) { + try { +test() +testPassed = true + } catch { +case e: Exception if Utils.isBindCollision(e) => + logWarning("Exception when running flume polling test: " + e) + attempt += 1 + } +} +assert(testPassed, s"Test failed after $attempt attempts!")
+ } + + private def testFlumePolling(): Unit = { val testPort = getTestPort // Set up the streaming context and input streams val ssc = new StreamingContext(conf, batchDuration) @@ -80,7 +110,7 @@ class FlumePollingStreamSuite extends TestSuiteBase { channel.stop() } - test("flume polling test multiple hosts") { + private def testFlumePollingMultipleHost(): Unit = { val testPort = getTestPort // Set up the streaming context and input streams val ssc = new StreamingContext(conf, batchDuration)
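The testMultipleTimes helper above is a general retry-on-port-collision pattern. A standalone sketch of the same idea (simplified: it catches java.net.BindException directly, whereas the commit uses Utils.isBindCollision to also recognize wrapped bind failures; runFlakyTest and startServerOnSomePort are illustrative names, not Spark API):

  import java.net.BindException

  // Re-run a test body until it passes or the attempt limit is reached,
  // treating a port collision as retryable rather than as a failure.
  def runFlakyTest(maxAttempts: Int)(test: () => Unit): Unit = {
    var attempt = 0
    var passed = false
    while (!passed && attempt < maxAttempts) {
      try {
        test()
        passed = true
      } catch {
        case e: BindException =>
          attempt += 1
          println(s"Bind collision on attempt $attempt: $e")
      }
    }
    assert(passed, s"Test failed after $attempt attempts!")
  }

  // Usage (startServerOnSomePort is hypothetical):
  // runFlakyTest(5)(() => startServerOnSomePort())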
git commit: SPARK-2882: Spark build now checks local maven cache for dependencies
Repository: spark Updated Branches: refs/heads/master c6889d2cb - 4e008334e SPARK-2882: Spark build now checks local maven cache for dependencies Fixes [SPARK-2882](https://issues.apache.org/jira/browse/SPARK-2882) Author: Gregory Owen greo...@gmail.com Closes #1818 from GregOwen/spark-2882 and squashes the following commits: 294446d [Gregory Owen] SPARK-2882: Spark build now checks local maven cache for dependencies Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4e008334 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4e008334 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4e008334 Branch: refs/heads/master Commit: 4e008334ee0fb60f9fe8820afa06f7b7f0fa7a6c Parents: c6889d2 Author: Gregory Owen greo...@gmail.com Authored: Wed Aug 6 16:52:00 2014 -0700 Committer: Patrick Wendell pwend...@gmail.com Committed: Wed Aug 6 16:52:00 2014 -0700 -- project/SparkBuild.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4e008334/project/SparkBuild.scala -- diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 40b5885..ed58778 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -115,7 +115,8 @@ object SparkBuild extends PomBuild { retrieveManaged := true, retrievePattern := "[type]s/[artifact](-[revision])(-[classifier]).[ext]", publishMavenStyle := true, - + +resolvers += Resolver.mavenLocal, otherResolvers <<= SbtPomKeys.mvnLocalRepository(dotM2 => Seq(Resolver.file("dotM2", dotM2))), publishLocalConfiguration in MavenCompile <<= (packagedArtifacts, deliverLocal, ivyLoggingLevel) map { (arts, _, level) => new PublishConfiguration(None, "dotM2", arts, Seq(), level)
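For reference, the same one-line fix applies to any sbt build, not just SparkBuild.scala (sketch; plain build.sbt syntax assumed):

  // build.sbt: consult the local ~/.m2 repository before remote resolvers,
  // so artifacts published with `mvn install` are found during dependency resolution.
  resolvers += Resolver.mavenLocal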
git commit: SPARK-2882: Spark build now checks local maven cache for dependencies
Repository: spark Updated Branches: refs/heads/branch-1.1 a314e293f - c2ae0b036 SPARK-2882: Spark build now checks local maven cache for dependencies Fixes [SPARK-2882](https://issues.apache.org/jira/browse/SPARK-2882) Author: Gregory Owen greo...@gmail.com Closes #1818 from GregOwen/spark-2882 and squashes the following commits: 294446d [Gregory Owen] SPARK-2882: Spark build now checks local maven cache for dependencies (cherry picked from commit 4e008334ee0fb60f9fe8820afa06f7b7f0fa7a6c) Signed-off-by: Patrick Wendell pwend...@gmail.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c2ae0b03 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c2ae0b03 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c2ae0b03 Branch: refs/heads/branch-1.1 Commit: c2ae0b03669c72f5b842dc0cb4ba1f808c9ef702 Parents: a314e29 Author: Gregory Owen greo...@gmail.com Authored: Wed Aug 6 16:52:00 2014 -0700 Committer: Patrick Wendell pwend...@gmail.com Committed: Wed Aug 6 16:52:10 2014 -0700 -- project/SparkBuild.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c2ae0b03/project/SparkBuild.scala -- diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 40b5885..ed58778 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -115,7 +115,8 @@ object SparkBuild extends PomBuild { retrieveManaged := true, retrievePattern := "[type]s/[artifact](-[revision])(-[classifier]).[ext]", publishMavenStyle := true, - + +resolvers += Resolver.mavenLocal, otherResolvers <<= SbtPomKeys.mvnLocalRepository(dotM2 => Seq(Resolver.file("dotM2", dotM2))), publishLocalConfiguration in MavenCompile <<= (packagedArtifacts, deliverLocal, ivyLoggingLevel) map { (arts, _, level) => new PublishConfiguration(None, "dotM2", arts, Seq(), level)
git commit: SPARK-2879 [BUILD] Use HTTPS to access Maven Central and other repos
Repository: spark Updated Branches: refs/heads/branch-1.1 3f92ce4e2 - 40284a9a3 SPARK-2879 [BUILD] Use HTTPS to access Maven Central and other repos Maven Central has just now enabled HTTPS access for everyone to Maven Central (http://central.sonatype.org/articles/2014/Aug/03/https-support-launching-now/) This is timely, as a reminder of how easily an attacker can slip malicious code into a build that's downloading artifacts over HTTP (http://blog.ontoillogical.com/blog/2014/07/28/how-to-take-over-any-java-developer/). In the meantime, it looks like the Spring repo also now supports HTTPS, so can be used this way too. I propose to use HTTPS to access these repos. Author: Sean Owen sro...@gmail.com Closes #1805 from srowen/SPARK-2879 and squashes the following commits: 7043a8e [Sean Owen] Use HTTPS for Maven Central libs and plugins; use id 'central' to override parent properly; use HTTPS for Spring repo (cherry picked from commit 4201d2711cd20a2892c40eb11102f73c2f826b2e) Signed-off-by: Patrick Wendell pwend...@gmail.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/40284a9a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/40284a9a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/40284a9a Branch: refs/heads/branch-1.1 Commit: 40284a9a32a6efb6195098c93e292cbc6d128c42 Parents: 3f92ce4 Author: Sean Owen sro...@gmail.com Authored: Wed Aug 6 18:13:35 2014 -0700 Committer: Patrick Wendell pwend...@gmail.com Committed: Wed Aug 6 18:14:55 2014 -0700 -- pom.xml | 15 --- 1 file changed, 12 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/40284a9a/pom.xml -- diff --git a/pom.xml b/pom.xml index 4ab027b..76bf6d8 100644 --- a/pom.xml +++ b/pom.xml @@ -143,11 +143,11 @@ <repositories> <repository> - <id>maven-repo</id> + <id>central</id> <!-- This should be at top, it makes maven try the central repo first and then others and hence faster dep resolution --> <name>Maven Repository</name> <!-- HTTPS is unavailable for Maven Central --> - <url>http://repo.maven.apache.org/maven2</url> + <url>https://repo.maven.apache.org/maven2</url> <releases> <enabled>true</enabled> </releases> @@ -213,7 +213,7 @@ <repository> <id>spring-releases</id> <name>Spring Release Repository</name> - <url>http://repo.spring.io/libs-release</url> + <url>https://repo.spring.io/libs-release</url> <releases> <enabled>true</enabled> </releases> @@ -222,6 +222,15 @@ </snapshots> </repository> </repositories> + <pluginRepositories> +<pluginRepository> + <id>central</id> + <url>https://repo1.maven.org/maven2</url> + <releases> +<enabled>true</enabled> + </releases> +</pluginRepository> + </pluginRepositories> <dependencyManagement> <dependencies>
git commit: HOTFIX: Support custom Java 7 location
Repository: spark Updated Branches: refs/heads/branch-1.1 40284a9a3 - 53fa0486a HOTFIX: Support custom Java 7 location Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/53fa0486 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/53fa0486 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/53fa0486 Branch: refs/heads/branch-1.1 Commit: 53fa0486af202b76dfea08d541c5d874731f81fb Parents: 40284a9 Author: Patrick Wendell pwend...@gmail.com Authored: Wed Aug 6 18:45:03 2014 -0700 Committer: Patrick Wendell pwend...@gmail.com Committed: Wed Aug 6 18:45:36 2014 -0700 -- dev/create-release/create-release.sh | 9 - 1 file changed, 8 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/53fa0486/dev/create-release/create-release.sh -- diff --git a/dev/create-release/create-release.sh b/dev/create-release/create-release.sh index 4247362..1867cf4 100755 --- a/dev/create-release/create-release.sh +++ b/dev/create-release/create-release.sh @@ -35,6 +35,12 @@ RELEASE_VERSION=${RELEASE_VERSION:-1.0.0} RC_NAME=${RC_NAME:-rc2} USER_NAME=${USER_NAME:-pwendell} +if [ -z "$JAVA_HOME" ]; then + echo "Error: JAVA_HOME is not set, cannot proceed." + exit -1 +fi +JAVA_7_HOME=${JAVA_7_HOME:-$JAVA_HOME} + set -e GIT_TAG=v$RELEASE_VERSION-$RC_NAME @@ -130,7 +136,8 @@ scp spark-* \ cd spark sbt/sbt clean cd docs -PRODUCTION=1 jekyll build +# Compile docs with Java 7 to use nicer format +JAVA_HOME=$JAVA_7_HOME PRODUCTION=1 jekyll build echo "Copying release documentation" rc_docs_folder=${rc_folder}-docs ssh $user_n...@people.apache.org \
git commit: HOTFIX: Support custom Java 7 location
Repository: spark Updated Branches: refs/heads/master 4201d2711 - a263a7e9f HOTFIX: Support custom Java 7 location Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a263a7e9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a263a7e9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a263a7e9 Branch: refs/heads/master Commit: a263a7e9f060b3017142cdae5f1270db9458d8d3 Parents: 4201d27 Author: Patrick Wendell pwend...@gmail.com Authored: Wed Aug 6 18:45:03 2014 -0700 Committer: Patrick Wendell pwend...@gmail.com Committed: Wed Aug 6 18:45:19 2014 -0700 -- dev/create-release/create-release.sh | 9 - 1 file changed, 8 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a263a7e9/dev/create-release/create-release.sh -- diff --git a/dev/create-release/create-release.sh b/dev/create-release/create-release.sh index 4247362..1867cf4 100755 --- a/dev/create-release/create-release.sh +++ b/dev/create-release/create-release.sh @@ -35,6 +35,12 @@ RELEASE_VERSION=${RELEASE_VERSION:-1.0.0} RC_NAME=${RC_NAME:-rc2} USER_NAME=${USER_NAME:-pwendell} +if [ -z "$JAVA_HOME" ]; then + echo "Error: JAVA_HOME is not set, cannot proceed." + exit -1 +fi +JAVA_7_HOME=${JAVA_7_HOME:-$JAVA_HOME} + set -e GIT_TAG=v$RELEASE_VERSION-$RC_NAME @@ -130,7 +136,8 @@ scp spark-* \ cd spark sbt/sbt clean cd docs -PRODUCTION=1 jekyll build +# Compile docs with Java 7 to use nicer format +JAVA_HOME=$JAVA_7_HOME PRODUCTION=1 jekyll build echo "Copying release documentation" rc_docs_folder=${rc_folder}-docs ssh $user_n...@people.apache.org \
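The shell idiom ${JAVA_7_HOME:-$JAVA_HOME} means "use JAVA_7_HOME if it is set, otherwise fall back to JAVA_HOME". A sketch of the equivalent logic in Scala, for readers less familiar with parameter expansion (illustrative only, not part of the script):

  // Fail fast if JAVA_HOME is unset, then prefer an explicit JAVA_7_HOME.
  val javaHome = sys.env.getOrElse("JAVA_HOME",
    sys.error("Error: JAVA_HOME is not set, cannot proceed."))
  val java7Home = sys.env.getOrElse("JAVA_7_HOME", javaHome)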
git commit: Updating versions for Spark 1.1.0
Repository: spark Updated Branches: refs/heads/branch-1.1 53fa0486a - cf35b56d4 Updating versions for Spark 1.1.0 Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/cf35b56d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/cf35b56d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/cf35b56d Branch: refs/heads/branch-1.1 Commit: cf35b56d4daed1bb4de3084825842fc750c830f1 Parents: 53fa048 Author: Patrick Wendell pwend...@gmail.com Authored: Wed Aug 6 19:11:39 2014 -0700 Committer: Patrick Wendell pwend...@gmail.com Committed: Wed Aug 6 19:11:39 2014 -0700 -- core/src/main/scala/org/apache/spark/SparkContext.scala| 2 +- docs/_config.yml | 4 ++-- ec2/spark_ec2.py | 2 +- extras/java8-tests/pom.xml | 2 +- python/epydoc.conf | 2 +- python/pyspark/shell.py| 2 +- repl/src/main/scala/org/apache/spark/repl/SparkILoopInit.scala | 2 +- yarn/alpha/pom.xml | 2 +- 8 files changed, 9 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/cf35b56d/core/src/main/scala/org/apache/spark/SparkContext.scala -- diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index e132955..0470fbe 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -1312,7 +1312,7 @@ class SparkContext(config: SparkConf) extends Logging { */ object SparkContext extends Logging { - private[spark] val SPARK_VERSION = "1.0.0" + private[spark] val SPARK_VERSION = "1.1.0" private[spark] val SPARK_JOB_DESCRIPTION = "spark.job.description" http://git-wip-us.apache.org/repos/asf/spark/blob/cf35b56d/docs/_config.yml -- diff --git a/docs/_config.yml b/docs/_config.yml index 45b78fe..84db618 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -3,8 +3,8 @@ markdown: kramdown # These allow the documentation to be updated with newer releases # of Spark, Scala, and Mesos. -SPARK_VERSION: 1.0.0-SNAPSHOT -SPARK_VERSION_SHORT: 1.0.0 +SPARK_VERSION: 1.1.0-SNAPSHOT +SPARK_VERSION_SHORT: 1.1.0 SCALA_BINARY_VERSION: 2.10 SCALA_VERSION: 2.10.4 MESOS_VERSION: 0.18.1 http://git-wip-us.apache.org/repos/asf/spark/blob/cf35b56d/ec2/spark_ec2.py -- diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py index 0c2f85a..fc6fb1d 100755 --- a/ec2/spark_ec2.py +++ b/ec2/spark_ec2.py @@ -196,7 +196,7 @@ def is_active(instance): def get_spark_shark_version(opts): spark_shark_map = { "0.7.3": "0.7.1", "0.8.0": "0.8.0", "0.8.1": "0.8.1", "0.9.0": "0.9.0", "0.9.1": "0.9.1", -"1.0.0": "1.0.0" +"1.0.0": "1.0.0", "1.0.1": "1.0.1", "1.0.2": "1.0.2", "1.1.0": "1.1.0" } version = opts.spark_version.replace("v", "") if version not in spark_shark_map: http://git-wip-us.apache.org/repos/asf/spark/blob/cf35b56d/extras/java8-tests/pom.xml -- diff --git a/extras/java8-tests/pom.xml b/extras/java8-tests/pom.xml index 5308bb4..8ce7b94 100644 --- a/extras/java8-tests/pom.xml +++ b/extras/java8-tests/pom.xml @@ -20,7 +20,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent</artifactId> -<version>1.1.0-SNAPSHOT</version> +<version>1.1.0</version> <relativePath>../../pom.xml</relativePath> /parent> http://git-wip-us.apache.org/repos/asf/spark/blob/cf35b56d/python/epydoc.conf -- diff --git a/python/epydoc.conf b/python/epydoc.conf index 51c0faf..d066ecb 100644 --- a/python/epydoc.conf +++ b/python/epydoc.conf @@ -18,7 +18,7 @@ # # Information about the project. -name: Spark 1.0.0 Python API Docs +name: Spark 1.1.0 Python API Docs url: http://spark.apache.org # The list of modules to document.
Modules can be named using http://git-wip-us.apache.org/repos/asf/spark/blob/cf35b56d/python/pyspark/shell.py -- diff --git a/python/pyspark/shell.py b/python/pyspark/shell.py index e1e7cd9..8a9777a 100644 --- a/python/pyspark/shell.py +++ b/python/pyspark/shell.py @@ -47,7 +47,7 @@ print(Welcome to __ / __/__ ___ _/ /__ _\ \/ _ \/ _ `/ __/ '_/ - /__ / .__/\_,_/_/ /_/\_\ version 1.0.0-SNAPSHOT + /__ / .__/\_,_/_/ /_/\_\ version 1.1.0 /_/ )
git commit: [maven-release-plugin] prepare release v1.1.0-snapshot1
Repository: spark Updated Branches: refs/heads/branch-1.1 cf35b56d4 - d428d8841 [maven-release-plugin] prepare release v1.1.0-snapshot1 Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d428d884 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d428d884 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d428d884 Branch: refs/heads/branch-1.1 Commit: d428d88418d385d1d04e1b0adcb6b068efe9c7b0 Parents: cf35b56 Author: Patrick Wendell pwend...@gmail.com Authored: Thu Aug 7 03:16:14 2014 + Committer: Patrick Wendell pwend...@gmail.com Committed: Thu Aug 7 03:16:14 2014 + -- assembly/pom.xml | 6 +++--- bagel/pom.xml | 2 +- core/pom.xml | 2 +- examples/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml| 2 +- external/kafka/pom.xml| 2 +- external/mqtt/pom.xml | 2 +- external/twitter/pom.xml | 2 +- external/zeromq/pom.xml | 2 +- extras/kinesis-asl/pom.xml| 2 +- extras/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml| 2 +- mllib/pom.xml | 2 +- pom.xml | 9 - repl/pom.xml | 2 +- sql/catalyst/pom.xml | 5 ++--- sql/core/pom.xml | 5 ++--- sql/hive-thriftserver/pom.xml | 5 ++--- sql/hive/pom.xml | 5 ++--- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- yarn/pom.xml | 2 +- yarn/stable/pom.xml | 2 +- 24 files changed, 33 insertions(+), 38 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d428d884/assembly/pom.xml -- diff --git a/assembly/pom.xml b/assembly/pom.xml index 703f159..02dd3d1 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent</artifactId> -<version>1.1.0-SNAPSHOT</version> +<version>1.1.0</version> <relativePath>../pom.xml</relativePath> </parent> @@ -124,8 +124,8 @@ <transformer implementation="org.apache.maven.plugins.shade.resource.DontIncludeResourceTransformer"> <resource>log4j.properties</resource> </transformer> -<transformer implementation="org.apache.maven.plugins.shade.resource.ApacheLicenseResourceTransformer"/> +<transformer implementation="org.apache.maven.plugins.shade.resource.ApacheLicenseResourceTransformer" /> -<transformer implementation="org.apache.maven.plugins.shade.resource.ApacheNoticeResourceTransformer"/> +<transformer implementation="org.apache.maven.plugins.shade.resource.ApacheNoticeResourceTransformer" /> </transformers> </configuration> </execution> http://git-wip-us.apache.org/repos/asf/spark/blob/d428d884/bagel/pom.xml -- diff --git a/bagel/pom.xml b/bagel/pom.xml index bd51b11..8eec7e5 100644 --- a/bagel/pom.xml +++ b/bagel/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent</artifactId> -<version>1.1.0-SNAPSHOT</version> +<version>1.1.0</version> <relativePath>../pom.xml</relativePath> </parent> http://git-wip-us.apache.org/repos/asf/spark/blob/d428d884/core/pom.xml -- diff --git a/core/pom.xml b/core/pom.xml index 6d8be37..83e6026 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent</artifactId> -<version>1.1.0-SNAPSHOT</version> +<version>1.1.0</version> <relativePath>../pom.xml</relativePath> </parent> http://git-wip-us.apache.org/repos/asf/spark/blob/d428d884/examples/pom.xml -- diff --git a/examples/pom.xml b/examples/pom.xml index 8c4c128..9bde90e 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent</artifactId> -<version>1.1.0-SNAPSHOT</version> +<version>1.1.0</version> <relativePath>../pom.xml</relativePath> </parent>
http://git-wip-us.apache.org/repos/asf/spark/blob/d428d884/external/flume-sink/pom.xml -- diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index d0bf1cf..fd317e7 100644 --- a/external/flume-sink/pom.xml +++ b/external/flume-sink/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId>
git commit: [maven-release-plugin] prepare for next development iteration
Repository: spark Updated Branches: refs/heads/branch-1.1 d428d8841 - c204a742a [maven-release-plugin] prepare for next development iteration Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c204a742 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c204a742 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c204a742 Branch: refs/heads/branch-1.1 Commit: c204a742a9eb9d3fd318e0f059bd00cbfb8b2c14 Parents: d428d88 Author: Patrick Wendell pwend...@gmail.com Authored: Thu Aug 7 03:16:23 2014 + Committer: Patrick Wendell pwend...@gmail.com Committed: Thu Aug 7 03:16:23 2014 + -- assembly/pom.xml | 2 +- bagel/pom.xml | 2 +- core/pom.xml | 2 +- examples/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml| 2 +- external/kafka/pom.xml| 2 +- external/mqtt/pom.xml | 2 +- external/twitter/pom.xml | 2 +- external/zeromq/pom.xml | 2 +- extras/kinesis-asl/pom.xml| 2 +- extras/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml| 2 +- mllib/pom.xml | 2 +- pom.xml | 4 ++-- repl/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- yarn/pom.xml | 2 +- yarn/stable/pom.xml | 2 +- 24 files changed, 25 insertions(+), 25 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c204a742/assembly/pom.xml -- diff --git a/assembly/pom.xml b/assembly/pom.xml index 02dd3d1..16e5271 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent</artifactId> -<version>1.1.0</version> +<version>1.1.1-SNAPSHOT</version> <relativePath>../pom.xml</relativePath> </parent> http://git-wip-us.apache.org/repos/asf/spark/blob/c204a742/bagel/pom.xml -- diff --git a/bagel/pom.xml b/bagel/pom.xml index 8eec7e5..f29540b 100644 --- a/bagel/pom.xml +++ b/bagel/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent</artifactId> -<version>1.1.0</version> +<version>1.1.1-SNAPSHOT</version> <relativePath>../pom.xml</relativePath> </parent> http://git-wip-us.apache.org/repos/asf/spark/blob/c204a742/core/pom.xml -- diff --git a/core/pom.xml b/core/pom.xml index 83e6026..debc4dd 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent</artifactId> -<version>1.1.0</version> +<version>1.1.1-SNAPSHOT</version> <relativePath>../pom.xml</relativePath> </parent> http://git-wip-us.apache.org/repos/asf/spark/blob/c204a742/examples/pom.xml -- diff --git a/examples/pom.xml b/examples/pom.xml index 9bde90e..f35d3d6 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent</artifactId> -<version>1.1.0</version> +<version>1.1.1-SNAPSHOT</version> <relativePath>../pom.xml</relativePath> </parent> http://git-wip-us.apache.org/repos/asf/spark/blob/c204a742/external/flume-sink/pom.xml -- diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index fd317e7..cfbf943 100644 --- a/external/flume-sink/pom.xml +++ b/external/flume-sink/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent</artifactId> -<version>1.1.0</version> +<version>1.1.1-SNAPSHOT</version> <relativePath>../../pom.xml</relativePath> </parent> http://git-wip-us.apache.org/repos/asf/spark/blob/c204a742/external/flume/pom.xml -- diff --git a/external/flume/pom.xml b/external/flume/pom.xml index 6563f4d..b127136 100644 --- a/external/flume/pom.xml +++
b/external/flume/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent</artifactId> -<version>1.1.0</version> +<version>1.1.1-SNAPSHOT</version> <relativePath>../../pom.xml</relativePath> </parent> http://git-wip-us.apache.org/repos/asf/spark/blob/c204a742/external/kafka/pom.xml
Git Push Summary
Repository: spark Updated Tags: refs/tags/v1.1.0-snapshot1 [created] db4a0a5e8
git commit: [SPARK-2887] fix bug of countApproxDistinct() when have more than one partition
Repository: spark Updated Branches: refs/heads/master a263a7e9f - ffd1f59a6 [SPARK-2887] fix bug of countApproxDistinct() when have more than one partition fix bug of countApproxDistinct() when have more than one partition Author: Davies Liu davies@gmail.com Closes #1812 from davies/approx and squashes the following commits: bf757ce [Davies Liu] fix bug of countApproxDistinct() when have more than one partition Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ffd1f59a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ffd1f59a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ffd1f59a Branch: refs/heads/master Commit: ffd1f59a62a9dd9a4d5a7b09490b9d01ff1cd42d Parents: a263a7e Author: Davies Liu davies@gmail.com Authored: Wed Aug 6 21:22:13 2014 -0700 Committer: Patrick Wendell pwend...@gmail.com Committed: Wed Aug 6 21:22:13 2014 -0700 -- core/src/main/scala/org/apache/spark/rdd/RDD.scala | 2 +- core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala | 10 +- 2 files changed, 6 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ffd1f59a/core/src/main/scala/org/apache/spark/rdd/RDD.scala -- diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index e1c49e3..0159003 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -1004,7 +1004,7 @@ abstract class RDD[T: ClassTag]( }, (h1: HyperLogLogPlus, h2: HyperLogLogPlus) => { h1.addAll(h2) -h2 +h1 }).cardinality() } http://git-wip-us.apache.org/repos/asf/spark/blob/ffd1f59a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala index b31e3a0..4a7dc8d 100644 --- a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala @@ -81,11 +81,11 @@ class RDDSuite extends FunSuite with SharedSparkContext { def error(est: Long, size: Long) = math.abs(est - size) / size.toDouble -val size = 100 -val uniformDistro = for (i <- 1 to 10) yield i % size -val simpleRdd = sc.makeRDD(uniformDistro) -assert(error(simpleRdd.countApproxDistinct(4, 0), size) < 0.4) -assert(error(simpleRdd.countApproxDistinct(8, 0), size) < 0.1) +val size = 1000 +val uniformDistro = for (i <- 1 to 5000) yield i % size +val simpleRdd = sc.makeRDD(uniformDistro, 10) +assert(error(simpleRdd.countApproxDistinct(8, 0), size) < 0.2) +assert(error(simpleRdd.countApproxDistinct(12, 0), size) < 0.1) } test("SparkContext.union") {
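Why returning h2 was wrong: addAll mutates its receiver, so after h1.addAll(h2) the merged state lives in h1; returning h2 hands later merge steps an accumulator that is missing everything h1 had already absorbed, which only surfaces once more than one partition feeds the reduction. A self-contained sketch (not the Spark code) using a mutable Set as a stand-in for HyperLogLogPlus:

  import scala.collection.mutable

  // ++= mutates the receiver, just like HyperLogLogPlus.addAll.
  def mergeWrong(h1: mutable.Set[Int], h2: mutable.Set[Int]) = { h1 ++= h2; h2 }
  def mergeRight(h1: mutable.Set[Int], h2: mutable.Set[Int]) = { h1 ++= h2; h1 }

  val perPartition = Seq(mutable.Set(1, 2), mutable.Set(3, 4), mutable.Set(5, 6))
  perPartition.reduce(mergeWrong) // Set(5, 6): earlier partitions' contents are dropped

  // Fresh copies, since the sets above were mutated by the first reduce:
  val again = Seq(mutable.Set(1, 2), mutable.Set(3, 4), mutable.Set(5, 6))
  again.reduce(mergeRight)        // Set(1, 2, 3, 4, 5, 6): all partitions merged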
git commit: [SPARK-2887] fix bug of countApproxDistinct() when have more than one partition
Repository: spark Updated Branches: refs/heads/branch-1.1 c204a742a - cc8a7e97e [SPARK-2887] fix bug of countApproxDistinct() when have more than one partition fix bug of countApproxDistinct() when have more than one partition Author: Davies Liu davies@gmail.com Closes #1812 from davies/approx and squashes the following commits: bf757ce [Davies Liu] fix bug of countApproxDistinct() when have more than one partition (cherry picked from commit ffd1f59a62a9dd9a4d5a7b09490b9d01ff1cd42d) Signed-off-by: Patrick Wendell pwend...@gmail.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/cc8a7e97 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/cc8a7e97 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/cc8a7e97 Branch: refs/heads/branch-1.1 Commit: cc8a7e97e1c9190fcb6093ad9c94e7f0730af94c Parents: c204a74 Author: Davies Liu davies@gmail.com Authored: Wed Aug 6 21:22:13 2014 -0700 Committer: Patrick Wendell pwend...@gmail.com Committed: Wed Aug 6 21:22:27 2014 -0700 -- core/src/main/scala/org/apache/spark/rdd/RDD.scala | 2 +- core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala | 10 +- 2 files changed, 6 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/cc8a7e97/core/src/main/scala/org/apache/spark/rdd/RDD.scala -- diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index e1c49e3..0159003 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -1004,7 +1004,7 @@ abstract class RDD[T: ClassTag]( }, (h1: HyperLogLogPlus, h2: HyperLogLogPlus) => { h1.addAll(h2) -h2 +h1 }).cardinality() } http://git-wip-us.apache.org/repos/asf/spark/blob/cc8a7e97/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala index b31e3a0..4a7dc8d 100644 --- a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala @@ -81,11 +81,11 @@ class RDDSuite extends FunSuite with SharedSparkContext { def error(est: Long, size: Long) = math.abs(est - size) / size.toDouble -val size = 100 -val uniformDistro = for (i <- 1 to 10) yield i % size -val simpleRdd = sc.makeRDD(uniformDistro) -assert(error(simpleRdd.countApproxDistinct(4, 0), size) < 0.4) -assert(error(simpleRdd.countApproxDistinct(8, 0), size) < 0.1) +val size = 1000 +val uniformDistro = for (i <- 1 to 5000) yield i % size +val simpleRdd = sc.makeRDD(uniformDistro, 10) +assert(error(simpleRdd.countApproxDistinct(8, 0), size) < 0.2) +assert(error(simpleRdd.countApproxDistinct(12, 0), size) < 0.1) } test("SparkContext.union") {