spark git commit: [SPARK-11462][STREAMING] Add JavaStreamingListener
Repository: spark Updated Branches: refs/heads/master 0ce6f9b2d -> 1f0f14efe [SPARK-11462][STREAMING] Add JavaStreamingListener Currently, StreamingListener is not Java-friendly because it exposes some Scala collections to Java users directly, such as Option and Map. This PR adds a Java version of StreamingListener and a set of Java-friendly classes for Java users. Author: zsxwing Author: Shixiong Zhu Closes #9420 from zsxwing/java-streaming-listener. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1f0f14ef Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1f0f14ef Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1f0f14ef Branch: refs/heads/master Commit: 1f0f14efe35f986e338ee2cbc1ef2a9ce7395c00 Parents: 0ce6f9b Author: zsxwing Authored: Mon Nov 9 17:38:19 2015 -0800 Committer: Tathagata Das Committed: Mon Nov 9 17:38:19 2015 -0800 -- .../api/java/JavaStreamingListener.scala| 168 +++ .../api/java/JavaStreamingListenerWrapper.scala | 122 .../JavaStreamingListenerAPISuite.java | 85 ++ .../JavaStreamingListenerWrapperSuite.scala | 290 +++ 4 files changed, 665 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1f0f14ef/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingListener.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingListener.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingListener.scala new file mode 100644 index 000..c86c710 --- /dev/null +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingListener.scala @@ -0,0 +1,168 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.streaming.api.java + +import org.apache.spark.streaming.Time + +/** + * A listener interface for receiving information about an ongoing streaming computation. + */ +private[streaming] class JavaStreamingListener { + + /** Called when a receiver has been started */ + def onReceiverStarted(receiverStarted: JavaStreamingListenerReceiverStarted): Unit = { } + + /** Called when a receiver has reported an error */ + def onReceiverError(receiverError: JavaStreamingListenerReceiverError): Unit = { } + + /** Called when a receiver has been stopped */ + def onReceiverStopped(receiverStopped: JavaStreamingListenerReceiverStopped): Unit = { } + + /** Called when a batch of jobs has been submitted for processing. */ + def onBatchSubmitted(batchSubmitted: JavaStreamingListenerBatchSubmitted): Unit = { } + + /** Called when processing of a batch of jobs has started. */ + def onBatchStarted(batchStarted: JavaStreamingListenerBatchStarted): Unit = { } + + /** Called when processing of a batch of jobs has completed.
*/ + def onBatchCompleted(batchCompleted: JavaStreamingListenerBatchCompleted): Unit = { } + + /** Called when processing of a job of a batch has started. */ + def onOutputOperationStarted( + outputOperationStarted: JavaStreamingListenerOutputOperationStarted): Unit = { } + + /** Called when processing of a job of a batch has completed. */ + def onOutputOperationCompleted( + outputOperationCompleted: JavaStreamingListenerOutputOperationCompleted): Unit = { } +} + +/** + * Base trait for events related to JavaStreamingListener + */ +private[streaming] sealed trait JavaStreamingListenerEvent + +private[streaming] class JavaStreamingListenerBatchSubmitted(val batchInfo: JavaBatchInfo) + extends JavaStreamingListenerEvent + +private[streaming] class JavaStreamingListenerBatchCompleted(val batchInfo: JavaBatchInfo) + extends JavaStreamingListenerEvent + +private[streaming] class JavaStreamingListenerBatchStarted(val batchInfo: JavaBatchInfo) + extends JavaStreamingListenerEvent + +private[streaming] class
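To see how the new API is meant to be consumed, here is a minimal sketch of a listener subclass. It is hypothetical, not part of the patch: it assumes only the callback signatures shown in the diff above, and note that at this commit the class is still private[streaming], so such a subclass can only live inside the streaming package.

```
import org.apache.spark.streaming.api.java.{JavaStreamingListener, JavaStreamingListenerBatchCompleted}

// Minimal sketch of a listener: override only the callbacks you need;
// the base class provides no-op defaults for all events.
class PrintingListener extends JavaStreamingListener {
  override def onBatchCompleted(
      batchCompleted: JavaStreamingListenerBatchCompleted): Unit = {
    // JavaStreamingListenerBatchCompleted wraps a JavaBatchInfo, which
    // exposes plain values rather than Scala Option/Map.
    println(s"Batch completed: ${batchCompleted.batchInfo}")
  }
}
```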
spark git commit: [SPARK-11359][STREAMING][KINESIS] Checkpoint to DynamoDB even when new data doesn't come in
Repository: spark Updated Branches: refs/heads/branch-1.6 bdd8a6bd4 -> 9e80db7c7 [SPARK-11359][STREAMING][KINESIS] Checkpoint to DynamoDB even when new data doesn't come in Currently, the checkpoints to DynamoDB occur only when new data comes in, as we update the clock for the checkpointState. This PR makes the checkpoint a scheduled execution based on the `checkpointInterval`. Author: Burak YavuzCloses #9421 from brkyvz/kinesis-checkpoint. (cherry picked from commit a3a7c9103e136035d65a5564f9eb0fa04727c4f3) Signed-off-by: Tathagata Das Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9e80db7c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9e80db7c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9e80db7c Branch: refs/heads/branch-1.6 Commit: 9e80db7c7d1600691a5c012610e3f28f35210d46 Parents: bdd8a6b Author: Burak Yavuz Authored: Mon Nov 9 14:39:18 2015 -0800 Committer: Tathagata Das Committed: Mon Nov 9 14:39:30 2015 -0800 -- .../kinesis/KinesisCheckpointState.scala| 54 --- .../streaming/kinesis/KinesisCheckpointer.scala | 133 .../streaming/kinesis/KinesisReceiver.scala | 38 - .../kinesis/KinesisRecordProcessor.scala| 59 ++- .../kinesis/KinesisCheckpointerSuite.scala | 152 +++ .../kinesis/KinesisReceiverSuite.scala | 96 +++- 6 files changed, 349 insertions(+), 183 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9e80db7c/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointState.scala -- diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointState.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointState.scala deleted file mode 100644 index 83a4537..000 --- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointState.scala +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - *http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.spark.streaming.kinesis - -import org.apache.spark.Logging -import org.apache.spark.streaming.Duration -import org.apache.spark.util.{Clock, ManualClock, SystemClock} - -/** - * This is a helper class for managing checkpoint clocks. - * - * @param checkpointInterval - * @param currentClock. 
Default to current SystemClock if none is passed in (mocking purposes) - */ -private[kinesis] class KinesisCheckpointState( -checkpointInterval: Duration, -currentClock: Clock = new SystemClock()) - extends Logging { - - /* Initialize the checkpoint clock using the given currentClock + checkpointInterval millis */ - val checkpointClock = new ManualClock() - checkpointClock.setTime(currentClock.getTimeMillis() + checkpointInterval.milliseconds) - - /** - * Check if it's time to checkpoint based on the current time and the derived time - * for the next checkpoint - * - * @return true if it's time to checkpoint - */ - def shouldCheckpoint(): Boolean = { -new SystemClock().getTimeMillis() > checkpointClock.getTimeMillis() - } - - /** - * Advance the checkpoint clock by the checkpoint interval. - */ - def advanceCheckpoint(): Unit = { -checkpointClock.advance(checkpointInterval.milliseconds) - } -} http://git-wip-us.apache.org/repos/asf/spark/blob/9e80db7c/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointer.scala -- diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointer.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointer.scala new file mode 100644 index 000..1ca6d43 --- /dev/null +++
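Most of the new KinesisCheckpointer is truncated above, but the idea that replaces this clock-comparison helper is a timer: checkpoint every `checkpointInterval`, whether or not data arrives. A rough sketch of that pattern follows, with illustrative names that are not the ones used in the actual file:

```
import java.util.concurrent.{Executors, TimeUnit}
import org.apache.spark.streaming.Duration

// Sketch: run checkpoints on a fixed schedule instead of piggybacking
// on record arrival, so idle shards still advance their checkpoint.
class ScheduledCheckpointerSketch(checkpointInterval: Duration) {
  private val executor = Executors.newSingleThreadScheduledExecutor()

  def start(doCheckpoint: () => Unit): Unit = {
    val task = new Runnable { override def run(): Unit = doCheckpoint() }
    executor.scheduleAtFixedRate(
      task,
      checkpointInterval.milliseconds, // initial delay
      checkpointInterval.milliseconds, // period
      TimeUnit.MILLISECONDS)
  }

  def shutdown(): Unit = executor.shutdown()
}
```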
spark git commit: [SPARK-11198][STREAMING][KINESIS] Support de-aggregation of records during recovery
Repository: spark Updated Branches: refs/heads/master 61f9c8711 -> 26062d226 [SPARK-11198][STREAMING][KINESIS] Support de-aggregation of records during recovery While the KCL handles de-aggregation during the regular operation, during recovery we use the lower level api, and therefore need to de-aggregate the records. tdas Testing is an issue, we need protobuf magic to do the aggregated records. Maybe we could depend on KPL for tests? Author: Burak Yavuz <brk...@gmail.com> Closes #9403 from brkyvz/kinesis-deaggregation. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/26062d22 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/26062d22 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/26062d22 Branch: refs/heads/master Commit: 26062d22607e1f9854bc2588ba22a4e0f8bba48c Parents: 61f9c87 Author: Burak Yavuz <brk...@gmail.com> Authored: Mon Nov 9 17:18:49 2015 -0800 Committer: Tathagata Das <tathagata.das1...@gmail.com> Committed: Mon Nov 9 17:18:49 2015 -0800 -- extras/kinesis-asl/pom.xml | 6 + .../kinesis/KinesisBackedBlockRDD.scala | 6 +- .../streaming/kinesis/KinesisReceiver.scala | 1 - .../kinesis/KinesisRecordProcessor.scala| 2 +- .../streaming/kinesis/KinesisTestUtils.scala| 235 .../kinesis/KinesisBackedBlockRDDSuite.scala| 12 +- .../streaming/kinesis/KinesisStreamSuite.scala | 17 +- .../streaming/kinesis/KinesisTestUtils.scala| 266 +++ pom.xml | 2 + 9 files changed, 299 insertions(+), 248 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/26062d22/extras/kinesis-asl/pom.xml -- diff --git a/extras/kinesis-asl/pom.xml b/extras/kinesis-asl/pom.xml index ef72d97..519a920 100644 --- a/extras/kinesis-asl/pom.xml +++ b/extras/kinesis-asl/pom.xml @@ -65,6 +65,12 @@ ${aws.java.sdk.version} + com.amazonaws + amazon-kinesis-producer + ${aws.kinesis.producer.version} + test + + org.mockito mockito-core test http://git-wip-us.apache.org/repos/asf/spark/blob/26062d22/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDD.scala -- diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDD.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDD.scala index 000897a..691c179 100644 --- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDD.scala +++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDD.scala @@ -23,6 +23,7 @@ import scala.util.control.NonFatal import com.amazonaws.auth.{AWSCredentials, DefaultAWSCredentialsProviderChain} import com.amazonaws.services.kinesis.AmazonKinesisClient +import com.amazonaws.services.kinesis.clientlibrary.types.UserRecord import com.amazonaws.services.kinesis.model._ import org.apache.spark._ @@ -210,7 +211,10 @@ class KinesisSequenceRangeIterator( s"getting records using shard iterator") { client.getRecords(getRecordsRequest) } -(getRecordsResult.getRecords.iterator().asScala, getRecordsResult.getNextShardIterator) +// De-aggregate records, if KPL was used in producing the records. The KCL automatically +// handles de-aggregation during regular operation. 
This code path is used during recovery +val recordIterator = UserRecord.deaggregate(getRecordsResult.getRecords) +(recordIterator.iterator().asScala, getRecordsResult.getNextShardIterator) } /** http://git-wip-us.apache.org/repos/asf/spark/blob/26062d22/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala -- diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala index 50993f1..97dbb91 100644 --- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala +++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala @@ -216,7 +216,6 @@ private[kinesis] class KinesisReceiver[T]( val metadata = SequenceNumberRange(streamName, shardId, records.get(0).getSequenceNumber(), records.get(records.size() - 1).getSequenceNumber()) blockGenerator.addMultipleDataWithCallback(dataIterator, metadata) - } }
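The one-line core of the change is the call to KCL's UserRecord.deaggregate, which expands KPL-aggregated records into individual user records and passes non-aggregated records through unchanged. In isolation the call looks like this sketch (UserRecord extends Record, so downstream code that consumed plain records keeps working):

```
import scala.collection.JavaConverters._
import com.amazonaws.services.kinesis.clientlibrary.types.UserRecord
import com.amazonaws.services.kinesis.model.Record

// Sketch: `rawRecords` stands in for getRecordsResult.getRecords.
def deaggregateAll(rawRecords: java.util.List[Record]): Iterator[Record] = {
  // Expands any KPL-aggregated records; plain records pass through.
  UserRecord.deaggregate(rawRecords).iterator().asScala
}
```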
spark git commit: [SPARK-11198][STREAMING][KINESIS] Support de-aggregation of records during recovery
Repository: spark Updated Branches: refs/heads/branch-1.6 34e824d90 -> 116b7158f [SPARK-11198][STREAMING][KINESIS] Support de-aggregation of records during recovery While the KCL handles de-aggregation during the regular operation, during recovery we use the lower level api, and therefore need to de-aggregate the records. tdas Testing is an issue, we need protobuf magic to do the aggregated records. Maybe we could depend on KPL for tests? Author: Burak Yavuz <brk...@gmail.com> Closes #9403 from brkyvz/kinesis-deaggregation. (cherry picked from commit 26062d22607e1f9854bc2588ba22a4e0f8bba48c) Signed-off-by: Tathagata Das <tathagata.das1...@gmail.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/116b7158 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/116b7158 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/116b7158 Branch: refs/heads/branch-1.6 Commit: 116b7158fa27cf9dbd935be1f395c68d2f8928ec Parents: 34e824d Author: Burak Yavuz <brk...@gmail.com> Authored: Mon Nov 9 17:18:49 2015 -0800 Committer: Tathagata Das <tathagata.das1...@gmail.com> Committed: Mon Nov 9 17:18:59 2015 -0800 -- extras/kinesis-asl/pom.xml | 6 + .../kinesis/KinesisBackedBlockRDD.scala | 6 +- .../streaming/kinesis/KinesisReceiver.scala | 1 - .../kinesis/KinesisRecordProcessor.scala| 2 +- .../streaming/kinesis/KinesisTestUtils.scala| 235 .../kinesis/KinesisBackedBlockRDDSuite.scala| 12 +- .../streaming/kinesis/KinesisStreamSuite.scala | 17 +- .../streaming/kinesis/KinesisTestUtils.scala| 266 +++ pom.xml | 2 + 9 files changed, 299 insertions(+), 248 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/116b7158/extras/kinesis-asl/pom.xml -- diff --git a/extras/kinesis-asl/pom.xml b/extras/kinesis-asl/pom.xml index ef72d97..519a920 100644 --- a/extras/kinesis-asl/pom.xml +++ b/extras/kinesis-asl/pom.xml @@ -65,6 +65,12 @@ ${aws.java.sdk.version} + com.amazonaws + amazon-kinesis-producer + ${aws.kinesis.producer.version} + test + + org.mockito mockito-core test http://git-wip-us.apache.org/repos/asf/spark/blob/116b7158/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDD.scala -- diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDD.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDD.scala index 000897a..691c179 100644 --- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDD.scala +++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDD.scala @@ -23,6 +23,7 @@ import scala.util.control.NonFatal import com.amazonaws.auth.{AWSCredentials, DefaultAWSCredentialsProviderChain} import com.amazonaws.services.kinesis.AmazonKinesisClient +import com.amazonaws.services.kinesis.clientlibrary.types.UserRecord import com.amazonaws.services.kinesis.model._ import org.apache.spark._ @@ -210,7 +211,10 @@ class KinesisSequenceRangeIterator( s"getting records using shard iterator") { client.getRecords(getRecordsRequest) } -(getRecordsResult.getRecords.iterator().asScala, getRecordsResult.getNextShardIterator) +// De-aggregate records, if KPL was used in producing the records. The KCL automatically +// handles de-aggregation during regular operation. 
This code path is used during recovery +val recordIterator = UserRecord.deaggregate(getRecordsResult.getRecords) +(recordIterator.iterator().asScala, getRecordsResult.getNextShardIterator) } /** http://git-wip-us.apache.org/repos/asf/spark/blob/116b7158/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala -- diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala index 50993f1..97dbb91 100644 --- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala +++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala @@ -216,7 +216,6 @@ private[kinesis] class KinesisReceiver[T]( val metadata = SequenceNumberRange(streamName, shardId, records.get(0).getSequenceNumb
spark git commit: [SPARK-5569][STREAMING] fix ObjectInputStreamWithLoader to support loading array classes.
Repository: spark Updated Branches: refs/heads/master 8f888eea1 -> 17f499920 [SPARK-5569][STREAMING] fix ObjectInputStreamWithLoader to support loading array classes. When using the Kafka DirectStream API to create a checkpoint and restoring the saved checkpoint on restart, a ClassNotFoundException would occur. The reason for this error is that ObjectInputStreamWithLoader extends the ObjectInputStream class and overrides its resolveClass method. But instead of using Class.forName(desc, false, loader), Spark uses loader.loadClass(desc) to instantiate the class, which does not work with array classes. For example: Class.forName("[Lorg.apache.spark.streaming.kafka.OffsetRange;", false, loader) works well, while loader.loadClass("[Lorg.apache.spark.streaming.kafka.OffsetRange") would throw a ClassNotFoundException. Details of the difference between Class.forName and loader.loadClass can be found here: http://bugs.java.com/view_bug.do?bug_id=6446627 Author: maxwell Author: DEMING ZHU Closes #8955 from maxwellzdm/master. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/17f49992 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/17f49992 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/17f49992 Branch: refs/heads/master Commit: 17f499920776e0e995434cfa300ff2ff38658fa8 Parents: 8f888ee Author: maxwell Authored: Tue Oct 27 01:31:28 2015 -0700 Committer: Tathagata Das Committed: Tue Oct 27 01:31:28 2015 -0700 -- .../org/apache/spark/streaming/Checkpoint.scala | 4 ++- .../spark/streaming/CheckpointSuite.scala | 35 ++-- 2 files changed, 36 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/17f49992/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala b/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala index 8a6050f..b7de6dd 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala @@ -352,7 +352,9 @@ class ObjectInputStreamWithLoader(inputStream_ : InputStream, loader: ClassLoade override def resolveClass(desc: ObjectStreamClass): Class[_] = { try { - return loader.loadClass(desc.getName()) + // scalastyle:off classforname + return Class.forName(desc.getName(), false, loader) + // scalastyle:on classforname } catch { case e: Exception => } http://git-wip-us.apache.org/repos/asf/spark/blob/17f49992/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala -- diff --git a/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala index a695653..84f5294 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala @@ -17,7 +17,8 @@ package org.apache.spark.streaming -import java.io.File +import java.io.{ObjectOutputStream, ByteArrayOutputStream, ByteArrayInputStream, File} +import org.apache.spark.TestUtils import scala.collection.mutable.{ArrayBuffer, SynchronizedBuffer} import scala.reflect.ClassTag @@ -34,7 +35,7 @@ import org.scalatest.time.SpanSugar._ import org.apache.spark.streaming.dstream.{DStream, FileInputDStream} import org.apache.spark.streaming.scheduler.{ConstantEstimator, RateTestInputDStream, RateTestReceiver} -import org.apache.spark.util.{Clock, ManualClock, Utils}
+import org.apache.spark.util.{MutableURLClassLoader, Clock, ManualClock, Utils} /** * This test suites tests the checkpointing functionality of DStreams - @@ -579,6 +580,36 @@ class CheckpointSuite extends TestSuiteBase { } } + // This tests whether spark can deserialize array object + // refer to SPARK-5569 + test("recovery from checkpoint contains array object") { +// create a class which is invisible to app class loader +val jar = TestUtils.createJarWithClasses( + classNames = Seq("testClz"), + toStringValue = "testStringValue" + ) + +// invisible to current class loader +val appClassLoader = getClass.getClassLoader +intercept[ClassNotFoundException](appClassLoader.loadClass("testClz")) + +// visible to mutableURLClassLoader +val loader = new MutableURLClassLoader( + Array(jar), appClassLoader) +assert(loader.loadClass("testClz").newInstance().toString ==
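The JVM-level difference the fix relies on is easy to reproduce outside Spark: Class.forName resolves array descriptor names (the "[L...;" form), while ClassLoader.loadClass does not. A self-contained sketch:

```
// Sketch: demonstrates the loadClass vs. Class.forName difference for
// array classes that motivates this patch (JDK bug 6446627, linked above).
object ArrayClassLoading {
  def main(args: Array[String]): Unit = {
    val loader = Thread.currentThread().getContextClassLoader
    val arrayClassName = "[Ljava.lang.String;"

    // Succeeds: Class.forName understands array descriptors.
    println(Class.forName(arrayClassName, false, loader).getName)

    // Throws ClassNotFoundException: loadClass does not handle them.
    try loader.loadClass(arrayClassName)
    catch { case e: ClassNotFoundException => println(s"loadClass failed: $e") }
  }
}
```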
spark git commit: [SPARK-11270][STREAMING] Add improved equality testing for TopicAndPartition from the Kafka Streaming API
Repository: spark Updated Branches: refs/heads/master feb8d6a44 -> 8f888eea1 [SPARK-11270][STREAMING] Add improved equality testing for TopicAndPartition from the Kafka Streaming API jerryshao tdas I know this is kind of minor, and I know you all are busy, but this brings this class in line with the `OffsetRange` class, and makes tests a little more concise. Instead of doing something like: ``` assert topic_and_partition_instance._topic == "foo" assert topic_and_partition_instance._partition == 0 ``` You can do something like: ``` assert topic_and_partition_instance == TopicAndPartition("foo", 0) ``` Before: ``` >>> from pyspark.streaming.kafka import TopicAndPartition >>> TopicAndPartition("foo", 0) == TopicAndPartition("foo", 0) False ``` After: ``` >>> from pyspark.streaming.kafka import TopicAndPartition >>> TopicAndPartition("foo", 0) == TopicAndPartition("foo", 0) True ``` I couldn't find any tests - am I missing something? Author: Nick Evans <m...@nicolasevans.org> Closes #9236 from manygrams/topic_and_partition_equality. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8f888eea Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8f888eea Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8f888eea Branch: refs/heads/master Commit: 8f888eea1aef5a28916ec406a99fc19648681ecf Parents: feb8d6a Author: Nick Evans <m...@nicolasevans.org> Authored: Tue Oct 27 01:29:06 2015 -0700 Committer: Tathagata Das <tathagata.das1...@gmail.com> Committed: Tue Oct 27 01:29:06 2015 -0700 -- python/pyspark/streaming/kafka.py | 10 ++ python/pyspark/streaming/tests.py | 10 ++ 2 files changed, 20 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8f888eea/python/pyspark/streaming/kafka.py -- diff --git a/python/pyspark/streaming/kafka.py b/python/pyspark/streaming/kafka.py index b35bbaf..06e1591 100644 --- a/python/pyspark/streaming/kafka.py +++ b/python/pyspark/streaming/kafka.py @@ -254,6 +254,16 @@ class TopicAndPartition(object): def _jTopicAndPartition(self, helper): return helper.createTopicAndPartition(self._topic, self._partition) +def __eq__(self, other): +if isinstance(other, self.__class__): +return (self._topic == other._topic +and self._partition == other._partition) +else: +return False + +def __ne__(self, other): +return not self.__eq__(other) + class Broker(object): """ http://git-wip-us.apache.org/repos/asf/spark/blob/8f888eea/python/pyspark/streaming/tests.py -- diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py index 2c908da..f7fa481 100644 --- a/python/pyspark/streaming/tests.py +++ b/python/pyspark/streaming/tests.py @@ -898,6 +898,16 @@ class KafkaStreamTests(PySparkStreamingTestCase): self.assertEqual(offsetRanges, [OffsetRange(topic, 0, long(0), long(6))]) +def test_topic_and_partition_equality(self): +topic_and_partition_a = TopicAndPartition("foo", 0) +topic_and_partition_b = TopicAndPartition("foo", 0) +topic_and_partition_c = TopicAndPartition("bar", 0) +topic_and_partition_d = TopicAndPartition("foo", 1) + +self.assertEqual(topic_and_partition_a, topic_and_partition_b) +self.assertNotEqual(topic_and_partition_a, topic_and_partition_c) +self.assertNotEqual(topic_and_partition_a, topic_and_partition_d) + class FlumeStreamTests(PySparkStreamingTestCase): timeout = 20 # seconds - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-11270][STREAMING] Add improved equality testing for TopicAndPartition from the Kafka Streaming API
Repository: spark Updated Branches: refs/heads/branch-1.5 8a6e63c78 -> abb0ca7a9 [SPARK-11270][STREAMING] Add improved equality testing for TopicAndPartition from the Kafka Streaming API jerryshao tdas I know this is kind of minor, and I know you all are busy, but this brings this class in line with the `OffsetRange` class, and makes tests a little more concise. Instead of doing something like: ``` assert topic_and_partition_instance._topic == "foo" assert topic_and_partition_instance._partition == 0 ``` You can do something like: ``` assert topic_and_partition_instance == TopicAndPartition("foo", 0) ``` Before: ``` >>> from pyspark.streaming.kafka import TopicAndPartition >>> TopicAndPartition("foo", 0) == TopicAndPartition("foo", 0) False ``` After: ``` >>> from pyspark.streaming.kafka import TopicAndPartition >>> TopicAndPartition("foo", 0) == TopicAndPartition("foo", 0) True ``` I couldn't find any tests - am I missing something? Author: Nick Evans <m...@nicolasevans.org> Closes #9236 from manygrams/topic_and_partition_equality. (cherry picked from commit 8f888eea1aef5a28916ec406a99fc19648681ecf) Signed-off-by: Tathagata Das <tathagata.das1...@gmail.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/abb0ca7a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/abb0ca7a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/abb0ca7a Branch: refs/heads/branch-1.5 Commit: abb0ca7a947f4803f3d16d65d1f6c53930890dee Parents: 8a6e63c Author: Nick Evans <m...@nicolasevans.org> Authored: Tue Oct 27 01:29:06 2015 -0700 Committer: Tathagata Das <tathagata.das1...@gmail.com> Committed: Tue Oct 27 01:30:29 2015 -0700 -- python/pyspark/streaming/kafka.py | 10 ++ python/pyspark/streaming/tests.py | 10 ++ 2 files changed, 20 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/abb0ca7a/python/pyspark/streaming/kafka.py -- diff --git a/python/pyspark/streaming/kafka.py b/python/pyspark/streaming/kafka.py index 8a814c6..f7b59d6 100644 --- a/python/pyspark/streaming/kafka.py +++ b/python/pyspark/streaming/kafka.py @@ -254,6 +254,16 @@ class TopicAndPartition(object): def _jTopicAndPartition(self, helper): return helper.createTopicAndPartition(self._topic, self._partition) +def __eq__(self, other): +if isinstance(other, self.__class__): +return (self._topic == other._topic +and self._partition == other._partition) +else: +return False + +def __ne__(self, other): +return not self.__eq__(other) + class Broker(object): """ http://git-wip-us.apache.org/repos/asf/spark/blob/abb0ca7a/python/pyspark/streaming/tests.py -- diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py index cfea95b..a8c7b51 100644 --- a/python/pyspark/streaming/tests.py +++ b/python/pyspark/streaming/tests.py @@ -887,6 +887,16 @@ class KafkaStreamTests(PySparkStreamingTestCase): self.assertEqual(offsetRanges, [OffsetRange(topic, 0, long(0), long(6))]) +def test_topic_and_partition_equality(self): +topic_and_partition_a = TopicAndPartition("foo", 0) +topic_and_partition_b = TopicAndPartition("foo", 0) +topic_and_partition_c = TopicAndPartition("bar", 0) +topic_and_partition_d = TopicAndPartition("foo", 1) + +self.assertEqual(topic_and_partition_a, topic_and_partition_b) +self.assertNotEqual(topic_and_partition_a, topic_and_partition_c) +self.assertNotEqual(topic_and_partition_a, topic_and_partition_d) + class FlumeStreamTests(PySparkStreamingTestCase): timeout = 20 # seconds - To unsubscribe, e-mail: 
spark git commit: [SPARK-11324][STREAMING] Flag for closing Write Ahead Logs after a write
Repository: spark Updated Branches: refs/heads/master 9dba5fb2b -> 4f030b9e8 [SPARK-11324][STREAMING] Flag for closing Write Ahead Logs after a write Currently the Write Ahead Log in Spark Streaming flushes data as writes need to be made. S3 does not support flushing of data, data is written once the stream is actually closed. In case of failure, the data for the last minute (default rolling interval) will not be properly written. Therefore we need a flag to close the stream after the write, so that we achieve read after write consistency. cc tdas zsxwing Author: Burak Yavuz <brk...@gmail.com> Closes #9285 from brkyvz/caw-wal. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4f030b9e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4f030b9e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4f030b9e Branch: refs/heads/master Commit: 4f030b9e82172659d250281782ac573cbd1438fc Parents: 9dba5fb Author: Burak Yavuz <brk...@gmail.com> Authored: Tue Oct 27 16:01:26 2015 -0700 Committer: Tathagata Das <tathagata.das1...@gmail.com> Committed: Tue Oct 27 16:01:26 2015 -0700 -- .../streaming/util/FileBasedWriteAheadLog.scala | 6 +++- .../streaming/util/WriteAheadLogUtils.scala | 15 - .../streaming/util/WriteAheadLogSuite.scala | 32 +++- 3 files changed, 44 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4f030b9e/streaming/src/main/scala/org/apache/spark/streaming/util/FileBasedWriteAheadLog.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/util/FileBasedWriteAheadLog.scala b/streaming/src/main/scala/org/apache/spark/streaming/util/FileBasedWriteAheadLog.scala index 9f4a4d6..bc3f248 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/util/FileBasedWriteAheadLog.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/util/FileBasedWriteAheadLog.scala @@ -47,7 +47,8 @@ private[streaming] class FileBasedWriteAheadLog( logDirectory: String, hadoopConf: Configuration, rollingIntervalSecs: Int, -maxFailures: Int +maxFailures: Int, +closeFileAfterWrite: Boolean ) extends WriteAheadLog with Logging { import FileBasedWriteAheadLog._ @@ -80,6 +81,9 @@ private[streaming] class FileBasedWriteAheadLog( while (!succeeded && failures < maxFailures) { try { fileSegment = getLogWriter(time).write(byteBuffer) +if (closeFileAfterWrite) { + resetWriter() +} succeeded = true } catch { case ex: Exception => http://git-wip-us.apache.org/repos/asf/spark/blob/4f030b9e/streaming/src/main/scala/org/apache/spark/streaming/util/WriteAheadLogUtils.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/util/WriteAheadLogUtils.scala b/streaming/src/main/scala/org/apache/spark/streaming/util/WriteAheadLogUtils.scala index 7f6ff12..0ea970e 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/util/WriteAheadLogUtils.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/util/WriteAheadLogUtils.scala @@ -31,11 +31,15 @@ private[streaming] object WriteAheadLogUtils extends Logging { val RECEIVER_WAL_ROLLING_INTERVAL_CONF_KEY = "spark.streaming.receiver.writeAheadLog.rollingIntervalSecs" val RECEIVER_WAL_MAX_FAILURES_CONF_KEY = "spark.streaming.receiver.writeAheadLog.maxFailures" + val RECEIVER_WAL_CLOSE_AFTER_WRITE_CONF_KEY = +"spark.streaming.receiver.writeAheadLog.closeFileAfterWrite" val DRIVER_WAL_CLASS_CONF_KEY = "spark.streaming.driver.writeAheadLog.class" val DRIVER_WAL_ROLLING_INTERVAL_CONF_KEY = 
"spark.streaming.driver.writeAheadLog.rollingIntervalSecs" val DRIVER_WAL_MAX_FAILURES_CONF_KEY = "spark.streaming.driver.writeAheadLog.maxFailures" + val DRIVER_WAL_CLOSE_AFTER_WRITE_CONF_KEY = +"spark.streaming.driver.writeAheadLog.closeFileAfterWrite" val DEFAULT_ROLLING_INTERVAL_SECS = 60 val DEFAULT_MAX_FAILURES = 3 @@ -60,6 +64,14 @@ private[streaming] object WriteAheadLogUtils extends Logging { } } + def shouldCloseFileAfterWrite(conf: SparkConf, isDriver: Boolean): Boolean = { +if (isDriver) { + conf.getBoolean(DRIVER_WAL_CLOSE_AFTER_WRITE_CONF_KEY, defaultValue = false) +} else { + conf.getBoolean(RECEIVER_WAL_CLOSE_AFTER_WRITE_CONF_KEY, defaultValue = false) +} + } + /** * Create a WriteAheadLog for the driver. If configured with custom WAL class, it will try * to create instance of that class, ot
spark git commit: [SPARK-10891][STREAMING][KINESIS] Add MessageHandler to KinesisUtils.createStream similar to Direct Kafka
Repository: spark Updated Branches: refs/heads/master 80279ac18 -> 63accc796 [SPARK-10891][STREAMING][KINESIS] Add MessageHandler to KinesisUtils.createStream similar to Direct Kafka This PR allows users to map a Kinesis `Record` to a generic `T` when creating a Kinesis stream. This is particularly useful, if you would like to do extra work with Kinesis metadata such as sequence number, and partition key. TODO: - [x] add tests Author: Burak YavuzCloses #8954 from brkyvz/kinesis-handler. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/63accc79 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/63accc79 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/63accc79 Branch: refs/heads/master Commit: 63accc79625d8a03d0624717af5e1d81b18a6da3 Parents: 80279ac Author: Burak Yavuz Authored: Sun Oct 25 21:18:35 2015 -0700 Committer: Tathagata Das Committed: Sun Oct 25 21:18:35 2015 -0700 -- .../kinesis/KinesisBackedBlockRDD.scala | 35 ++- .../streaming/kinesis/KinesisInputDStream.scala | 15 +- .../streaming/kinesis/KinesisReceiver.scala | 18 +- .../kinesis/KinesisRecordProcessor.scala| 4 +- .../spark/streaming/kinesis/KinesisUtils.scala | 247 +-- .../kinesis/JavaKinesisStreamSuite.java | 29 ++- .../kinesis/KinesisBackedBlockRDDSuite.scala| 16 +- .../kinesis/KinesisReceiverSuite.scala | 4 +- .../streaming/kinesis/KinesisStreamSuite.scala | 44 +++- 9 files changed, 337 insertions(+), 75 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/63accc79/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDD.scala -- diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDD.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDD.scala index 5d32fa6..000897a 100644 --- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDD.scala +++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDD.scala @@ -18,6 +18,7 @@ package org.apache.spark.streaming.kinesis import scala.collection.JavaConverters._ +import scala.reflect.ClassTag import scala.util.control.NonFatal import com.amazonaws.auth.{AWSCredentials, DefaultAWSCredentialsProviderChain} @@ -67,7 +68,7 @@ class KinesisBackedBlockRDDPartition( * sequence numbers of the corresponding blocks. 
*/ private[kinesis] -class KinesisBackedBlockRDD( +class KinesisBackedBlockRDD[T: ClassTag]( @transient sc: SparkContext, val regionName: String, val endpointUrl: String, @@ -75,8 +76,9 @@ class KinesisBackedBlockRDD( @transient val arrayOfseqNumberRanges: Array[SequenceNumberRanges], @transient isBlockIdValid: Array[Boolean] = Array.empty, val retryTimeoutMs: Int = 1, +val messageHandler: Record => T = KinesisUtils.defaultMessageHandler _, val awsCredentialsOption: Option[SerializableAWSCredentials] = None - ) extends BlockRDD[Array[Byte]](sc, blockIds) { + ) extends BlockRDD[T](sc, blockIds) { require(blockIds.length == arrayOfseqNumberRanges.length, "Number of blockIds is not equal to the number of sequence number ranges") @@ -90,23 +92,23 @@ class KinesisBackedBlockRDD( } } - override def compute(split: Partition, context: TaskContext): Iterator[Array[Byte]] = { + override def compute(split: Partition, context: TaskContext): Iterator[T] = { val blockManager = SparkEnv.get.blockManager val partition = split.asInstanceOf[KinesisBackedBlockRDDPartition] val blockId = partition.blockId -def getBlockFromBlockManager(): Option[Iterator[Array[Byte]]] = { +def getBlockFromBlockManager(): Option[Iterator[T]] = { logDebug(s"Read partition data of $this from block manager, block $blockId") - blockManager.get(blockId).map(_.data.asInstanceOf[Iterator[Array[Byte]]]) + blockManager.get(blockId).map(_.data.asInstanceOf[Iterator[T]]) } -def getBlockFromKinesis(): Iterator[Array[Byte]] = { - val credenentials = awsCredentialsOption.getOrElse { +def getBlockFromKinesis(): Iterator[T] = { + val credentials = awsCredentialsOption.getOrElse { new DefaultAWSCredentialsProviderChain().getCredentials() } partition.seqNumberRanges.ranges.iterator.flatMap { range => -new KinesisSequenceRangeIterator( - credenentials, endpointUrl, regionName, range, retryTimeoutMs) +new KinesisSequenceRangeIterator(credentials, endpointUrl, regionName, + range,
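For a sense of how the new parameter is used, here is a hedged sketch of a handler that keeps Kinesis metadata with each payload. The messageHandler parameter is what this PR adds; the app name, stream name, endpoint, region, and interval are placeholders, and the parameter order follows the new createStream overload in KinesisUtils:

```
import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream
import com.amazonaws.services.kinesis.model.Record
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kinesis.KinesisUtils

// Sketch: retain the partition key and sequence number alongside the data.
def streamWithMetadata(ssc: StreamingContext) = {
  val handler = (r: Record) =>
    (r.getPartitionKey, r.getSequenceNumber, r.getData.array())
  KinesisUtils.createStream(
    ssc, "myKinesisApp", "myStream",                        // placeholders
    "https://kinesis.us-east-1.amazonaws.com", "us-east-1", // placeholders
    InitialPositionInStream.LATEST, Seconds(10),
    StorageLevel.MEMORY_AND_DISK_2, handler)
}
```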
spark git commit: [SPARK-11127][STREAMING] upgrade AWS SDK and Kinesis Client Library (KCL)
Repository: spark Updated Branches: refs/heads/master 85e654c5e -> 87f82a5fb [SPARK-11127][STREAMING] upgrade AWS SDK and Kinesis Client Library (KCL) AWS SDK 1.9.40 is the latest 1.9.x release. KCL 1.5.1 is the latest release that uses AWS SDK 1.9.x. The main goal is to have the Kinesis consumer be able to read messages generated by the Kinesis Producer Library (KPL). The API should be compatible with old versions. tdas brkyvz Author: Xiangrui Meng <m...@databricks.com> Closes #9153 from mengxr/SPARK-11127. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/87f82a5f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/87f82a5f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/87f82a5f Branch: refs/heads/master Commit: 87f82a5fb9c4350a97c761411069245f07aad46f Parents: 85e654c Author: Xiangrui Meng <m...@databricks.com> Authored: Sun Oct 25 21:57:34 2015 -0700 Committer: Tathagata Das <tathagata.das1...@gmail.com> Committed: Sun Oct 25 21:57:34 2015 -0700 -- pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/87f82a5f/pom.xml -- diff --git a/pom.xml b/pom.xml index 445e65c..3dfc434 100644 --- a/pom.xml +++ b/pom.xml @@ -152,8 +152,8 @@ 1.7.7 hadoop2 0.7.1 -1.9.16 -1.3.0 +1.9.40 +1.4.0 4.3.2
spark git commit: [SPARK-11063] [STREAMING] Change preferredLocations of Receiver's RDD to hosts rather than hostports
Repository: spark Updated Branches: refs/heads/master 596681794 -> 67582132b [SPARK-11063] [STREAMING] Change preferredLocations of Receiver's RDD to hosts rather than hostports The format of RDD's preferredLocations must be hostname but the format of Streaming Receiver's scheduling executors is hostport. So it doesn't work. This PR converts `schedulerExecutors` to `hosts` before creating Receiver's RDD. Author: zsxwingCloses #9075 from zsxwing/SPARK-11063. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/67582132 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/67582132 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/67582132 Branch: refs/heads/master Commit: 67582132bffbaaeaadc5cf8218f6239d03c39da0 Parents: 5966817 Author: zsxwing Authored: Mon Oct 19 15:35:14 2015 -0700 Committer: Tathagata Das Committed: Mon Oct 19 15:35:14 2015 -0700 -- .../scheduler/ReceiverSchedulingPolicy.scala| 3 ++- .../streaming/scheduler/ReceiverTracker.scala | 4 +++- .../scheduler/ReceiverTrackerSuite.scala| 24 3 files changed, 29 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/67582132/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicy.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicy.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicy.scala index 10b5a7f..d2b0be7 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicy.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicy.scala @@ -21,6 +21,7 @@ import scala.collection.Map import scala.collection.mutable import org.apache.spark.streaming.receiver.Receiver +import org.apache.spark.util.Utils /** * A class that tries to schedule receivers with evenly distributed. 
There are two phases for @@ -79,7 +80,7 @@ private[streaming] class ReceiverSchedulingPolicy { return receivers.map(_.streamId -> Seq.empty).toMap } -val hostToExecutors = executors.groupBy(_.split(":")(0)) +val hostToExecutors = executors.groupBy(executor => Utils.parseHostPort(executor)._1) val scheduledExecutors = Array.fill(receivers.length)(new mutable.ArrayBuffer[String]) val numReceiversOnExecutor = mutable.HashMap[String, Int]() // Set the initial value to 0 http://git-wip-us.apache.org/repos/asf/spark/blob/67582132/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala index d053e9e..2ce80d6 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala @@ -551,7 +551,9 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false if (scheduledExecutors.isEmpty) { ssc.sc.makeRDD(Seq(receiver), 1) } else { - ssc.sc.makeRDD(Seq(receiver -> scheduledExecutors)) + val preferredLocations = +scheduledExecutors.map(hostPort => Utils.parseHostPort(hostPort)._1).distinct + ssc.sc.makeRDD(Seq(receiver -> preferredLocations)) } receiverRDD.setName(s"Receiver $receiverId") ssc.sparkContext.setJobDescription(s"Streaming job running receiver $receiverId") http://git-wip-us.apache.org/repos/asf/spark/blob/67582132/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala -- diff --git a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala index 45138b7..fda86ae 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala @@ -22,6 +22,8 @@ import scala.collection.mutable.ArrayBuffer import org.scalatest.concurrent.Eventually._ import org.scalatest.time.SpanSugar._ +import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskStart, TaskLocality} +import org.apache.spark.scheduler.TaskLocality.TaskLocality import
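The heart of the fix is a one-line normalization: strip the port from each "host:port" executor identifier before handing the result to makeRDD as preferred locations. In isolation it looks like the sketch below (Utils.parseHostPort is Spark-internal, private[spark], so outside Spark you would split on ':' yourself):

```
import org.apache.spark.util.Utils

// Sketch: RDD preferredLocations must be bare hostnames, but receiver
// scheduling produces "host:port" executor identifiers.
val scheduledExecutors = Seq("worker-1:34512", "worker-2:34512", "worker-1:41923")
val preferredLocations =
  scheduledExecutors.map(hostPort => Utils.parseHostPort(hostPort)._1).distinct
// preferredLocations == Seq("worker-1", "worker-2")
```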
spark git commit: [SPARK-11063] [STREAMING] Change preferredLocations of Receiver's RDD to hosts rather than hostports
Repository: spark Updated Branches: refs/heads/branch-1.5 648074096 -> 5186ec8ac [SPARK-11063] [STREAMING] Change preferredLocations of Receiver's RDD to hosts rather than hostports The format of RDD's preferredLocations must be hostname but the format of Streaming Receiver's scheduling executors is hostport. So it doesn't work. This PR converts `schedulerExecutors` to `hosts` before creating Receiver's RDD. Author: zsxwingCloses #9075 from zsxwing/SPARK-11063. (cherry picked from commit 67582132bffbaaeaadc5cf8218f6239d03c39da0) Signed-off-by: Tathagata Das Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5186ec8a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5186ec8a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5186ec8a Branch: refs/heads/branch-1.5 Commit: 5186ec8aca53ffdffbd41599b9fe1f3c5902de01 Parents: 6480740 Author: zsxwing Authored: Mon Oct 19 15:35:14 2015 -0700 Committer: Tathagata Das Committed: Mon Oct 19 15:35:46 2015 -0700 -- .../scheduler/ReceiverSchedulingPolicy.scala| 3 ++- .../streaming/scheduler/ReceiverTracker.scala | 4 +++- .../scheduler/ReceiverTrackerSuite.scala| 24 3 files changed, 29 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5186ec8a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicy.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicy.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicy.scala index 10b5a7f..d2b0be7 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicy.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicy.scala @@ -21,6 +21,7 @@ import scala.collection.Map import scala.collection.mutable import org.apache.spark.streaming.receiver.Receiver +import org.apache.spark.util.Utils /** * A class that tries to schedule receivers with evenly distributed. 
There are two phases for @@ -79,7 +80,7 @@ private[streaming] class ReceiverSchedulingPolicy { return receivers.map(_.streamId -> Seq.empty).toMap } -val hostToExecutors = executors.groupBy(_.split(":")(0)) +val hostToExecutors = executors.groupBy(executor => Utils.parseHostPort(executor)._1) val scheduledExecutors = Array.fill(receivers.length)(new mutable.ArrayBuffer[String]) val numReceiversOnExecutor = mutable.HashMap[String, Int]() // Set the initial value to 0 http://git-wip-us.apache.org/repos/asf/spark/blob/5186ec8a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala index 204e614..7b8b68a 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala @@ -551,7 +551,9 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false if (scheduledExecutors.isEmpty) { ssc.sc.makeRDD(Seq(receiver), 1) } else { - ssc.sc.makeRDD(Seq(receiver -> scheduledExecutors)) + val preferredLocations = +scheduledExecutors.map(hostPort => Utils.parseHostPort(hostPort)._1).distinct + ssc.sc.makeRDD(Seq(receiver -> preferredLocations)) } receiverRDD.setName(s"Receiver $receiverId") ssc.sparkContext.setJobDescription(s"Streaming job running receiver $receiverId") http://git-wip-us.apache.org/repos/asf/spark/blob/5186ec8a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala -- diff --git a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala index 45138b7..fda86ae 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala @@ -22,6 +22,8 @@ import scala.collection.mutable.ArrayBuffer import org.scalatest.concurrent.Eventually._ import org.scalatest.time.SpanSugar._ +import
spark git commit: [SPARK-10974] [STREAMING] Add progress bar for output operation column and use red dots for failed batches
Repository: spark Updated Branches: refs/heads/master 3d683a139 -> 369d786f5 [SPARK-10974] [STREAMING] Add progress bar for output operation column and use red dots for failed batches Screenshot: https://cloud.githubusercontent.com/assets/1000778/10342571/385d9340-6d4c-11e5-8e79-1fa4c3c98f81.png;> Also fixed the description and duration for output operations that don't have spark jobs. https://cloud.githubusercontent.com/assets/1000778/10342775/4bd52a0e-6d4d-11e5-99bc-26265a9fc792.png;> Author: zsxwingCloses #9010 from zsxwing/output-op-progress-bar. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/369d786f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/369d786f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/369d786f Branch: refs/heads/master Commit: 369d786f58580e7df73e7e23f27390d37269d0de Parents: 3d683a1 Author: zsxwing Authored: Fri Oct 16 13:53:06 2015 -0700 Committer: Tathagata Das Committed: Fri Oct 16 13:53:06 2015 -0700 -- .../spark/streaming/ui/static/streaming-page.js | 26 +-- .../apache/spark/streaming/DStreamGraph.scala | 2 +- .../spark/streaming/scheduler/BatchInfo.scala | 23 +-- .../apache/spark/streaming/scheduler/Job.scala | 30 +++- .../streaming/scheduler/JobScheduler.scala | 12 +- .../spark/streaming/scheduler/JobSet.scala | 17 +- .../scheduler/OutputOperationInfo.scala | 6 +- .../spark/streaming/ui/AllBatchesTable.scala| 40 +++-- .../apache/spark/streaming/ui/BatchPage.scala | 174 --- .../apache/spark/streaming/ui/BatchUIData.scala | 67 ++- .../ui/StreamingJobProgressListener.scala | 14 ++ .../streaming/StreamingListenerSuite.scala | 16 +- .../spark/streaming/UISeleniumSuite.scala | 2 +- .../ui/StreamingJobProgressListenerSuite.scala | 30 ++-- 14 files changed, 258 insertions(+), 201 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/369d786f/streaming/src/main/resources/org/apache/spark/streaming/ui/static/streaming-page.js -- diff --git a/streaming/src/main/resources/org/apache/spark/streaming/ui/static/streaming-page.js b/streaming/src/main/resources/org/apache/spark/streaming/ui/static/streaming-page.js index 4886b68..f82323a 100644 --- a/streaming/src/main/resources/org/apache/spark/streaming/ui/static/streaming-page.js +++ b/streaming/src/main/resources/org/apache/spark/streaming/ui/static/streaming-page.js @@ -154,34 +154,40 @@ function drawTimeline(id, data, minX, maxX, minY, maxY, unitY, batchInterval) { var lastClickedBatch = null; var lastTimeout = null; +function isFailedBatch(batchTime) { +return $("#batch-" + batchTime).attr("isFailed") == "true"; +} + // Add points to the line. However, we make it invisible at first. But when the user moves mouse // over a point, it will be displayed with its detail. svg.selectAll(".point") .data(data) .enter().append("circle") -.attr("stroke", "white") // white and opacity = 0 make it invisible -.attr("fill", "white") -.attr("opacity", "0") +.attr("stroke", function(d) { return isFailedBatch(d.x) ? "red" : "white";}) // white and opacity = 0 make it invisible +.attr("fill", function(d) { return isFailedBatch(d.x) ? "red" : "white";}) +.attr("opacity", function(d) { return isFailedBatch(d.x) ? "1" : "0";}) .style("cursor", "pointer") .attr("cx", function(d) { return x(d.x); }) .attr("cy", function(d) { return y(d.y); }) -.attr("r", function(d) { return 3; }) +.attr("r", function(d) { return isFailedBatch(d.x) ? 
"2" : "0";}) .on('mouseover', function(d) { var tip = formatYValue(d.y) + " " + unitY + " at " + timeFormat[d.x]; showBootstrapTooltip(d3.select(this).node(), tip); // show the point d3.select(this) -.attr("stroke", "steelblue") -.attr("fill", "steelblue") -.attr("opacity", "1"); +.attr("stroke", function(d) { return isFailedBatch(d.x) ? "red" : "steelblue";}) +.attr("fill", function(d) { return isFailedBatch(d.x) ? "red" : "steelblue";}) +.attr("opacity", "1") +.attr("r", "3"); }) .on('mouseout', function() { hideBootstrapTooltip(d3.select(this).node()); // hide the point d3.select(this) -.attr("stroke", "white") -
spark git commit: [SPARK-10955] [STREAMING] Add a warning if dynamic allocation is enabled for Streaming applications
Repository: spark Updated Branches: refs/heads/branch-1.5 ba601b1ac -> 3df750029 [SPARK-10955] [STREAMING] Add a warning if dynamic allocation is enabled for Streaming applications Dynamic allocation can be painful for streaming apps and can cause data loss. Log a warning for streaming applications if dynamic allocation is enabled. Author: Hari Shreedharan Closes #8998 from harishreedharan/ss-log-error and squashes the following commits: 462b264 [Hari Shreedharan] Improve log message. 2733d94 [Hari Shreedharan] Minor change to warning message. eaa48cc [Hari Shreedharan] Log a warning instead of failing the application if dynamic allocation is enabled. 725f090 [Hari Shreedharan] Add config parameter to allow dynamic allocation if the user explicitly sets it. b3f9a95 [Hari Shreedharan] Disable dynamic allocation and kill app if it is enabled. a4a5212 [Hari Shreedharan] [streaming] SPARK-10955. Disable dynamic allocation for Streaming applications. (cherry picked from commit 09841290055770a619a2e72fbaef1a5e694916ae) Signed-off-by: Tathagata Das Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3df75002 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3df75002 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3df75002 Branch: refs/heads/branch-1.5 Commit: 3df7500299030cf4e002591cb3af5804aa4563da Parents: ba601b1 Author: Hari Shreedharan Authored: Thu Oct 8 18:53:38 2015 -0700 Committer: Tathagata Das Committed: Thu Oct 8 18:53:52 2015 -0700 -- .../scala/org/apache/spark/streaming/StreamingContext.scala | 9 - 1 file changed, 8 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3df75002/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala index f4f0869..bcd98ea 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala @@ -44,7 +44,7 @@ import org.apache.spark.streaming.dstream._ import org.apache.spark.streaming.receiver.{ActorReceiver, ActorSupervisorStrategy, Receiver} import org.apache.spark.streaming.scheduler.{JobScheduler, StreamingListener} import org.apache.spark.streaming.ui.{StreamingJobProgressListener, StreamingTab} -import org.apache.spark.util.{CallSite, ShutdownHookManager, ThreadUtils} +import org.apache.spark.util.{CallSite, ShutdownHookManager, ThreadUtils, Utils} /** * Main entry point for Spark Streaming functionality. It provides methods used to create @@ -570,6 +570,13 @@ class StreamingContext private[streaming] ( ) } } + +if (Utils.isDynamicAllocationEnabled(sc.conf)) { + logWarning("Dynamic Allocation is enabled for this application. " + +"Enabling Dynamic allocation for Spark Streaming applications can cause data loss if " + +"Write Ahead Log is not enabled for non-replayable sources like Flume. " + +"See the programming guide for details on how to enable the Write Ahead Log") +} } /**
spark git commit: [SPARK-10955] [STREAMING] Add a warning if dynamic allocation is enabled for Streaming applications
Repository: spark Updated Branches: refs/heads/master fa3e4d8f5 -> 098412900 [SPARK-10955] [STREAMING] Add a warning if dynamic allocation is enabled for Streaming applications Dynamic allocation can be painful for streaming apps and can cause data loss. Log a warning for streaming applications if dynamic allocation is enabled. Author: Hari Shreedharan. Closes #8998 from harishreedharan/ss-log-error and squashes the following commits: 462b264 [Hari Shreedharan] Improve log message. 2733d94 [Hari Shreedharan] Minor change to warning message. eaa48cc [Hari Shreedharan] Log a warning instead of failing the application if dynamic allocation is enabled. 725f090 [Hari Shreedharan] Add config parameter to allow dynamic allocation if the user explicitly sets it. b3f9a95 [Hari Shreedharan] Disable dynamic allocation and kill app if it is enabled. a4a5212 [Hari Shreedharan] [streaming] SPARK-10955. Disable dynamic allocation for Streaming applications. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/09841290 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/09841290 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/09841290 Branch: refs/heads/master Commit: 09841290055770a619a2e72fbaef1a5e694916ae Parents: fa3e4d8 Author: Hari Shreedharan Authored: Thu Oct 8 18:53:38 2015 -0700 Committer: Tathagata Das Committed: Thu Oct 8 18:53:38 2015 -0700 -- .../scala/org/apache/spark/streaming/StreamingContext.scala | 9 - 1 file changed, 8 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/09841290/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala index 94fea63..9b2632c 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala @@ -44,7 +44,7 @@ import org.apache.spark.streaming.dstream._ import org.apache.spark.streaming.receiver.{ActorReceiver, ActorSupervisorStrategy, Receiver} import org.apache.spark.streaming.scheduler.{JobScheduler, StreamingListener} import org.apache.spark.streaming.ui.{StreamingJobProgressListener, StreamingTab} -import org.apache.spark.util.{CallSite, ShutdownHookManager, ThreadUtils} +import org.apache.spark.util.{CallSite, ShutdownHookManager, ThreadUtils, Utils} /** * Main entry point for Spark Streaming functionality. It provides methods used to create @@ -564,6 +564,13 @@ class StreamingContext private[streaming] ( ) } } + +if (Utils.isDynamicAllocationEnabled(sc.conf)) { + logWarning("Dynamic Allocation is enabled for this application. " + +"Enabling Dynamic allocation for Spark Streaming applications can cause data loss if " + +"Write Ahead Log is not enabled for non-replayable sources like Flume. " + +"See the programming guide for details on how to enable the Write Ahead Log") +} } /** - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
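For reference, a minimal sketch of the configuration the new warning points at, for a receiver-based source. The write-ahead-log key is the standard `spark.streaming.receiver.writeAheadLog.enable` setting; the app name, batch interval, and checkpoint path are illustrative:

```
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

// With dynamic allocation left on, the write ahead log plus a checkpoint
// directory is what protects receiver-based (non-replayable) sources from
// losing data when executors are removed.
val conf = new SparkConf()
  .setAppName("StreamingWithDynamicAllocation")                 // illustrative name
  .set("spark.dynamicAllocation.enabled", "true")               // this is what triggers the warning
  .set("spark.streaming.receiver.writeAheadLog.enable", "true") // persist received data

val ssc = new StreamingContext(conf, Seconds(10))
ssc.checkpoint("hdfs:///checkpoints/streaming-app") // WAL data lives under the checkpoint dir
```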
spark git commit: [SPARK-10885] [STREAMING] Display the failed output op in Streaming UI
Repository: spark Updated Branches: refs/heads/master 5e035403d -> ffe6831e4 [SPARK-10885] [STREAMING] Display the failed output op in Streaming UI This PR implements the following features for both `master` and `branch-1.5`. 1. Display the failed output op count in the batch list 2. Display the failure reason of output op in the batch detail page Screenshots: https://cloud.githubusercontent.com/assets/1000778/10198387/5b2b97ec-67ce-11e5-81c2-f818b9d2f3ad.png https://cloud.githubusercontent.com/assets/1000778/10198388/5b76ac14-67ce-11e5-8c8b-de2683c5b485.png There are still two remaining problems in the UI. 1. If an output operation doesn't run any Spark job, we cannot get its duration, since it is currently computed as the sum of all jobs' durations. 2. If an output operation doesn't run any Spark job, we cannot get the description, since it is taken from the latest job's call site. We need to add new `StreamingListenerEvent`s about output operations to fix them. So I'd like to fix them only for `master` in another PR. Author: zsxwing. Closes #8950 from zsxwing/batch-failure. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ffe6831e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ffe6831e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ffe6831e Branch: refs/heads/master Commit: ffe6831e49e28eb855f857fdfa5dd99341e80c9d Parents: 5e03540 Author: zsxwing Authored: Tue Oct 6 16:51:03 2015 -0700 Committer: Tathagata Das Committed: Tue Oct 6 16:51:03 2015 -0700 -- .../spark/streaming/scheduler/BatchInfo.scala | 10 ++ .../spark/streaming/scheduler/JobSet.scala | 1 + .../spark/streaming/ui/AllBatchesTable.scala| 15 ++- .../apache/spark/streaming/ui/BatchPage.scala | 134 --- .../apache/spark/streaming/ui/BatchUIData.scala | 6 +- .../spark/streaming/UISeleniumSuite.scala | 4 +- 6 files changed, 143 insertions(+), 27 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ffe6831e/streaming/src/main/scala/org/apache/spark/streaming/scheduler/BatchInfo.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/BatchInfo.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/BatchInfo.scala index 3c86956..463f899 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/BatchInfo.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/BatchInfo.scala @@ -41,6 +41,8 @@ case class BatchInfo( private var _failureReasons: Map[Int, String] = Map.empty + private var _numOutputOp: Int = 0 + @deprecated("Use streamIdToInputInfo instead", "1.5.0") def streamIdToNumRecords: Map[Int, Long] = streamIdToInputInfo.mapValues(_.numRecords) @@ -77,4 +79,12 @@ case class BatchInfo( /** Failure reasons corresponding to every output ops in the batch */ private[streaming] def failureReasons = _failureReasons + + /** Set the number of output operations in this batch */ + private[streaming] def setNumOutputOp(numOutputOp: Int): Unit = { +_numOutputOp = numOutputOp + } + + /** Return the number of output operations in this batch */ + private[streaming] def numOutputOp: Int = _numOutputOp } http://git-wip-us.apache.org/repos/asf/spark/blob/ffe6831e/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobSet.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobSet.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobSet.scala index 255ccf0..08f63cc 100644 ---
a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobSet.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobSet.scala @@ -81,6 +81,7 @@ case class JobSet( if (processingEndTime >= 0) Some(processingEndTime) else None ) binfo.setFailureReason(failureReasons) +binfo.setNumOutputOp(jobs.size) binfo } } http://git-wip-us.apache.org/repos/asf/spark/blob/ffe6831e/streaming/src/main/scala/org/apache/spark/streaming/ui/AllBatchesTable.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/ui/AllBatchesTable.scala b/streaming/src/main/scala/org/apache/spark/streaming/ui/AllBatchesTable.scala index f702bd5..3e6590d 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/ui/AllBatchesTable.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/ui/AllBatchesTable.scala @@ -107,9 +107,10 @@ private[ui] class ActiveBatchTable( private[ui] class CompletedBatchTable(batches:
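A minimal driver sketch that would exercise the new UI columns (the source, master URL, and the deliberate failure are illustrative): each output operation below is counted via the new `BatchInfo.numOutputOp`, and the exception thrown in the second one is what now surfaces as a failed output op count in the batch list and as a failure reason on the batch detail page.

```
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

val conf = new SparkConf().setMaster("local[2]").setAppName("FailedOutputOpDemo")
val ssc = new StreamingContext(conf, Seconds(1))
val lines = ssc.socketTextStream("localhost", 9999) // illustrative source

// Output op 0: succeeds on every batch.
lines.foreachRDD { rdd => rdd.count() }

// Output op 1: throws, so the batch shows one failed output op and this
// message appears as the failure reason on the batch detail page.
lines.foreachRDD { rdd =>
  if (!rdd.isEmpty()) throw new RuntimeException("illustrative failure")
}

ssc.start()
ssc.awaitTermination()
```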
spark git commit: [SPARK-10885] [STREAMING] Display the failed output op in Streaming UI
Repository: spark Updated Branches: refs/heads/branch-1.5 6847be6d1 -> 84f510c4f [SPARK-10885] [STREAMING] Display the failed output op in Streaming UI This PR implements the following features for both `master` and `branch-1.5`. 1. Display the failed output op count in the batch list 2. Display the failure reason of output op in the batch detail page Screenshots: https://cloud.githubusercontent.com/assets/1000778/10198387/5b2b97ec-67ce-11e5-81c2-f818b9d2f3ad.png https://cloud.githubusercontent.com/assets/1000778/10198388/5b76ac14-67ce-11e5-8c8b-de2683c5b485.png There are still two remaining problems in the UI. 1. If an output operation doesn't run any Spark job, we cannot get its duration, since it is currently computed as the sum of all jobs' durations. 2. If an output operation doesn't run any Spark job, we cannot get the description, since it is taken from the latest job's call site. We need to add new `StreamingListenerEvent`s about output operations to fix them. So I'd like to fix them only for `master` in another PR. Author: zsxwing. Closes #8950 from zsxwing/batch-failure. (cherry picked from commit ffe6831e49e28eb855f857fdfa5dd99341e80c9d) Signed-off-by: Tathagata Das Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/84f510c4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/84f510c4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/84f510c4 Branch: refs/heads/branch-1.5 Commit: 84f510c4fa06e43bd35e2dc8e1008d0590cbe266 Parents: 6847be6 Author: zsxwing Authored: Tue Oct 6 16:51:03 2015 -0700 Committer: Tathagata Das Committed: Tue Oct 6 16:51:31 2015 -0700 -- .../spark/streaming/scheduler/BatchInfo.scala | 10 ++ .../spark/streaming/scheduler/JobSet.scala | 1 + .../spark/streaming/ui/AllBatchesTable.scala| 15 ++- .../apache/spark/streaming/ui/BatchPage.scala | 134 --- .../apache/spark/streaming/ui/BatchUIData.scala | 6 +- .../spark/streaming/UISeleniumSuite.scala | 4 +- 6 files changed, 143 insertions(+), 27 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/84f510c4/streaming/src/main/scala/org/apache/spark/streaming/scheduler/BatchInfo.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/BatchInfo.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/BatchInfo.scala index 3c86956..463f899 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/BatchInfo.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/BatchInfo.scala @@ -41,6 +41,8 @@ case class BatchInfo( private var _failureReasons: Map[Int, String] = Map.empty + private var _numOutputOp: Int = 0 + @deprecated("Use streamIdToInputInfo instead", "1.5.0") def streamIdToNumRecords: Map[Int, Long] = streamIdToInputInfo.mapValues(_.numRecords) @@ -77,4 +79,12 @@ case class BatchInfo( /** Failure reasons corresponding to every output ops in the batch */ private[streaming] def failureReasons = _failureReasons + + /** Set the number of output operations in this batch */ + private[streaming] def setNumOutputOp(numOutputOp: Int): Unit = { +_numOutputOp = numOutputOp + } + + /** Return the number of output operations in this batch */ + private[streaming] def numOutputOp: Int = _numOutputOp } http://git-wip-us.apache.org/repos/asf/spark/blob/84f510c4/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobSet.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobSet.scala 
b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobSet.scala index 255ccf0..08f63cc 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobSet.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobSet.scala @@ -81,6 +81,7 @@ case class JobSet( if (processingEndTime >= 0) Some(processingEndTime) else None ) binfo.setFailureReason(failureReasons) +binfo.setNumOutputOp(jobs.size) binfo } } http://git-wip-us.apache.org/repos/asf/spark/blob/84f510c4/streaming/src/main/scala/org/apache/spark/streaming/ui/AllBatchesTable.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/ui/AllBatchesTable.scala b/streaming/src/main/scala/org/apache/spark/streaming/ui/AllBatchesTable.scala index f702bd5..3e6590d 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/ui/AllBatchesTable.scala +++
spark git commit: [SPARK-10900] [STREAMING] Add output operation events to StreamingListener
Repository: spark Updated Branches: refs/heads/master a609eb20d -> be7c5ff1a [SPARK-10900] [STREAMING] Add output operation events to StreamingListener Add output operation events to StreamingListener so as to implement the following UI features: 1. Progress bar of a batch in the batch list. 2. Be able to display output operation `description` and `duration` when there is no Spark job in a Streaming job. Author: zsxwing. Closes #8958 from zsxwing/output-operation-events. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/be7c5ff1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/be7c5ff1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/be7c5ff1 Branch: refs/heads/master Commit: be7c5ff1ad02ce1c03113c98656a4e0c0c3cee83 Parents: a609eb2 Author: zsxwing Authored: Mon Oct 5 19:23:41 2015 -0700 Committer: Tathagata Das Committed: Mon Oct 5 19:23:41 2015 -0700 -- .../apache/spark/streaming/DStreamGraph.scala | 6 ++- .../apache/spark/streaming/scheduler/Job.scala | 7 .../streaming/scheduler/JobScheduler.scala | 20 + .../scheduler/OutputOperationInfo.scala | 44 .../streaming/scheduler/StreamingListener.scala | 16 +++ .../scheduler/StreamingListenerBus.scala| 4 ++ .../streaming/StreamingListenerSuite.scala | 37 7 files changed, 125 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/be7c5ff1/streaming/src/main/scala/org/apache/spark/streaming/DStreamGraph.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/DStreamGraph.scala b/streaming/src/main/scala/org/apache/spark/streaming/DStreamGraph.scala index ebbcb6b..de79c9e 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/DStreamGraph.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/DStreamGraph.scala @@ -111,7 +111,11 @@ final private[streaming] class DStreamGraph extends Serializable with Logging { def generateJobs(time: Time): Seq[Job] = { logDebug("Generating jobs for time " + time) val jobs = this.synchronized { - outputStreams.flatMap(outputStream => outputStream.generateJob(time)) + outputStreams.flatMap { outputStream => +val jobOption = outputStream.generateJob(time) +jobOption.foreach(_.setCallSite(outputStream.creationSite.longForm)) +jobOption + } } logDebug("Generated " + jobs.length + " jobs for time " + time) jobs http://git-wip-us.apache.org/repos/asf/spark/blob/be7c5ff1/streaming/src/main/scala/org/apache/spark/streaming/scheduler/Job.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/Job.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/Job.scala index 3c481bf..1373053 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/Job.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/Job.scala @@ -29,6 +29,7 @@ class Job(val time: Time, func: () => _) { private var _outputOpId: Int = _ private var isSet = false private var _result: Try[_] = null + private var _callSite: String = "Unknown" def run() { _result = Try(func()) @@ -70,5 +71,11 @@ class Job(val time: Time, func: () => _) { _outputOpId = outputOpId } + def setCallSite(callSite: String): Unit = { +_callSite = callSite + } + + def callSite: String = _callSite + override def toString: String = id } http://git-wip-us.apache.org/repos/asf/spark/blob/be7c5ff1/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala 
b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala index 66afbf1..0a4a396 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala @@ -30,8 +30,8 @@ import org.apache.spark.util.{EventLoop, ThreadUtils} private[scheduler] sealed trait JobSchedulerEvent -private[scheduler] case class JobStarted(job: Job) extends JobSchedulerEvent -private[scheduler] case class JobCompleted(job: Job) extends JobSchedulerEvent +private[scheduler] case class JobStarted(job: Job, startTime: Long) extends JobSchedulerEvent +private[scheduler] case class JobCompleted(job: Job, completedTime: Long) extends JobSchedulerEvent private[scheduler] case class
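A sketch of a user-defined listener consuming the new events. The diff above does not show `OutputOperationInfo` itself, so the `outputOperationInfo` accessor and its `id`, `batchTime`, `startTime`, and `endTime` fields below are assumptions based on the surrounding changes:

```
import org.apache.spark.streaming.scheduler._

class OutputOpTimingListener extends StreamingListener {
  override def onOutputOperationStarted(
      started: StreamingListenerOutputOperationStarted): Unit = {
    val info = started.outputOperationInfo // assumed accessor
    println(s"output op ${info.id} of batch ${info.batchTime} started")
  }

  override def onOutputOperationCompleted(
      completed: StreamingListenerOutputOperationCompleted): Unit = {
    val info = completed.outputOperationInfo
    // startTime/endTime assumed to be Option[Long] millis, per the Job changes above.
    val durationMs = for (s <- info.startTime; e <- info.endTime) yield e - s
    println(s"output op ${info.id} of batch ${info.batchTime} took ${durationMs.getOrElse(-1L)} ms")
  }
}

// Registration uses the existing hook:
// ssc.addStreamingListener(new OutputOpTimingListener)
```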
spark git commit: [SPARK-10224] [STREAMING] Fix the issue that blockIntervalTimer won't call updateCurrentBuffer when stopping
Repository: spark Updated Branches: refs/heads/master 5548a2547 -> 44c28abf1 [SPARK-10224] [STREAMING] Fix the issue that blockIntervalTimer won't call updateCurrentBuffer when stopping `blockIntervalTimer.stop(interruptTimer = false)` doesn't guarantee calling `updateCurrentBuffer`. So it's possible that `blockIntervalTimer` will exit while `currentBuffer` is not empty. Then the data in `currentBuffer` will be lost. To reproduce it, you can add `Thread.sleep(200)` in this line (https://github.com/apache/spark/blob/69c9c177160e32a2fbc9b36ecc52156077fca6fc/streaming/src/main/scala/org/apache/spark/streaming/util/RecurringTimer.scala#L100) and run `StreamingContextSuite`. I cannot write a unit test to reproduce it because I cannot find an approach to force `RecurringTimer` to suspend at this line for a few milliseconds. There was a failure in Jenkins here: https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/41455/console This PR updates RecurringTimer to make sure `stop(interruptTimer = false)` will call `callback` at least once after the `stop` method is called. Author: zsxwing. Closes #8417 from zsxwing/SPARK-10224. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/44c28abf Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/44c28abf Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/44c28abf Branch: refs/heads/master Commit: 44c28abf120754c0175c65ffd3d4587a350b3798 Parents: 5548a25 Author: zsxwing Authored: Wed Sep 23 01:28:02 2015 -0700 Committer: Tathagata Das Committed: Wed Sep 23 01:28:02 2015 -0700 -- .../spark/streaming/util/RecurringTimer.scala | 19 +++-- .../receiver/BlockGeneratorSuite.scala | 7 +- .../streaming/util/RecurringTimerSuite.scala| 83 3 files changed, 100 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/44c28abf/streaming/src/main/scala/org/apache/spark/streaming/util/RecurringTimer.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/util/RecurringTimer.scala b/streaming/src/main/scala/org/apache/spark/streaming/util/RecurringTimer.scala index dd32ad5..0148cb5 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/util/RecurringTimer.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/util/RecurringTimer.scala @@ -72,8 +72,10 @@ class RecurringTimer(clock: Clock, period: Long, callback: (Long) => Unit, name: /** * Stop the timer, and return the last time the callback was made. - * interruptTimer = true will interrupt the callback + * - interruptTimer = true will interrupt the callback * if it is in progress (not guaranteed to give correct time in this case). + * - interruptTimer = false guarantees that there will be at least one callback after `stop` has + * been called. */ def stop(interruptTimer: Boolean): Long = synchronized { if (!stopped) { @@ -87,18 +89,23 @@ class RecurringTimer(clock: Clock, period: Long, callback: (Long) => Unit, name: prevTime } + private def triggerActionForNextInterval(): Unit = { +clock.waitTillTime(nextTime) +callback(nextTime) +prevTime = nextTime +nextTime += period +logDebug("Callback for " + name + " called at time " + prevTime) + } + /** * Repeatedly call the callback every interval. 
*/ private def loop() { try { while (!stopped) { -clock.waitTillTime(nextTime) -callback(nextTime) -prevTime = nextTime -nextTime += period -logDebug("Callback for " + name + " called at time " + prevTime) +triggerActionForNextInterval() } + triggerActionForNextInterval() } catch { case e: InterruptedException => } http://git-wip-us.apache.org/repos/asf/spark/blob/44c28abf/streaming/src/test/scala/org/apache/spark/streaming/receiver/BlockGeneratorSuite.scala -- diff --git a/streaming/src/test/scala/org/apache/spark/streaming/receiver/BlockGeneratorSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/receiver/BlockGeneratorSuite.scala index a38cc60..2f11b25 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/receiver/BlockGeneratorSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/receiver/BlockGeneratorSuite.scala @@ -184,9 +184,10 @@ class BlockGeneratorSuite extends SparkFunSuite with BeforeAndAfter { // Verify that the final data is present in the final generated block and // pushed before complete stop assert(blockGenerator.isStopped() === false) // generator
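The essence of the race, as a self-contained sketch (a simplified stand-in for `RecurringTimer`, not the Spark class): if the loop only checks the stopped flag, the tick that becomes due around `stop()` may never fire, and whatever that tick was meant to flush (for `BlockGenerator`, the contents of `currentBuffer`) is dropped. The unconditional final callback after the loop is the fix:

```
import java.util.concurrent.atomic.AtomicBoolean

// Simplified stand-in to illustrate the stop() guarantee.
class ToyTimer(periodMs: Long)(callback: Long => Unit) {
  private val stopped = new AtomicBoolean(false)
  private val thread = new Thread {
    override def run(): Unit = {
      try {
        while (!stopped.get()) {
          Thread.sleep(periodMs)
          callback(System.currentTimeMillis())
        }
        // The fix: one guaranteed callback after stop(), so pending work
        // (e.g. a non-empty buffer) is flushed rather than silently lost.
        callback(System.currentTimeMillis())
      } catch { case _: InterruptedException => }
    }
  }
  def start(): Unit = thread.start()
  def stop(): Unit = { stopped.set(true); thread.join() }
}
```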
spark git commit: [SPARK-10769] [STREAMING] [TESTS] Fix o.a.s.streaming.CheckpointSuite.maintains rate controller
Repository: spark Updated Branches: refs/heads/branch-1.5 6a616d0d0 -> 4174b94f0 [SPARK-10769] [STREAMING] [TESTS] Fix o.a.s.streaming.CheckpointSuite.maintains rate controller Fixed the following failure in https://amplab.cs.berkeley.edu/jenkins/job/NewSparkPullRequestBuilder/1787/testReport/junit/org.apache.spark.streaming/CheckpointSuite/recovery_maintains_rate_controller/ ``` sbt.ForkMain$ForkError: The code passed to eventually never returned normally. Attempted 660 times over 10.4439201 seconds. Last failure message: 9223372036854775807 did not equal 200. at org.scalatest.concurrent.Eventually$class.tryTryAgain$1(Eventually.scala:420) at org.scalatest.concurrent.Eventually$class.eventually(Eventually.scala:438) at org.scalatest.concurrent.Eventually$.eventually(Eventually.scala:478) at org.scalatest.concurrent.Eventually$class.eventually(Eventually.scala:336) at org.scalatest.concurrent.Eventually$.eventually(Eventually.scala:478) at org.apache.spark.streaming.CheckpointSuite$$anonfun$15.apply$mcV$sp(CheckpointSuite.scala:413) at org.apache.spark.streaming.CheckpointSuite$$anonfun$15.apply(CheckpointSuite.scala:396) at org.apache.spark.streaming.CheckpointSuite$$anonfun$15.apply(CheckpointSuite.scala:396) at org.scalatest.Transformer$$anonfun$apply$1.apply$mcV$sp(Transformer.scala:22) at org.scalatest.OutcomeOf$class.outcomeOf(OutcomeOf.scala:85) at org.scalatest.OutcomeOf$.outcomeOf(OutcomeOf.scala:104) at org.scalatest.Transformer.apply(Transformer.scala:22) ``` In this test, it calls `advanceTimeWithRealDelay(ssc, 2)` to run two batch jobs. However, one race condition is that these two jobs can finish before the receiver is registered. Then `UpdateRateLimit` won't be sent to the receiver and `getDefaultBlockGeneratorRateLimit` cannot be updated. Here are the logs related to this issue: ``` 15/09/22 19:28:26.154 pool-1-thread-1-ScalaTest-running-CheckpointSuite INFO CheckpointSuite: Manual clock before advancing = 2500 15/09/22 19:28:26.869 JobScheduler INFO JobScheduler: Finished job streaming job 3000 ms.0 from job set of time 3000 ms 15/09/22 19:28:26.869 JobScheduler INFO JobScheduler: Total delay: 1442975303.869 s for time 3000 ms (execution: 0.711 s) 15/09/22 19:28:26.873 JobScheduler INFO JobScheduler: Finished job streaming job 3500 ms.0 from job set of time 3500 ms 15/09/22 19:28:26.873 JobScheduler INFO JobScheduler: Total delay: 1442975303.373 s for time 3500 ms (execution: 0.004 s) 15/09/22 19:28:26.879 sparkDriver-akka.actor.default-dispatcher-3 INFO ReceiverTracker: Registered receiver for stream 0 from localhost:57749 15/09/22 19:28:27.154 pool-1-thread-1-ScalaTest-running-CheckpointSuite INFO CheckpointSuite: Manual clock after advancing = 3500 ``` `advanceTimeWithRealDelay(ssc, 2)` triggered the 3000 ms and 3500 ms jobs, but the receiver was registered only after both jobs had finished. So we should make sure the receiver is online before running `advanceTimeWithRealDelay(ssc, 2)`. Author: zsxwing. Closes #8877 from zsxwing/SPARK-10769. 
(cherry picked from commit 50e4634236668a0195390f0080d0ac230d428d05) Signed-off-by: Tathagata Das Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4174b94f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4174b94f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4174b94f Branch: refs/heads/branch-1.5 Commit: 4174b94f05282ca51f1219aa6aba3226e205aee0 Parents: 6a616d0 Author: zsxwing Authored: Wed Sep 23 01:29:30 2015 -0700 Committer: Tathagata Das Committed: Wed Sep 23 01:30:21 2015 -0700 -- .../scala/org/apache/spark/streaming/CheckpointSuite.scala | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4174b94f/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala -- diff --git a/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala index 1bba7a1..a695653 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala @@ -408,10 +408,14 @@ class CheckpointSuite extends TestSuiteBase { ssc = new StreamingContext(checkpointDir) ssc.start() -val outputNew = advanceTimeWithRealDelay(ssc, 2) eventually(timeout(10.seconds)) { assert(RateTestReceiver.getActive().nonEmpty) +} + +advanceTimeWithRealDelay(ssc, 2) + +
spark git commit: [SPARK-10224] [STREAMING] Fix the issue that blockIntervalTimer won't call updateCurrentBuffer when stopping
Repository: spark Updated Branches: refs/heads/branch-1.5 8a23ef59b -> 6a616d0d0 [SPARK-10224] [STREAMING] Fix the issue that blockIntervalTimer won't call updateCurrentBuffer when stopping `blockIntervalTimer.stop(interruptTimer = false)` doesn't guarantee calling `updateCurrentBuffer`. So it's possible that `blockIntervalTimer` will exit while `currentBuffer` is not empty. Then the data in `currentBuffer` will be lost. To reproduce it, you can add `Thread.sleep(200)` in this line (https://github.com/apache/spark/blob/69c9c177160e32a2fbc9b36ecc52156077fca6fc/streaming/src/main/scala/org/apache/spark/streaming/util/RecurringTimer.scala#L100) and run `StreamingContextSuite`. I cannot write a unit test to reproduce it because I cannot find an approach to force `RecurringTimer` to suspend at this line for a few milliseconds. There was a failure in Jenkins here: https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/41455/console This PR updates RecurringTimer to make sure `stop(interruptTimer = false)` will call `callback` at least once after the `stop` method is called. Author: zsxwing. Closes #8417 from zsxwing/SPARK-10224. (cherry picked from commit 44c28abf120754c0175c65ffd3d4587a350b3798) Signed-off-by: Tathagata Das Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6a616d0d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6a616d0d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6a616d0d Branch: refs/heads/branch-1.5 Commit: 6a616d0d02c3fe5d570249695e9ed747bf087dbf Parents: 8a23ef5 Author: zsxwing Authored: Wed Sep 23 01:28:02 2015 -0700 Committer: Tathagata Das Committed: Wed Sep 23 01:28:16 2015 -0700 -- .../spark/streaming/util/RecurringTimer.scala | 19 +++-- .../receiver/BlockGeneratorSuite.scala | 7 +- .../streaming/util/RecurringTimerSuite.scala| 83 3 files changed, 100 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6a616d0d/streaming/src/main/scala/org/apache/spark/streaming/util/RecurringTimer.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/util/RecurringTimer.scala b/streaming/src/main/scala/org/apache/spark/streaming/util/RecurringTimer.scala index dd32ad5..0148cb5 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/util/RecurringTimer.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/util/RecurringTimer.scala @@ -72,8 +72,10 @@ class RecurringTimer(clock: Clock, period: Long, callback: (Long) => Unit, name: /** * Stop the timer, and return the last time the callback was made. - * interruptTimer = true will interrupt the callback + * - interruptTimer = true will interrupt the callback * if it is in progress (not guaranteed to give correct time in this case). + * - interruptTimer = false guarantees that there will be at least one callback after `stop` has + * been called. */ def stop(interruptTimer: Boolean): Long = synchronized { if (!stopped) { @@ -87,18 +89,23 @@ class RecurringTimer(clock: Clock, period: Long, callback: (Long) => Unit, name: prevTime } + private def triggerActionForNextInterval(): Unit = { +clock.waitTillTime(nextTime) +callback(nextTime) +prevTime = nextTime +nextTime += period +logDebug("Callback for " + name + " called at time " + prevTime) + } + /** * Repeatedly call the callback every interval. 
*/ private def loop() { try { while (!stopped) { -clock.waitTillTime(nextTime) -callback(nextTime) -prevTime = nextTime -nextTime += period -logDebug("Callback for " + name + " called at time " + prevTime) +triggerActionForNextInterval() } + triggerActionForNextInterval() } catch { case e: InterruptedException => } http://git-wip-us.apache.org/repos/asf/spark/blob/6a616d0d/streaming/src/test/scala/org/apache/spark/streaming/receiver/BlockGeneratorSuite.scala -- diff --git a/streaming/src/test/scala/org/apache/spark/streaming/receiver/BlockGeneratorSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/receiver/BlockGeneratorSuite.scala index a38cc60..2f11b25 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/receiver/BlockGeneratorSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/receiver/BlockGeneratorSuite.scala @@ -184,9 +184,10 @@ class BlockGeneratorSuite extends SparkFunSuite with BeforeAndAfter { // Verify that the final data is
spark git commit: [SPARK-10769] [STREAMING] [TESTS] Fix o.a.s.streaming.CheckpointSuite.maintains rate controller
Repository: spark Updated Branches: refs/heads/master 44c28abf1 -> 50e463423 [SPARK-10769] [STREAMING] [TESTS] Fix o.a.s.streaming.CheckpointSuite.maintains rate controller Fixed the following failure in https://amplab.cs.berkeley.edu/jenkins/job/NewSparkPullRequestBuilder/1787/testReport/junit/org.apache.spark.streaming/CheckpointSuite/recovery_maintains_rate_controller/ ``` sbt.ForkMain$ForkError: The code passed to eventually never returned normally. Attempted 660 times over 10.4439201 seconds. Last failure message: 9223372036854775807 did not equal 200. at org.scalatest.concurrent.Eventually$class.tryTryAgain$1(Eventually.scala:420) at org.scalatest.concurrent.Eventually$class.eventually(Eventually.scala:438) at org.scalatest.concurrent.Eventually$.eventually(Eventually.scala:478) at org.scalatest.concurrent.Eventually$class.eventually(Eventually.scala:336) at org.scalatest.concurrent.Eventually$.eventually(Eventually.scala:478) at org.apache.spark.streaming.CheckpointSuite$$anonfun$15.apply$mcV$sp(CheckpointSuite.scala:413) at org.apache.spark.streaming.CheckpointSuite$$anonfun$15.apply(CheckpointSuite.scala:396) at org.apache.spark.streaming.CheckpointSuite$$anonfun$15.apply(CheckpointSuite.scala:396) at org.scalatest.Transformer$$anonfun$apply$1.apply$mcV$sp(Transformer.scala:22) at org.scalatest.OutcomeOf$class.outcomeOf(OutcomeOf.scala:85) at org.scalatest.OutcomeOf$.outcomeOf(OutcomeOf.scala:104) at org.scalatest.Transformer.apply(Transformer.scala:22) ``` In this test, it calls `advanceTimeWithRealDelay(ssc, 2)` to run two batch jobs. However, one race condition is that these two jobs can finish before the receiver is registered. Then `UpdateRateLimit` won't be sent to the receiver and `getDefaultBlockGeneratorRateLimit` cannot be updated. Here are the logs related to this issue: ``` 15/09/22 19:28:26.154 pool-1-thread-1-ScalaTest-running-CheckpointSuite INFO CheckpointSuite: Manual clock before advancing = 2500 15/09/22 19:28:26.869 JobScheduler INFO JobScheduler: Finished job streaming job 3000 ms.0 from job set of time 3000 ms 15/09/22 19:28:26.869 JobScheduler INFO JobScheduler: Total delay: 1442975303.869 s for time 3000 ms (execution: 0.711 s) 15/09/22 19:28:26.873 JobScheduler INFO JobScheduler: Finished job streaming job 3500 ms.0 from job set of time 3500 ms 15/09/22 19:28:26.873 JobScheduler INFO JobScheduler: Total delay: 1442975303.373 s for time 3500 ms (execution: 0.004 s) 15/09/22 19:28:26.879 sparkDriver-akka.actor.default-dispatcher-3 INFO ReceiverTracker: Registered receiver for stream 0 from localhost:57749 15/09/22 19:28:27.154 pool-1-thread-1-ScalaTest-running-CheckpointSuite INFO CheckpointSuite: Manual clock after advancing = 3500 ``` `advanceTimeWithRealDelay(ssc, 2)` triggered the 3000 ms and 3500 ms jobs, but the receiver was registered only after both jobs had finished. So we should make sure the receiver is online before running `advanceTimeWithRealDelay(ssc, 2)`. Author: zsxwing. Closes #8877 from zsxwing/SPARK-10769. 
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/50e46342 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/50e46342 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/50e46342 Branch: refs/heads/master Commit: 50e4634236668a0195390f0080d0ac230d428d05 Parents: 44c28ab Author: zsxwing Authored: Wed Sep 23 01:29:30 2015 -0700 Committer: Tathagata Das Committed: Wed Sep 23 01:29:30 2015 -0700 -- .../scala/org/apache/spark/streaming/CheckpointSuite.scala | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/50e46342/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala -- diff --git a/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala index 1bba7a1..a695653 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala @@ -408,10 +408,14 @@ class CheckpointSuite extends TestSuiteBase { ssc = new StreamingContext(checkpointDir) ssc.start() -val outputNew = advanceTimeWithRealDelay(ssc, 2) eventually(timeout(10.seconds)) { assert(RateTestReceiver.getActive().nonEmpty) +} + +advanceTimeWithRealDelay(ssc, 2) + +eventually(timeout(10.seconds)) { assert(RateTestReceiver.getActive().get.getDefaultBlockGeneratorRateLimit() === 200) }
spark git commit: [SPARK-10652] [SPARK-10742] [STREAMING] Set meaningful job descriptions for all streaming jobs
Repository: spark Updated Branches: refs/heads/master 558e9c7e6 -> 5548a2547 [SPARK-10652] [SPARK-10742] [STREAMING] Set meaningful job descriptions for all streaming jobs Here are the screenshots after adding job descriptions to the threads that run receivers and to the scheduler thread that runs the batch jobs. ## All jobs page * Added job descriptions with links to relevant batch details page ![image](https://cloud.githubusercontent.com/assets/663212/9924165/cda4a372-5cb1-11e5-91ca-d43a32c699e9.png) ## All stages page * Added stage descriptions with links to relevant batch details page ![image](https://cloud.githubusercontent.com/assets/663212/9923814/2cce266a-5cae-11e5-8a3f-dad84d06c50e.png) ## Streaming batch details page * Added the +details link ![image](https://cloud.githubusercontent.com/assets/663212/9921977/24014a32-5c98-11e5-958e-457b6c38065b.png) Author: Tathagata Das <tathagata.das1...@gmail.com> Closes #8791 from tdas/SPARK-10652. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5548a254 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5548a254 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5548a254 Branch: refs/heads/master Commit: 5548a254755bb84edae2768b94ab1816e1b49b91 Parents: 558e9c7 Author: Tathagata Das <tathagata.das1...@gmail.com> Authored: Tue Sep 22 22:44:09 2015 -0700 Committer: Tathagata Das <tathagata.das1...@gmail.com> Committed: Tue Sep 22 22:44:09 2015 -0700 -- .../scala/org/apache/spark/ui/UIUtils.scala | 62 +- .../org/apache/spark/ui/jobs/AllJobsPage.scala | 14 +++-- .../org/apache/spark/ui/jobs/StageTable.scala | 7 +-- .../org/apache/spark/ui/UIUtilsSuite.scala | 66 .../spark/streaming/StreamingContext.scala | 4 +- .../streaming/scheduler/JobScheduler.scala | 15 - .../streaming/scheduler/ReceiverTracker.scala | 5 +- .../apache/spark/streaming/ui/BatchPage.scala | 33 ++ .../spark/streaming/StreamingContextSuite.scala | 2 +- 9 files changed, 179 insertions(+), 29 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5548a254/core/src/main/scala/org/apache/spark/ui/UIUtils.scala -- diff --git a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala index f2da417..21dc8f0 100644 --- a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala +++ b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala @@ -18,9 +18,11 @@ package org.apache.spark.ui import java.text.SimpleDateFormat -import java.util.{Locale, Date} +import java.util.{Date, Locale} -import scala.xml.{Node, Text, Unparsed} +import scala.util.control.NonFatal +import scala.xml._ +import scala.xml.transform.{RewriteRule, RuleTransformer} import org.apache.spark.Logging import org.apache.spark.ui.scope.RDDOperationGraph @@ -395,4 +397,60 @@ private[spark] object UIUtils extends Logging { } + /** + * Returns HTML rendering of a job or stage description. It will try to parse the string as HTML + * and make sure that it only contains anchors with root-relative links. Otherwise, + * the whole string will rendered as a simple escaped text. + * + * Note: In terms of security, only anchor tags with root relative links are supported. So any + * attempts to embed links outside Spark UI, or other tags like
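The diff above covers the rendering side: a new UIUtils helper that sanitizes HTML descriptions. The producing side boils down to setting thread-local properties before submitting jobs. A hedged sketch, where `sc` and `rdd` are an assumed active SparkContext and RDD, and the batch time and link target are illustrative:

```
// The scheduler thread tags the jobs it submits with a batch-specific group
// and an HTML description; per the helper above, only anchor tags with
// root-relative hrefs survive sanitization, anything else is escaped.
val batchTime = 1442975303000L // illustrative batch time in ms
sc.setJobGroup(s"streaming-batch-$batchTime", "streaming batch", interruptOnCancel = false)
sc.setJobDescription(
  s"""Streaming job from <a href="/streaming/batch/?id=$batchTime">batch @ $batchTime</a>""")
rdd.count() // this job now shows the linked description on the jobs/stages pages
```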
spark git commit: [SPARK-10652] [SPARK-10742] [STREAMING] Set meaningful job descriptions for all streaming jobs
Repository: spark Updated Branches: refs/heads/branch-1.5 7f07cc6d0 -> 8a23ef59b [SPARK-10652] [SPARK-10742] [STREAMING] Set meaningful job descriptions for all streaming jobs Here are the screenshots after adding job descriptions to the threads that run receivers and to the scheduler thread that runs the batch jobs. ## All jobs page * Added job descriptions with links to relevant batch details page ![image](https://cloud.githubusercontent.com/assets/663212/9924165/cda4a372-5cb1-11e5-91ca-d43a32c699e9.png) ## All stages page * Added stage descriptions with links to relevant batch details page ![image](https://cloud.githubusercontent.com/assets/663212/9923814/2cce266a-5cae-11e5-8a3f-dad84d06c50e.png) ## Streaming batch details page * Added the +details link ![image](https://cloud.githubusercontent.com/assets/663212/9921977/24014a32-5c98-11e5-958e-457b6c38065b.png) Author: Tathagata Das <tathagata.das1...@gmail.com> Closes #8791 from tdas/SPARK-10652. (cherry picked from commit 5548a254755bb84edae2768b94ab1816e1b49b91) Signed-off-by: Tathagata Das <tathagata.das1...@gmail.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8a23ef59 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8a23ef59 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8a23ef59 Branch: refs/heads/branch-1.5 Commit: 8a23ef59bc462370cf8729613aebbd1639f6be0f Parents: 7f07cc6 Author: Tathagata Das <tathagata.das1...@gmail.com> Authored: Tue Sep 22 22:44:09 2015 -0700 Committer: Tathagata Das <tathagata.das1...@gmail.com> Committed: Tue Sep 22 22:45:05 2015 -0700 -- .../scala/org/apache/spark/ui/UIUtils.scala | 62 +- .../org/apache/spark/ui/jobs/AllJobsPage.scala | 14 +++-- .../org/apache/spark/ui/jobs/StageTable.scala | 7 +-- .../org/apache/spark/ui/UIUtilsSuite.scala | 66 .../spark/streaming/StreamingContext.scala | 4 +- .../streaming/scheduler/JobScheduler.scala | 15 - .../streaming/scheduler/ReceiverTracker.scala | 5 +- .../apache/spark/streaming/ui/BatchPage.scala | 33 ++ .../spark/streaming/StreamingContextSuite.scala | 2 +- 9 files changed, 179 insertions(+), 29 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8a23ef59/core/src/main/scala/org/apache/spark/ui/UIUtils.scala -- diff --git a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala index f2da417..21dc8f0 100644 --- a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala +++ b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala @@ -18,9 +18,11 @@ package org.apache.spark.ui import java.text.SimpleDateFormat -import java.util.{Locale, Date} +import java.util.{Date, Locale} -import scala.xml.{Node, Text, Unparsed} +import scala.util.control.NonFatal +import scala.xml._ +import scala.xml.transform.{RewriteRule, RuleTransformer} import org.apache.spark.Logging import org.apache.spark.ui.scope.RDDOperationGraph @@ -395,4 +397,60 @@ private[spark] object UIUtils extends Logging { } + /** + * Returns HTML rendering of a job or stage description. It will try to parse the string as HTML + * and make sure that it only contains anchors with root-relative links. Otherwise, + * the whole string will rendered as a simple escaped text. + * + * Note: In terms of security, only anchor tags with root relative links are supported. So any + * attempts to embed links outside Spark UI, or other tags like
spark git commit: [SPARK-10649] [STREAMING] Prevent inheriting job group and irrelevant job description in streaming jobs
Repository: spark Updated Branches: refs/heads/master 7c4f852bf -> 72869883f [SPARK-10649] [STREAMING] Prevent inheriting job group and irrelevant job description in streaming jobs The job group and job description information is passed through thread-local properties and gets inherited by child threads. In the case of Spark Streaming, the streaming jobs inherit these properties from the thread that called streamingContext.start(). This may not make sense. 1. Job group: This is mainly used for cancelling a group of jobs together. It does not make sense to cancel streaming jobs like this, as the effect will be unpredictable, and it is not a valid use case anyway; to cancel a streaming context, call streamingContext.stop(). 2. Job description: This is used to pass on nice text descriptions for jobs to show up in the UI. The job description of the thread that calls streamingContext.start() is not useful for the streaming jobs, as it does not make sense for all of them to have the same description, and the description may or may not be related to streaming. The solution in this PR is meant for the Spark master branch, where local properties are inherited by cloning the properties. The job group and job description in the thread that starts the streaming scheduler are explicitly removed, so that subsequent child threads do not inherit them. Also, the start-up is done in a new child thread, so that setting the job group and description for streaming does not change those properties in the thread that called streamingContext.start(). Author: Tathagata Das <tathagata.das1...@gmail.com> Closes #8781 from tdas/SPARK-10649. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/72869883 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/72869883 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/72869883 Branch: refs/heads/master Commit: 72869883f12b6e0a4e5aad79c0ac2cfdb4d83f09 Parents: 7c4f852 Author: Tathagata Das <tathagata.das1...@gmail.com> Authored: Mon Sep 21 16:47:52 2015 -0700 Committer: Tathagata Das <tathagata.das1...@gmail.com> Committed: Mon Sep 21 16:47:52 2015 -0700 -- .../org/apache/spark/util/ThreadUtils.scala | 59 .../apache/spark/util/ThreadUtilsSuite.scala| 24 +++- .../spark/streaming/StreamingContext.scala | 15 - .../spark/streaming/StreamingContextSuite.scala | 32 +++ 4 files changed, 126 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/72869883/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala -- diff --git a/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala b/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala index ca5624a..22e291a 100644 --- a/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala +++ b/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala @@ -21,6 +21,7 @@ package org.apache.spark.util import java.util.concurrent._ import scala.concurrent.{ExecutionContext, ExecutionContextExecutor} +import scala.util.control.NonFatal import com.google.common.util.concurrent.{MoreExecutors, ThreadFactoryBuilder} @@ -86,4 +87,62 @@ private[spark] object ThreadUtils { val threadFactory = new ThreadFactoryBuilder().setDaemon(true).setNameFormat(threadName).build() Executors.newSingleThreadScheduledExecutor(threadFactory) } + + /** + * Run a piece of code in a new thread and return the result. 
Exception in the new thread is + * thrown in the caller thread with an adjusted stack trace that removes references to this + * method for clarity. The exception stack traces will be like the following + * + * SomeException: exception-message + * at CallerClass.body-method (sourcefile.scala) + * at ... run in separate thread using org.apache.spark.util.ThreadUtils ... () + * at CallerClass.caller-method (sourcefile.scala) + * ... + */ + def runInNewThread[T]( + threadName: String, + isDaemon: Boolean = true)(body: => T): T = { +@volatile var exception: Option[Throwable] = None +@volatile var result: T = null.asInstanceOf[T] + +val thread = new Thread(threadName) { + override def run(): Unit = { +try { + result = body +} catch { + case NonFatal(e) => +exception = Some(e) +} + } +} +thread.setDaemon(isDaemon) +thread.start() +thread.join() + +exception match { + case Some(realException) => +// Remove the part of the stack that shows method calls into this helper method +// This means drop everything from t
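`ThreadUtils.runInNewThread` is `private[spark]`, so this is internal API rather than something user code can call. A sketch of how the PR applies it inside StreamingContext, paraphrased from the change (`scheduler` is a field of that class; details may differ from the committed code):

```
// Start the scheduler in a fresh thread and drop the inherited job group and
// job description there. Since local properties are cloned per thread on
// master, the scrubbing does not leak back into the caller's thread either.
ThreadUtils.runInNewThread("streaming-start") {
  sparkContext.clearJobGroup() // removes job group, description, interrupt flag
  scheduler.start()            // jobs launched from here inherit clean properties
}
```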
spark git commit: [SPARK-10492] [STREAMING] [DOCUMENTATION] Update Streaming documentation about rate limiting and backpressure
Repository: spark Updated Branches: refs/heads/master e6f8d3686 -> 52b24a602 [SPARK-10492] [STREAMING] [DOCUMENTATION] Update Streaming documentation about rate limiting and backpressure Author: Tathagata Das <tathagata.das1...@gmail.com> Closes #8656 from tdas/SPARK-10492 and squashes the following commits: 986cdd6 [Tathagata Das] Added information on backpressure Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/52b24a60 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/52b24a60 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/52b24a60 Branch: refs/heads/master Commit: 52b24a602ad615a7f6aa427aefb1c7444c05d298 Parents: e6f8d36 Author: Tathagata Das <tathagata.das1...@gmail.com> Authored: Tue Sep 8 14:54:43 2015 -0700 Committer: Tathagata Das <tathagata.das1...@gmail.com> Committed: Tue Sep 8 14:54:43 2015 -0700 -- docs/configuration.md | 13 + docs/streaming-programming-guide.md | 13 - 2 files changed, 25 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/52b24a60/docs/configuration.md -- diff --git a/docs/configuration.md b/docs/configuration.md index a2cc7a3..e287591 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -1434,6 +1434,19 @@ Apart from these, the following properties are also available, and may be useful Property NameDefaultMeaning + spark.streaming.backpressure.enabled + false + +Enables or disables Spark Streaming's internal backpressure mechanism (since 1.5). +This enables the Spark Streaming to control the receiving rate based on the +current batch scheduling delays and processing times so that the system receives +only as fast as the system can process. Internally, this dynamically sets the +maximum receiving rate of receivers. This rate is upper bounded by the values +`spark.streaming.receiver.maxRate` and `spark.streaming.kafka.maxRatePerPartition` +if they are set (see below). + + + spark.streaming.blockInterval 200ms http://git-wip-us.apache.org/repos/asf/spark/blob/52b24a60/docs/streaming-programming-guide.md -- diff --git a/docs/streaming-programming-guide.md b/docs/streaming-programming-guide.md index a1acf83..c751dbb 100644 --- a/docs/streaming-programming-guide.md +++ b/docs/streaming-programming-guide.md @@ -1807,7 +1807,7 @@ To run a Spark Streaming applications, you need to have the following. + *Mesos* - [Marathon](https://github.com/mesosphere/marathon) has been used to achieve this with Mesos. -- *[Since Spark 1.2] Configuring write ahead logs* - Since Spark 1.2, +- *Configuring write ahead logs* - Since Spark 1.2, we have introduced _write ahead logs_ for achieving strong fault-tolerance guarantees. If enabled, all the data received from a receiver gets written into a write ahead log in the configuration checkpoint directory. This prevents data loss on driver @@ -1822,6 +1822,17 @@ To run a Spark Streaming applications, you need to have the following. stored in a replicated storage system. This can be done by setting the storage level for the input stream to `StorageLevel.MEMORY_AND_DISK_SER`. +- *Setting the max receiving rate* - If the cluster resources is not large enough for the streaming + application to process data as fast as it is being received, the receivers can be rate limited + by setting a maximum rate limit in terms of records / sec. 
+ See the [configuration parameters](configuration.html#spark-streaming) + `spark.streaming.receiver.maxRate` for receivers and `spark.streaming.kafka.maxRatePerPartition` + for Direct Kafka approach. In Spark 1.5, we have introduced a feature called *backpressure* that + eliminate the need to set this rate limit, as Spark Streaming automatically figures out the + rate limits and dynamically adjusts them if the processing conditions change. This backpressure + can be enabled by setting the [configuration parameter](configuration.html#spark-streaming) + `spark.streaming.backpressure.enabled` to `true`. + ### Upgrading Application Code {:.no_toc} - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
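The documented knobs, combined in one sketch (values are illustrative; the keys are exactly the ones named in the doc changes above):

```
import org.apache.spark.SparkConf

val conf = new SparkConf()
  .setAppName("BackpressureDemo") // illustrative
  // Let Spark Streaming adapt the receiving rate to the current batch
  // scheduling delays and processing times (available since Spark 1.5):
  .set("spark.streaming.backpressure.enabled", "true")
  // Optional static ceilings that the dynamic rate never exceeds:
  .set("spark.streaming.receiver.maxRate", "10000")          // records/sec per receiver
  .set("spark.streaming.kafka.maxRatePerPartition", "1000")  // records/sec per partition (direct Kafka)
```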
spark git commit: [SPARK-10492] [STREAMING] [DOCUMENTATION] Update Streaming documentation about rate limiting and backpressure
Repository: spark Updated Branches: refs/heads/branch-1.5 7fd4674fc -> 63c72b93e [SPARK-10492] [STREAMING] [DOCUMENTATION] Update Streaming documentation about rate limiting and backpressure Author: Tathagata Das <tathagata.das1...@gmail.com> Closes #8656 from tdas/SPARK-10492 and squashes the following commits: 986cdd6 [Tathagata Das] Added information on backpressure (cherry picked from commit 52b24a602ad615a7f6aa427aefb1c7444c05d298) Signed-off-by: Tathagata Das <tathagata.das1...@gmail.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/63c72b93 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/63c72b93 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/63c72b93 Branch: refs/heads/branch-1.5 Commit: 63c72b93eb51685814543a39caf9a6d221e2583c Parents: 7fd4674 Author: Tathagata Das <tathagata.das1...@gmail.com> Authored: Tue Sep 8 14:54:43 2015 -0700 Committer: Tathagata Das <tathagata.das1...@gmail.com> Committed: Tue Sep 8 14:54:54 2015 -0700 -- docs/configuration.md | 13 + docs/streaming-programming-guide.md | 13 - 2 files changed, 25 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/63c72b93/docs/configuration.md -- diff --git a/docs/configuration.md b/docs/configuration.md index 77c5cbc..353efdb 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -1438,6 +1438,19 @@ Apart from these, the following properties are also available, and may be useful Property NameDefaultMeaning + spark.streaming.backpressure.enabled + false + +Enables or disables Spark Streaming's internal backpressure mechanism (since 1.5). +This enables the Spark Streaming to control the receiving rate based on the +current batch scheduling delays and processing times so that the system receives +only as fast as the system can process. Internally, this dynamically sets the +maximum receiving rate of receivers. This rate is upper bounded by the values +`spark.streaming.receiver.maxRate` and `spark.streaming.kafka.maxRatePerPartition` +if they are set (see below). + + + spark.streaming.blockInterval 200ms http://git-wip-us.apache.org/repos/asf/spark/blob/63c72b93/docs/streaming-programming-guide.md -- diff --git a/docs/streaming-programming-guide.md b/docs/streaming-programming-guide.md index a1acf83..c751dbb 100644 --- a/docs/streaming-programming-guide.md +++ b/docs/streaming-programming-guide.md @@ -1807,7 +1807,7 @@ To run a Spark Streaming applications, you need to have the following. + *Mesos* - [Marathon](https://github.com/mesosphere/marathon) has been used to achieve this with Mesos. -- *[Since Spark 1.2] Configuring write ahead logs* - Since Spark 1.2, +- *Configuring write ahead logs* - Since Spark 1.2, we have introduced _write ahead logs_ for achieving strong fault-tolerance guarantees. If enabled, all the data received from a receiver gets written into a write ahead log in the configuration checkpoint directory. This prevents data loss on driver @@ -1822,6 +1822,17 @@ To run a Spark Streaming applications, you need to have the following. stored in a replicated storage system. This can be done by setting the storage level for the input stream to `StorageLevel.MEMORY_AND_DISK_SER`. +- *Setting the max receiving rate* - If the cluster resources is not large enough for the streaming + application to process data as fast as it is being received, the receivers can be rate limited + by setting a maximum rate limit in terms of records / sec. 
+ See the [configuration parameters](configuration.html#spark-streaming) + `spark.streaming.receiver.maxRate` for receivers and `spark.streaming.kafka.maxRatePerPartition` + for Direct Kafka approach. In Spark 1.5, we have introduced a feature called *backpressure* that + eliminate the need to set this rate limit, as Spark Streaming automatically figures out the + rate limits and dynamically adjusts them if the processing conditions change. This backpressure + can be enabled by setting the [configuration parameter](configuration.html#spark-streaming) + `spark.streaming.backpressure.enabled` to `true`. + ### Upgrading Application Code {:.no_toc} - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-10369] [STREAMING] Don't remove ReceiverTrackingInfo when deregistering a receiver since we may reuse it later
Repository: spark Updated Branches: refs/heads/master 72f6dbf7b -> 4a5fe0916 [SPARK-10369] [STREAMING] Don't remove ReceiverTrackingInfo when deregistering a receiver since we may reuse it later `deregisterReceiver` should not remove `ReceiverTrackingInfo`. Otherwise, it will throw `java.util.NoSuchElementException: key not found` when restarting it. Author: zsxwing. Closes #8538 from zsxwing/SPARK-10369. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4a5fe091 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4a5fe091 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4a5fe091 Branch: refs/heads/master Commit: 4a5fe091658b1d06f427e404a11a84fc84f953c5 Parents: 72f6dbf Author: zsxwing Authored: Mon Aug 31 12:19:11 2015 -0700 Committer: Tathagata Das Committed: Mon Aug 31 12:19:11 2015 -0700 -- .../streaming/scheduler/ReceiverTracker.scala | 4 +- .../scheduler/ReceiverTrackerSuite.scala| 51 2 files changed, 53 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4a5fe091/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala index 3d532a6..f86fd44 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala @@ -291,7 +291,7 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false ReceiverTrackingInfo( streamId, ReceiverState.INACTIVE, None, None, None, None, Some(errorInfo)) } -receiverTrackingInfos -= streamId +receiverTrackingInfos(streamId) = newReceiverTrackingInfo listenerBus.post(StreamingListenerReceiverStopped(newReceiverTrackingInfo.toReceiverInfo)) val messageWithError = if (error != null && !error.isEmpty) { s"$message - $error" @@ -483,7 +483,7 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false context.reply(true) // Local messages case AllReceiverIds => -context.reply(receiverTrackingInfos.keys.toSeq) +context.reply(receiverTrackingInfos.filter(_._2.state != ReceiverState.INACTIVE).keys.toSeq) case StopAllReceivers => assert(isTrackerStopping || isTrackerStopped) stopReceivers() http://git-wip-us.apache.org/repos/asf/spark/blob/4a5fe091/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala -- diff --git a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala index dd292ba..45138b7 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala @@ -60,6 +60,26 @@ class ReceiverTrackerSuite extends TestSuiteBase { } } } + + test("should restart receiver after stopping it") { +withStreamingContext(new StreamingContext(conf, Milliseconds(100))) { ssc => + @volatile var startTimes = 0 + ssc.addStreamingListener(new StreamingListener { +override def onReceiverStarted(receiverStarted: StreamingListenerReceiverStarted): Unit = { + startTimes += 1 +} + }) + val input = ssc.receiverStream(new StoppableReceiver) + val output = new TestOutputStream(input) + output.register() + ssc.start() + 
StoppableReceiver.shouldStop = true + eventually(timeout(10 seconds), interval(10 millis)) { +// The receiver is stopped once, so if it's restarted, it should be started twice. +assert(startTimes === 2) + } +} + } } /** An input DStream with for testing rate controlling */ @@ -132,3 +152,34 @@ private[streaming] object RateTestReceiver { def getActive(): Option[RateTestReceiver] = Option(activeReceiver) } + +/** + * A custom receiver that could be stopped via StoppableReceiver.shouldStop + */ +class StoppableReceiver extends Receiver[Int](StorageLevel.MEMORY_ONLY) { + + var receivingThreadOption: Option[Thread] = None + + def onStart() { +val thread = new Thread() { + override def run() { +while (!StoppableReceiver.shouldStop) { + Thread.sleep(10) +} +
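The shape of this fix is easy to lose in the diff, so here is a compact sketch of the pattern it relies on (hypothetical types and method names, not the real ReceiverTracker): deregistration downgrades the tracking entry to INACTIVE instead of deleting it, and the active-ids query filters the INACTIVE entries out.

```scala
import scala.collection.mutable

// Hypothetical, heavily simplified tracker illustrating the SPARK-10369 fix:
// a stopped receiver's entry is kept (marked INACTIVE) instead of being removed,
// so a later restart can still find it rather than hitting
// java.util.NoSuchElementException.
object ReceiverTrackingSketch {
  sealed trait State
  case object Active extends State
  case object Inactive extends State

  private val trackingInfos = mutable.HashMap[Int, State]()

  def register(streamId: Int): Unit = trackingInfos(streamId) = Active

  // The fix: overwrite with an INACTIVE entry instead of `trackingInfos -= streamId`.
  def deregister(streamId: Int): Unit = trackingInfos(streamId) = Inactive

  // A restart looks the old entry up; this would throw if deregister had removed it.
  def restart(streamId: Int): Unit = {
    require(trackingInfos(streamId) == Inactive, s"receiver $streamId is still active")
    trackingInfos(streamId) = Active
  }

  // Mirrors the AllReceiverIds change: report only the non-INACTIVE receivers.
  def activeIds: Seq[Int] = trackingInfos.filter(_._2 != Inactive).keys.toSeq
}
```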
spark git commit: [SPARK-10369] [STREAMING] Don't remove ReceiverTrackingInfo when deregisterReceivering since we may reuse it later
Repository: spark Updated Branches: refs/heads/branch-1.5 bf5b2f26b -> 33ce274cd [SPARK-10369] [STREAMING] Don't remove ReceiverTrackingInfo when deregisterReceivering since we may reuse it later `deregisterReceiver` should not remove `ReceiverTrackingInfo`. Otherwise, it will throw `java.util.NoSuchElementException: key not found` when restarting it. Author: zsxwingCloses #8538 from zsxwing/SPARK-10369. (cherry picked from commit 4a5fe091658b1d06f427e404a11a84fc84f953c5) Signed-off-by: Tathagata Das Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/33ce274c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/33ce274c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/33ce274c Branch: refs/heads/branch-1.5 Commit: 33ce274cdf7538b5816f1a400b2fad394ec2a056 Parents: bf5b2f2 Author: zsxwing Authored: Mon Aug 31 12:19:11 2015 -0700 Committer: Tathagata Das Committed: Mon Aug 31 12:19:48 2015 -0700 -- .../streaming/scheduler/ReceiverTracker.scala | 4 +- .../scheduler/ReceiverTrackerSuite.scala| 51 2 files changed, 53 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/33ce274c/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala index 3d532a6..f86fd44 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala @@ -291,7 +291,7 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false ReceiverTrackingInfo( streamId, ReceiverState.INACTIVE, None, None, None, None, Some(errorInfo)) } -receiverTrackingInfos -= streamId +receiverTrackingInfos(streamId) = newReceiverTrackingInfo listenerBus.post(StreamingListenerReceiverStopped(newReceiverTrackingInfo.toReceiverInfo)) val messageWithError = if (error != null && !error.isEmpty) { s"$message - $error" @@ -483,7 +483,7 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false context.reply(true) // Local messages case AllReceiverIds => -context.reply(receiverTrackingInfos.keys.toSeq) +context.reply(receiverTrackingInfos.filter(_._2.state != ReceiverState.INACTIVE).keys.toSeq) case StopAllReceivers => assert(isTrackerStopping || isTrackerStopped) stopReceivers() http://git-wip-us.apache.org/repos/asf/spark/blob/33ce274c/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala -- diff --git a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala index dd292ba..45138b7 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala @@ -60,6 +60,26 @@ class ReceiverTrackerSuite extends TestSuiteBase { } } } + + test("should restart receiver after stopping it") { +withStreamingContext(new StreamingContext(conf, Milliseconds(100))) { ssc => + @volatile var startTimes = 0 + ssc.addStreamingListener(new StreamingListener { +override def onReceiverStarted(receiverStarted: StreamingListenerReceiverStarted): Unit = { + startTimes += 1 +} + }) + val input = ssc.receiverStream(new 
StoppableReceiver) + val output = new TestOutputStream(input) + output.register() + ssc.start() + StoppableReceiver.shouldStop = true + eventually(timeout(10 seconds), interval(10 millis)) { +// The receiver is stopped once, so if it's restarted, it should be started twice. +assert(startTimes === 2) + } +} + } } /** An input DStream with for testing rate controlling */ @@ -132,3 +152,34 @@ private[streaming] object RateTestReceiver { def getActive(): Option[RateTestReceiver] = Option(activeReceiver) } + +/** + * A custom receiver that could be stopped via StoppableReceiver.shouldStop + */ +class StoppableReceiver extends Receiver[Int](StorageLevel.MEMORY_ONLY) { + + var receivingThreadOption: Option[Thread] = None + + def onStart() { +val thread =
spark git commit: [SPARK-9786] [STREAMING] [KAFKA] fix backpressure so it works with defa…
Repository: spark Updated Branches: refs/heads/branch-1.5 2239a2036 -> 88991dc4f [SPARK-9786] [STREAMING] [KAFKA] fix backpressure so it works with default maxRatePerPartition setting of 0 Author: cody koeninger c...@koeninger.org Closes #8413 from koeninger/backpressure-testing-master. (cherry picked from commit d9c25dec87e6da7d66a47ff94e7eefa008081b9d) Signed-off-by: Tathagata Das tathagata.das1...@gmail.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/88991dc4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/88991dc4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/88991dc4 Branch: refs/heads/branch-1.5 Commit: 88991dc4f04b0c88466c6eab5ada43506c981341 Parents: 2239a20 Author: cody koeninger c...@koeninger.org Authored: Mon Aug 24 23:26:14 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Mon Aug 24 23:26:27 2015 -0700 -- .../spark/streaming/kafka/DirectKafkaInputDStream.scala | 9 +++-- 1 file changed, 7 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/88991dc4/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala -- diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala index 8a17707..194 100644 --- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala +++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala @@ -95,8 +95,13 @@ class DirectKafkaInputDStream[ val effectiveRateLimitPerPartition = estimatedRateLimit .filter(_ > 0) - .map(limit => Math.min(maxRateLimitPerPartition, (limit / numPartitions))) - .getOrElse(maxRateLimitPerPartition) + .map { limit => +if (maxRateLimitPerPartition > 0) { + Math.min(maxRateLimitPerPartition, (limit / numPartitions)) +} else { + limit / numPartitions +} + }.getOrElse(maxRateLimitPerPartition) if (effectiveRateLimitPerPartition > 0) { val secsPerBatch = context.graph.batchDuration.milliseconds.toDouble / 1000 - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
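The hunk above reduces to a small pure function. The sketch below restates it standalone, with assumed parameter names taken from the diff; it illustrates the corrected logic rather than the actual DStream internals: a maxRatePerPartition of 0 means "no static cap", so the backpressure estimate must not be clamped to 0.

```scala
object RateLimitSketch {
  def effectiveRatePerPartition(
      estimatedRateLimit: Option[Long], // from the backpressure rate estimator
      maxRateLimitPerPartition: Long,   // spark.streaming.kafka.maxRatePerPartition
      numPartitions: Int): Long = {
    estimatedRateLimit
      .filter(_ > 0)
      .map { limit =>
        if (maxRateLimitPerPartition > 0) {
          // A positive static cap still wins over a larger estimate.
          math.min(maxRateLimitPerPartition, limit / numPartitions)
        } else {
          // Default setting of 0: rely on the estimate alone instead of min(0, ...).
          limit / numPartitions
        }
      }
      .getOrElse(maxRateLimitPerPartition)
  }
}
```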
spark git commit: [SPARK-10137] [STREAMING] Avoid to restart receivers if scheduleReceivers returns balanced results
Repository: spark Updated Branches: refs/heads/branch-1.5 88991dc4f -> bb1357f36 [SPARK-10137] [STREAMING] Avoid to restart receivers if scheduleReceivers returns balanced results This PR fixes the following cases for `ReceiverSchedulingPolicy`. 1) Assume there are 4 executors: host1, host2, host3, host4, and 5 receivers: r1, r2, r3, r4, r5. Then `ReceiverSchedulingPolicy.scheduleReceivers` will return (r1 -> host1, r2 -> host2, r3 -> host3, r4 -> host4, r5 -> host1). Let's assume r1 starts first on `host1` as `scheduleReceivers` suggested, and tries to register with ReceiverTracker. But the previous `ReceiverSchedulingPolicy.rescheduleReceiver` will return (host2, host3, host4) according to the current executor weights (host1 -> 1.0, host2 -> 0.5, host3 -> 0.5, host4 -> 0.5), so ReceiverTracker will reject `r1`. This is unexpected since r1 is starting exactly where `scheduleReceivers` suggested. This case can be fixed by ignoring the information of the receiver that is rescheduling in `receiverTrackingInfoMap`. 2) Assume there are 3 executors (host1, host2, host3), each executor has 3 cores, and there are 3 receivers: r1, r2, r3. Assume r1 is running on host1. Now r2 is restarting; the previous `ReceiverSchedulingPolicy.rescheduleReceiver` will always return (host1, host2, host3). So it's possible that r2 will be scheduled to host1 by TaskScheduler. r3 is similar. Then at last, it's possible that there are 3 receivers running on host1, while host2 and host3 are idle. This issue can be fixed by returning only executors that have the minimum weight rather than returning at least 3 executors. Author: zsxwing zsxw...@gmail.com Closes #8340 from zsxwing/fix-receiver-scheduling. (cherry picked from commit f023aa2fcc1d1dbb82aee568be0a8f2457c309ae) Signed-off-by: Tathagata Das tathagata.das1...@gmail.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bb1357f3 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bb1357f3 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bb1357f3 Branch: refs/heads/branch-1.5 Commit: bb1357f362cdd96b854c2a0a193496ce709cdbdd Parents: 88991dc Author: zsxwing zsxw...@gmail.com Authored: Mon Aug 24 23:34:50 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Mon Aug 24 23:35:02 2015 -0700 -- .../scheduler/ReceiverSchedulingPolicy.scala| 58 +++--- .../streaming/scheduler/ReceiverTracker.scala | 106 --- .../ReceiverSchedulingPolicySuite.scala | 13 +-- 3 files changed, 120 insertions(+), 57 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/bb1357f3/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicy.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicy.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicy.scala index ef5b687..10b5a7f 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicy.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicy.scala @@ -22,6 +22,36 @@ import scala.collection.mutable import org.apache.spark.streaming.receiver.Receiver +/** + * A class that tries to schedule receivers with evenly distributed. There are two phases for + * scheduling receivers. + * + * - The first phase is global scheduling when ReceiverTracker is starting and we need to schedule + * all receivers at the same time.
ReceiverTracker will call `scheduleReceivers` at this phase. + * It will try to schedule receivers with evenly distributed. ReceiverTracker should update its + * receiverTrackingInfoMap according to the results of `scheduleReceivers`. + * `ReceiverTrackingInfo.scheduledExecutors` for each receiver will set to an executor list that + * contains the scheduled locations. Then when a receiver is starting, it will send a register + * request and `ReceiverTracker.registerReceiver` will be called. In + * `ReceiverTracker.registerReceiver`, if a receiver's scheduled executors is set, it should check + * if the location of this receiver is one of the scheduled executors, if not, the register will + * be rejected. + * - The second phase is local scheduling when a receiver is restarting. There are two cases of + * receiver restarting: + * - If a receiver is restarting because it's rejected due to the real location and the scheduled + * executors mismatching, in other words, it fails to start in one of the locations that + * `scheduleReceivers` suggested, `ReceiverTracker` should firstly choose the executors that are + * still alive in the list of
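The second fix above ("return only executors that have the minimum weight") is easy to state in isolation. A minimal sketch under assumed inputs, a plain host-to-weight map rather than the real scheduler state:

```scala
object SchedulingSketch {
  // When rescheduling one receiver, return only the executors carrying the
  // minimum weight, so the TaskScheduler cannot keep piling restarted
  // receivers onto an already-loaded host.
  def leastLoadedExecutors(executorWeights: Map[String, Double]): Seq[String] = {
    if (executorWeights.isEmpty) {
      Seq.empty
    } else {
      val minWeight = executorWeights.values.min
      executorWeights.collect { case (host, weight) if weight == minWeight => host }.toSeq
    }
  }
}

// With the weights from case 2 above, only the idle hosts come back:
// SchedulingSketch.leastLoadedExecutors(
//   Map("host1" -> 1.0, "host2" -> 0.5, "host3" -> 0.5)) == Seq("host2", "host3")
```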
spark git commit: [SPARK-9786] [STREAMING] [KAFKA] fix backpressure so it works with defa…
Repository: spark Updated Branches: refs/heads/master 5175ca0c8 -> d9c25dec8 [SPARK-9786] [STREAMING] [KAFKA] fix backpressure so it works with default maxRatePerPartition setting of 0 Author: cody koeninger c...@koeninger.org Closes #8413 from koeninger/backpressure-testing-master. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d9c25dec Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d9c25dec Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d9c25dec Branch: refs/heads/master Commit: d9c25dec87e6da7d66a47ff94e7eefa008081b9d Parents: 5175ca0 Author: cody koeninger c...@koeninger.org Authored: Mon Aug 24 23:26:14 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Mon Aug 24 23:26:14 2015 -0700 -- .../spark/streaming/kafka/DirectKafkaInputDStream.scala | 9 +++-- 1 file changed, 7 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d9c25dec/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala -- diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala index 8a17707..194 100644 --- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala +++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala @@ -95,8 +95,13 @@ class DirectKafkaInputDStream[ val effectiveRateLimitPerPartition = estimatedRateLimit .filter(_ > 0) - .map(limit => Math.min(maxRateLimitPerPartition, (limit / numPartitions))) - .getOrElse(maxRateLimitPerPartition) + .map { limit => +if (maxRateLimitPerPartition > 0) { + Math.min(maxRateLimitPerPartition, (limit / numPartitions)) +} else { + limit / numPartitions +} + }.getOrElse(maxRateLimitPerPartition) if (effectiveRateLimitPerPartition > 0) { val secsPerBatch = context.graph.batchDuration.milliseconds.toDouble / 1000 - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-10137] [STREAMING] Avoid to restart receivers if scheduleReceivers returns balanced results
Repository: spark Updated Branches: refs/heads/master d9c25dec8 -> f023aa2fc [SPARK-10137] [STREAMING] Avoid to restart receivers if scheduleReceivers returns balanced results This PR fixes the following cases for `ReceiverSchedulingPolicy`. 1) Assume there are 4 executors: host1, host2, host3, host4, and 5 receivers: r1, r2, r3, r4, r5. Then `ReceiverSchedulingPolicy.scheduleReceivers` will return (r1 -> host1, r2 -> host2, r3 -> host3, r4 -> host4, r5 -> host1). Let's assume r1 starts first on `host1` as `scheduleReceivers` suggested, and tries to register with ReceiverTracker. But the previous `ReceiverSchedulingPolicy.rescheduleReceiver` will return (host2, host3, host4) according to the current executor weights (host1 -> 1.0, host2 -> 0.5, host3 -> 0.5, host4 -> 0.5), so ReceiverTracker will reject `r1`. This is unexpected since r1 is starting exactly where `scheduleReceivers` suggested. This case can be fixed by ignoring the information of the receiver that is rescheduling in `receiverTrackingInfoMap`. 2) Assume there are 3 executors (host1, host2, host3), each executor has 3 cores, and there are 3 receivers: r1, r2, r3. Assume r1 is running on host1. Now r2 is restarting; the previous `ReceiverSchedulingPolicy.rescheduleReceiver` will always return (host1, host2, host3). So it's possible that r2 will be scheduled to host1 by TaskScheduler. r3 is similar. Then at last, it's possible that there are 3 receivers running on host1, while host2 and host3 are idle. This issue can be fixed by returning only executors that have the minimum weight rather than returning at least 3 executors. Author: zsxwing zsxw...@gmail.com Closes #8340 from zsxwing/fix-receiver-scheduling. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f023aa2f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f023aa2f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f023aa2f Branch: refs/heads/master Commit: f023aa2fcc1d1dbb82aee568be0a8f2457c309ae Parents: d9c25de Author: zsxwing zsxw...@gmail.com Authored: Mon Aug 24 23:34:50 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Mon Aug 24 23:34:50 2015 -0700 -- .../scheduler/ReceiverSchedulingPolicy.scala| 58 +++--- .../streaming/scheduler/ReceiverTracker.scala | 106 --- .../ReceiverSchedulingPolicySuite.scala | 13 +-- 3 files changed, 120 insertions(+), 57 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f023aa2f/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicy.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicy.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicy.scala index ef5b687..10b5a7f 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicy.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicy.scala @@ -22,6 +22,36 @@ import scala.collection.mutable import org.apache.spark.streaming.receiver.Receiver +/** + * A class that tries to schedule receivers with evenly distributed. There are two phases for + * scheduling receivers. + * + * - The first phase is global scheduling when ReceiverTracker is starting and we need to schedule + * all receivers at the same time. ReceiverTracker will call `scheduleReceivers` at this phase. + * It will try to schedule receivers with evenly distributed.
ReceiverTracker should update its + * receiverTrackingInfoMap according to the results of `scheduleReceivers`. + * `ReceiverTrackingInfo.scheduledExecutors` for each receiver will set to an executor list that + * contains the scheduled locations. Then when a receiver is starting, it will send a register + * request and `ReceiverTracker.registerReceiver` will be called. In + * `ReceiverTracker.registerReceiver`, if a receiver's scheduled executors is set, it should check + * if the location of this receiver is one of the scheduled executors, if not, the register will + * be rejected. + * - The second phase is local scheduling when a receiver is restarting. There are two cases of + * receiver restarting: + * - If a receiver is restarting because it's rejected due to the real location and the scheduled + * executors mismatching, in other words, it fails to start in one of the locations that + * `scheduleReceivers` suggested, `ReceiverTracker` should firstly choose the executors that are + * still alive in the list of scheduled executors, then use them to launch the receiver job. + * - If a receiver is restarting without a scheduled executors list,
spark git commit: [SPARK-9791] [PACKAGE] Change private class to private[package] class to prevent unnecessary classes from showing up in the docs
Repository: spark Updated Branches: refs/heads/branch-1.5 36bc50c8d -> d003373bd [SPARK-9791] [PACKAGE] Change private class to private[package] class to prevent unnecessary classes from showing up in the docs In addition, some random cleanup of import ordering Author: Tathagata Das tathagata.das1...@gmail.com Closes #8387 from tdas/SPARK-9791 and squashes the following commits: 67f3ee9 [Tathagata Das] Change private class to private[package] class to prevent them from showing up in the docs (cherry picked from commit 7478c8b66d6a2b1179f20c38b49e27e37b0caec3) Signed-off-by: Tathagata Das tathagata.das1...@gmail.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d003373b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d003373b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d003373b Branch: refs/heads/branch-1.5 Commit: d003373bd8557ed255125940f736e44f8722e8e3 Parents: 36bc50c Author: Tathagata Das tathagata.das1...@gmail.com Authored: Mon Aug 24 12:40:09 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Mon Aug 24 12:40:23 2015 -0700 -- .../spark/streaming/flume/FlumeUtils.scala | 2 +- .../apache/spark/streaming/kafka/Broker.scala | 6 ++-- .../spark/streaming/kafka/KafkaTestUtils.scala | 10 +++--- .../spark/streaming/kafka/KafkaUtils.scala | 36 +--- .../spark/streaming/kafka/OffsetRange.scala | 8 - .../apache/spark/streaming/mqtt/MQTTUtils.scala | 6 ++-- .../spark/streaming/mqtt/MQTTTestUtils.scala| 2 +- .../streaming/kinesis/KinesisTestUtils.scala| 2 +- .../spark/streaming/util/WriteAheadLog.java | 2 ++ .../util/WriteAheadLogRecordHandle.java | 2 ++ .../receiver/ReceivedBlockHandler.scala | 2 +- .../streaming/scheduler/ReceiverTracker.scala | 2 +- .../apache/spark/streaming/ui/BatchPage.scala | 2 +- 13 files changed, 28 insertions(+), 54 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d003373b/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumeUtils.scala -- diff --git a/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumeUtils.scala b/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumeUtils.scala index 095bfb0..a65a9b9 100644 --- a/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumeUtils.scala +++ b/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumeUtils.scala @@ -247,7 +247,7 @@ object FlumeUtils { * This is a helper class that wraps the methods in FlumeUtils into more Python-friendly class and * function so that it can be easily instantiated and called from Python's FlumeUtils. */ -private class FlumeUtilsPythonHelper { +private[flume] class FlumeUtilsPythonHelper { def createStream( jssc: JavaStreamingContext, http://git-wip-us.apache.org/repos/asf/spark/blob/d003373b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/Broker.scala -- diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/Broker.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/Broker.scala index 5a74feb..9159051 100644 --- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/Broker.scala +++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/Broker.scala @@ -20,11 +20,9 @@ package org.apache.spark.streaming.kafka import org.apache.spark.annotation.Experimental /** - * :: Experimental :: - * Represent the host and port info for a Kafka broker.
- * Differs from the Kafka project's internal kafka.cluster.Broker, which contains a server ID + * Represents the host and port info for a Kafka broker. + * Differs from the Kafka project's internal kafka.cluster.Broker, which contains a server ID. */ -@Experimental final class Broker private( /** Broker's hostname */ val host: String, http://git-wip-us.apache.org/repos/asf/spark/blob/d003373b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaTestUtils.scala -- diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaTestUtils.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaTestUtils.scala index b608b75..79a9db4 100644 --- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaTestUtils.scala +++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaTestUtils.scala @@ -20,9 +20,8 @@ package org.apache.spark.streaming.kafka import java.io.File import java.lang.{Integer => JInt} import java.net.InetSocketAddress -import
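The mechanics of the change are worth spelling out. In Scala, a top-level `private class` is already package-private, but as the commit message notes, such classes were still showing up in the generated API docs; the qualified form has the same effective visibility while letting the doc generation recognise the class as internal and skip it. A before/after sketch with a hypothetical package and class name:

```scala
package org.apache.spark.streaming.example // hypothetical package for illustration

// Before: `private class InternalHelper` leaked into the generated docs
// despite being usable only within this package.

// After: same accessibility, but the explicit package qualifier marks the
// class as internal for the doc tooling.
private[example] class InternalHelper {
  def run(): Unit = println("only callable from org.apache.spark.streaming.example")
}
```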
spark git commit: [SPARK-10168] [STREAMING] Fix the issue that maven publishes wrong artifact jars
Repository: spark Updated Branches: refs/heads/branch-1.5 b40059dbd -> 36bc50c8d [SPARK-10168] [STREAMING] Fix the issue that maven publishes wrong artifact jars This PR removed the `outputFile` configuration from pom.xml and updated `tests.py` to search jars for both sbt build and maven build. I ran ` mvn -Pkinesis-asl -DskipTests clean install` locally, and verified the jars in my local repository were correct. I also checked Python tests for maven build, and it passed all tests. Author: zsxwing zsxw...@gmail.com Closes #8373 from zsxwing/SPARK-10168 and squashes the following commits: e0b5818 [zsxwing] Fix the sbt build c697627 [zsxwing] Add the jar pathes to the exception message be1d8a5 [zsxwing] Fix the issue that maven publishes wrong artifact jars (cherry picked from commit 4e0395ddb764d092b5b38447af49e196e590e0f0) Signed-off-by: Tathagata Das tathagata.das1...@gmail.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/36bc50c8 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/36bc50c8 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/36bc50c8 Branch: refs/heads/branch-1.5 Commit: 36bc50c8d377f3e628f7d608d58a76ea508e9697 Parents: b40059d Author: zsxwing zsxw...@gmail.com Authored: Mon Aug 24 12:38:01 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Mon Aug 24 12:38:10 2015 -0700 -- external/flume-assembly/pom.xml | 1 - external/kafka-assembly/pom.xml | 1 - external/mqtt-assembly/pom.xml | 1 - extras/kinesis-asl-assembly/pom.xml | 1 - python/pyspark/streaming/tests.py | 47 ++-- 5 files changed, 26 insertions(+), 25 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/36bc50c8/external/flume-assembly/pom.xml -- diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index e05e431..561ed4b 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -115,7 +115,6 @@ <artifactId>maven-shade-plugin</artifactId> <configuration> <shadedArtifactAttached>false</shadedArtifactAttached> - <outputFile>${project.build.directory}/scala-${scala.binary.version}/spark-streaming-flume-assembly-${project.version}.jar</outputFile> <artifactSet> <includes> <include>*:*</include> http://git-wip-us.apache.org/repos/asf/spark/blob/36bc50c8/external/kafka-assembly/pom.xml -- diff --git a/external/kafka-assembly/pom.xml b/external/kafka-assembly/pom.xml index 36342f3..6f4e2a8 100644 --- a/external/kafka-assembly/pom.xml +++ b/external/kafka-assembly/pom.xml @@ -142,7 +142,6 @@ <artifactId>maven-shade-plugin</artifactId> <configuration> <shadedArtifactAttached>false</shadedArtifactAttached> - <outputFile>${project.build.directory}/scala-${scala.binary.version}/spark-streaming-kafka-assembly-${project.version}.jar</outputFile> <artifactSet> <includes> <include>*:*</include> http://git-wip-us.apache.org/repos/asf/spark/blob/36bc50c8/external/mqtt-assembly/pom.xml -- diff --git a/external/mqtt-assembly/pom.xml b/external/mqtt-assembly/pom.xml index f3e3f93..8412600 100644 --- a/external/mqtt-assembly/pom.xml +++ b/external/mqtt-assembly/pom.xml @@ -132,7 +132,6 @@ <artifactId>maven-shade-plugin</artifactId> <configuration> <shadedArtifactAttached>false</shadedArtifactAttached> - <outputFile>${project.build.directory}/scala-${scala.binary.version}/spark-streaming-mqtt-assembly-${project.version}.jar</outputFile> <artifactSet> <includes> <include>*:*</include> http://git-wip-us.apache.org/repos/asf/spark/blob/36bc50c8/extras/kinesis-asl-assembly/pom.xml -- diff --git a/extras/kinesis-asl-assembly/pom.xml
b/extras/kinesis-asl-assembly/pom.xml index 3ca5386..51af3e6 100644 --- a/extras/kinesis-asl-assembly/pom.xml +++ b/extras/kinesis-asl-assembly/pom.xml @@ -137,7 +137,6 @@ <artifactId>maven-shade-plugin</artifactId> <configuration> <shadedArtifactAttached>false</shadedArtifactAttached> - <outputFile>${project.build.directory}/scala-${scala.binary.version}/spark-streaming-kinesis-asl-assembly-${project.version}.jar</outputFile> <artifactSet> <includes> <include>*:*</include> http://git-wip-us.apache.org/repos/asf/spark/blob/36bc50c8/python/pyspark/streaming/tests.py -- diff --git a/python/pyspark/streaming/tests.py
spark git commit: [SPARK-9791] [PACKAGE] Change private class to private[package] class to prevent unnecessary classes from showing up in the docs
Repository: spark Updated Branches: refs/heads/master 4e0395ddb -> 7478c8b66 [SPARK-9791] [PACKAGE] Change private class to private[package] class to prevent unnecessary classes from showing up in the docs In addition, some random cleanup of import ordering Author: Tathagata Das tathagata.das1...@gmail.com Closes #8387 from tdas/SPARK-9791 and squashes the following commits: 67f3ee9 [Tathagata Das] Change private class to private[package] class to prevent them from showing up in the docs Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7478c8b6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7478c8b6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7478c8b6 Branch: refs/heads/master Commit: 7478c8b66d6a2b1179f20c38b49e27e37b0caec3 Parents: 4e0395d Author: Tathagata Das tathagata.das1...@gmail.com Authored: Mon Aug 24 12:40:09 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Mon Aug 24 12:40:09 2015 -0700 -- .../spark/streaming/flume/FlumeUtils.scala | 2 +- .../apache/spark/streaming/kafka/Broker.scala | 6 ++-- .../spark/streaming/kafka/KafkaTestUtils.scala | 10 +++--- .../spark/streaming/kafka/KafkaUtils.scala | 36 +--- .../spark/streaming/kafka/OffsetRange.scala | 8 - .../apache/spark/streaming/mqtt/MQTTUtils.scala | 6 ++-- .../spark/streaming/mqtt/MQTTTestUtils.scala| 2 +- .../streaming/kinesis/KinesisTestUtils.scala| 2 +- .../spark/streaming/util/WriteAheadLog.java | 2 ++ .../util/WriteAheadLogRecordHandle.java | 2 ++ .../receiver/ReceivedBlockHandler.scala | 2 +- .../streaming/scheduler/ReceiverTracker.scala | 2 +- .../apache/spark/streaming/ui/BatchPage.scala | 2 +- 13 files changed, 28 insertions(+), 54 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7478c8b6/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumeUtils.scala -- diff --git a/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumeUtils.scala b/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumeUtils.scala index 095bfb0..a65a9b9 100644 --- a/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumeUtils.scala +++ b/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumeUtils.scala @@ -247,7 +247,7 @@ object FlumeUtils { * This is a helper class that wraps the methods in FlumeUtils into more Python-friendly class and * function so that it can be easily instantiated and called from Python's FlumeUtils. */ -private class FlumeUtilsPythonHelper { +private[flume] class FlumeUtilsPythonHelper { def createStream( jssc: JavaStreamingContext, http://git-wip-us.apache.org/repos/asf/spark/blob/7478c8b6/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/Broker.scala -- diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/Broker.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/Broker.scala index 5a74feb..9159051 100644 --- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/Broker.scala +++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/Broker.scala @@ -20,11 +20,9 @@ package org.apache.spark.streaming.kafka import org.apache.spark.annotation.Experimental /** - * :: Experimental :: - * Represent the host and port info for a Kafka broker. - * Differs from the Kafka project's internal kafka.cluster.Broker, which contains a server ID + * Represents the host and port info for a Kafka broker.
+ * Differs from the Kafka project's internal kafka.cluster.Broker, which contains a server ID. */ -@Experimental final class Broker private( /** Broker's hostname */ val host: String, http://git-wip-us.apache.org/repos/asf/spark/blob/7478c8b6/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaTestUtils.scala -- diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaTestUtils.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaTestUtils.scala index b608b75..79a9db4 100644 --- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaTestUtils.scala +++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaTestUtils.scala @@ -20,9 +20,8 @@ package org.apache.spark.streaming.kafka import java.io.File import java.lang.{Integer => JInt} import java.net.InetSocketAddress -import java.util.{Map => JMap} -import java.util.Properties import java.util.concurrent.TimeoutException +import java.util.{Map => JMap
spark git commit: [SPARK-10168] [STREAMING] Fix the issue that maven publishes wrong artifact jars
Repository: spark Updated Branches: refs/heads/master 053d94fcf -> 4e0395ddb [SPARK-10168] [STREAMING] Fix the issue that maven publishes wrong artifact jars This PR removed the `outputFile` configuration from pom.xml and updated `tests.py` to search jars for both sbt build and maven build. I ran ` mvn -Pkinesis-asl -DskipTests clean install` locally, and verified the jars in my local repository were correct. I also checked Python tests for maven build, and it passed all tests. Author: zsxwing zsxw...@gmail.com Closes #8373 from zsxwing/SPARK-10168 and squashes the following commits: e0b5818 [zsxwing] Fix the sbt build c697627 [zsxwing] Add the jar pathes to the exception message be1d8a5 [zsxwing] Fix the issue that maven publishes wrong artifact jars Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4e0395dd Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4e0395dd Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4e0395dd Branch: refs/heads/master Commit: 4e0395ddb764d092b5b38447af49e196e590e0f0 Parents: 053d94f Author: zsxwing zsxw...@gmail.com Authored: Mon Aug 24 12:38:01 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Mon Aug 24 12:38:01 2015 -0700 -- external/flume-assembly/pom.xml | 1 - external/kafka-assembly/pom.xml | 1 - external/mqtt-assembly/pom.xml | 1 - extras/kinesis-asl-assembly/pom.xml | 1 - python/pyspark/streaming/tests.py | 47 ++-- 5 files changed, 26 insertions(+), 25 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4e0395dd/external/flume-assembly/pom.xml -- diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index e05e431..561ed4b 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -115,7 +115,6 @@ <artifactId>maven-shade-plugin</artifactId> <configuration> <shadedArtifactAttached>false</shadedArtifactAttached> - <outputFile>${project.build.directory}/scala-${scala.binary.version}/spark-streaming-flume-assembly-${project.version}.jar</outputFile> <artifactSet> <includes> <include>*:*</include> http://git-wip-us.apache.org/repos/asf/spark/blob/4e0395dd/external/kafka-assembly/pom.xml -- diff --git a/external/kafka-assembly/pom.xml b/external/kafka-assembly/pom.xml index 36342f3..6f4e2a8 100644 --- a/external/kafka-assembly/pom.xml +++ b/external/kafka-assembly/pom.xml @@ -142,7 +142,6 @@ <artifactId>maven-shade-plugin</artifactId> <configuration> <shadedArtifactAttached>false</shadedArtifactAttached> - <outputFile>${project.build.directory}/scala-${scala.binary.version}/spark-streaming-kafka-assembly-${project.version}.jar</outputFile> <artifactSet> <includes> <include>*:*</include> http://git-wip-us.apache.org/repos/asf/spark/blob/4e0395dd/external/mqtt-assembly/pom.xml -- diff --git a/external/mqtt-assembly/pom.xml b/external/mqtt-assembly/pom.xml index f3e3f93..8412600 100644 --- a/external/mqtt-assembly/pom.xml +++ b/external/mqtt-assembly/pom.xml @@ -132,7 +132,6 @@ <artifactId>maven-shade-plugin</artifactId> <configuration> <shadedArtifactAttached>false</shadedArtifactAttached> - <outputFile>${project.build.directory}/scala-${scala.binary.version}/spark-streaming-mqtt-assembly-${project.version}.jar</outputFile> <artifactSet> <includes> <include>*:*</include> http://git-wip-us.apache.org/repos/asf/spark/blob/4e0395dd/extras/kinesis-asl-assembly/pom.xml -- diff --git a/extras/kinesis-asl-assembly/pom.xml b/extras/kinesis-asl-assembly/pom.xml index 3ca5386..51af3e6 100644 --- a/extras/kinesis-asl-assembly/pom.xml +++
b/extras/kinesis-asl-assembly/pom.xml @@ -137,7 +137,6 @@ <artifactId>maven-shade-plugin</artifactId> <configuration> <shadedArtifactAttached>false</shadedArtifactAttached> - <outputFile>${project.build.directory}/scala-${scala.binary.version}/spark-streaming-kinesis-asl-assembly-${project.version}.jar</outputFile> <artifactSet> <includes> <include>*:*</include> http://git-wip-us.apache.org/repos/asf/spark/blob/4e0395dd/python/pyspark/streaming/tests.py -- diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py index 510a4f2..cfea95b 100644 --- a/python/pyspark/streaming/tests.py +++ b/python/pyspark/streaming/tests.py @@
spark git commit: [SPARK-10142] [STREAMING] Made python checkpoint recovery handle non-local checkpoint paths and existing SparkContexts
Repository: spark Updated Branches: refs/heads/branch-1.5 00f812d38 -> b40059dbd [SPARK-10142] [STREAMING] Made python checkpoint recovery handle non-local checkpoint paths and existing SparkContexts The current code only checks checkpoint files in local filesystem, and always tries to create a new Python SparkContext (even if one already exists). The solution is to do the following: 1. Use the same code path as Java to check whether a valid checkpoint exists 2. Create a new Python SparkContext only if there is no active one. There is no test for this path as it's hard to test with distributed filesystem paths in a local unit test. I am going to test it with a distributed file system manually to verify that this patch works. Author: Tathagata Das tathagata.das1...@gmail.com Closes #8366 from tdas/SPARK-10142 and squashes the following commits: 3afa666 [Tathagata Das] Added tests 2dd4ae5 [Tathagata Das] Added the check to not create a context if one already exists 9bf151b [Tathagata Das] Made python checkpoint recovery use java to find the checkpoint files (cherry picked from commit 053d94fcf32268369b5a40837271f15d6af41aa4) Signed-off-by: Tathagata Das tathagata.das1...@gmail.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b40059db Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b40059db Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b40059db Branch: refs/heads/branch-1.5 Commit: b40059dbda4dafbb883a53fbd5c5f69bc01a3e19 Parents: 00f812d Author: Tathagata Das tathagata.das1...@gmail.com Authored: Sun Aug 23 19:24:32 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Sun Aug 23 19:24:42 2015 -0700 -- python/pyspark/streaming/context.py | 22 ++ python/pyspark/streaming/tests.py | 43 .../org/apache/spark/streaming/Checkpoint.scala | 9 3 files changed, 58 insertions(+), 16 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b40059db/python/pyspark/streaming/context.py -- diff --git a/python/pyspark/streaming/context.py b/python/pyspark/streaming/context.py index e3ba70e..4069d7a 100644 --- a/python/pyspark/streaming/context.py +++ b/python/pyspark/streaming/context.py @@ -150,26 +150,30 @@ class StreamingContext(object): @param checkpointPath: Checkpoint directory used in an earlier streaming program @param setupFunc: Function to create a new context and setup DStreams -# TODO: support checkpoint in HDFS -if not os.path.exists(checkpointPath) or not os.listdir(checkpointPath): +cls._ensure_initialized() +gw = SparkContext._gateway + +# Check whether valid checkpoint information exists in the given path +if gw.jvm.CheckpointReader.read(checkpointPath).isEmpty(): ssc = setupFunc() ssc.checkpoint(checkpointPath) return ssc -cls._ensure_initialized() -gw = SparkContext._gateway - try: jssc = gw.jvm.JavaStreamingContext(checkpointPath) except Exception: print("failed to load StreamingContext from checkpoint", file=sys.stderr) raise -jsc = jssc.sparkContext() -conf = SparkConf(_jconf=jsc.getConf()) -sc = SparkContext(conf=conf, gateway=gw, jsc=jsc) +# If there is already an active instance of Python SparkContext use it, or create a new one +if not SparkContext._active_spark_context: +jsc = jssc.sparkContext() +conf = SparkConf(_jconf=jsc.getConf()) +SparkContext(conf=conf, gateway=gw, jsc=jsc) + +sc = SparkContext._active_spark_context + # update ctx in serializer -SparkContext._active_spark_context = sc cls._transformerSerializer.ctx = sc return StreamingContext(sc, None, jssc)
http://git-wip-us.apache.org/repos/asf/spark/blob/b40059db/python/pyspark/streaming/tests.py -- diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py index 214d5be..510a4f2 100644 --- a/python/pyspark/streaming/tests.py +++ b/python/pyspark/streaming/tests.py @@ -603,6 +603,10 @@ class CheckpointTests(unittest.TestCase): def tearDown(self): if self.ssc is not None: self.ssc.stop(True) +if self.sc is not None: +self.sc.stop() +if self.cpd is not None: +shutil.rmtree(self.cpd) def test_get_or_create_and_get_active_or_create(self): inputd = tempfile.mkdtemp() @@ -622,8 +626,12 @@ class CheckpointTests(unittest.TestCase
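For reference, this is the recovery flow the Python fix brings PySpark in line with, shown through the existing Scala API: checkpoint validity is decided by the JVM-side CheckpointReader (so distributed-filesystem paths work), and the setup function runs only when no usable checkpoint exists. The path and app name below are hypothetical.

```scala
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object CheckpointRecoveryExample {
  // Runs only when no valid checkpoint is found at the given path.
  def createContext(): StreamingContext = {
    val conf = new SparkConf().setAppName("CheckpointedApp")
    val ssc = new StreamingContext(conf, Seconds(1))
    ssc.checkpoint("hdfs:///tmp/app-checkpoint")
    // ... define DStreams here ...
    ssc
  }

  def main(args: Array[String]): Unit = {
    // Recover from the checkpoint if it exists, otherwise build a fresh context.
    val ssc = StreamingContext.getOrCreate("hdfs:///tmp/app-checkpoint", createContext _)
    ssc.start()
    ssc.awaitTermination()
  }
}
```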
spark git commit: [SPARK-10122] [PYSPARK] [STREAMING] Fix getOffsetRanges bug in PySpark-Streaming transform function
Repository: spark Updated Branches: refs/heads/branch-1.5 817c38a0a -> 4e72839b7 [SPARK-10122] [PYSPARK] [STREAMING] Fix getOffsetRanges bug in PySpark-Streaming transform function Details of the bug and explanations can be seen in [SPARK-10122](https://issues.apache.org/jira/browse/SPARK-10122). tdas, please help to review. Author: jerryshao ss...@hortonworks.com Closes #8347 from jerryshao/SPARK-10122 and squashes the following commits: 4039b16 [jerryshao] Fix getOffsetRanges in transform() bug Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4e72839b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4e72839b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4e72839b Branch: refs/heads/branch-1.5 Commit: 4e72839b7b1e0b925837b49534a07188a603d838 Parents: 817c38a Author: jerryshao ss...@hortonworks.com Authored: Fri Aug 21 13:10:11 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Fri Aug 21 13:17:48 2015 -0700 -- python/pyspark/streaming/dstream.py | 5 - python/pyspark/streaming/tests.py | 4 +++- 2 files changed, 7 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4e72839b/python/pyspark/streaming/dstream.py -- diff --git a/python/pyspark/streaming/dstream.py b/python/pyspark/streaming/dstream.py index 8dcb964..698336c 100644 --- a/python/pyspark/streaming/dstream.py +++ b/python/pyspark/streaming/dstream.py @@ -610,7 +610,10 @@ class TransformedDStream(DStream): self.is_checkpointed = False self._jdstream_val = None -if (isinstance(prev, TransformedDStream) and +# Using type() to avoid folding the functions and compacting the DStreams which is not +# not strictly a object of TransformedDStream. +# Changed here is to avoid bug in KafkaTransformedDStream when calling offsetRanges(). +if (type(prev) is TransformedDStream and not prev.is_cached and not prev.is_checkpointed): prev_func = prev.func self.func = lambda t, rdd: func(t, prev_func(t, rdd)) http://git-wip-us.apache.org/repos/asf/spark/blob/4e72839b/python/pyspark/streaming/tests.py -- diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py index 6108c84..214d5be 100644 --- a/python/pyspark/streaming/tests.py +++ b/python/pyspark/streaming/tests.py @@ -850,7 +850,9 @@ class KafkaStreamTests(PySparkStreamingTestCase): offsetRanges.append(o) return rdd -stream.transform(transformWithOffsetRanges).foreachRDD(lambda rdd: rdd.count()) +# Test whether it is ok mixing KafkaTransformedDStream and TransformedDStream together, +# only the TransformedDstreams can be folded together. +stream.transform(transformWithOffsetRanges).map(lambda kv: kv[1]).count().pprint() self.ssc.start() self.wait_for(offsetRanges, 1) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
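The heart of the fix is the switch from `isinstance(prev, TransformedDStream)` to `type(prev) is TransformedDStream`: only exact TransformedDStreams may be folded together, never subclasses that carry extra state such as Kafka offset ranges. A hypothetical Scala rendering of the same distinction, purely for illustration:

```scala
// Stand-in classes, not the real PySpark hierarchy.
class Transformed
class KafkaTransformed extends Transformed // carries offset ranges in the real case

object FoldCheckSketch {
  // Exact-class check: a KafkaTransformed is rejected, so its offset
  // information is never folded away.
  def canFoldWith(prev: Transformed): Boolean =
    prev.getClass == classOf[Transformed]

  // By contrast, prev.isInstanceOf[Transformed] would also accept a
  // KafkaTransformed, which is exactly the reported bug.
}
```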
spark git commit: [SPARK-9812] [STREAMING] Fix Python 3 compatibility issue in PySpark Streaming and some docs
Repository: spark Updated Branches: refs/heads/master 2f2686a73 -> 1f29d502e [SPARK-9812] [STREAMING] Fix Python 3 compatibility issue in PySpark Streaming and some docs This PR includes the following fixes: 1. Use `range` instead of `xrange` in `queue_stream.py` to support Python 3. 2. Fix the issue that `utf8_decoder` will return `bytes` rather than `str` when receiving an empty `bytes` in Python 3. 3. Fix the commands in docs so that the user can copy them directly to the command line. The previous commands were broken in the middle of a path, so when copying to the command line, the path would be split into two parts by the extra spaces, which forces the user to fix it manually. Author: zsxwing zsxw...@gmail.com Closes #8315 from zsxwing/SPARK-9812. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1f29d502 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1f29d502 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1f29d502 Branch: refs/heads/master Commit: 1f29d502e7ecd6faa185d70dc714f9ea3922fb6d Parents: 2f2686a Author: zsxwing zsxw...@gmail.com Authored: Wed Aug 19 18:36:01 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Wed Aug 19 18:36:01 2015 -0700 -- examples/src/main/python/streaming/direct_kafka_wordcount.py | 6 +++--- examples/src/main/python/streaming/flume_wordcount.py| 5 +++-- examples/src/main/python/streaming/kafka_wordcount.py| 5 +++-- examples/src/main/python/streaming/mqtt_wordcount.py | 5 +++-- examples/src/main/python/streaming/queue_stream.py | 4 ++-- python/pyspark/streaming/flume.py| 4 +++- python/pyspark/streaming/kafka.py| 4 +++- python/pyspark/streaming/kinesis.py | 4 +++- 8 files changed, 23 insertions(+), 14 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1f29d502/examples/src/main/python/streaming/direct_kafka_wordcount.py -- diff --git a/examples/src/main/python/streaming/direct_kafka_wordcount.py b/examples/src/main/python/streaming/direct_kafka_wordcount.py index 6ef188a..ea20678 100644 --- a/examples/src/main/python/streaming/direct_kafka_wordcount.py +++ b/examples/src/main/python/streaming/direct_kafka_wordcount.py @@ -23,8 +23,8 @@ http://kafka.apache.org/documentation.html#quickstart and then run the example -`$ bin/spark-submit --jars external/kafka-assembly/target/scala-*/\ - spark-streaming-kafka-assembly-*.jar \ +`$ bin/spark-submit --jars \ + external/kafka-assembly/target/scala-*/spark-streaming-kafka-assembly-*.jar \ examples/src/main/python/streaming/direct_kafka_wordcount.py \ localhost:9092 test` @@ -37,7 +37,7 @@ from pyspark.streaming.kafka import KafkaUtils if __name__ == "__main__": if len(sys.argv) != 3: -print >> sys.stderr, "Usage: direct_kafka_wordcount.py <broker_list> <topic>" +print("Usage: direct_kafka_wordcount.py <broker_list> <topic>", file=sys.stderr) exit(-1) sc = SparkContext(appName="PythonStreamingDirectKafkaWordCount") http://git-wip-us.apache.org/repos/asf/spark/blob/1f29d502/examples/src/main/python/streaming/flume_wordcount.py -- diff --git a/examples/src/main/python/streaming/flume_wordcount.py b/examples/src/main/python/streaming/flume_wordcount.py index 091b64d..d75bc6d 100644 --- a/examples/src/main/python/streaming/flume_wordcount.py +++ b/examples/src/main/python/streaming/flume_wordcount.py @@ -23,8 +23,9 @@ https://flume.apache.org/documentation.html and then run the example -`$ bin/spark-submit --jars external/flume-assembly/target/scala-*/\ - spark-streaming-flume-assembly-*.jar
examples/src/main/python/streaming/flume_wordcount.py \ +`$ bin/spark-submit --jars \ + external/flume-assembly/target/scala-*/spark-streaming-flume-assembly-*.jar \ + examples/src/main/python/streaming/flume_wordcount.py \ localhost 12345 from __future__ import print_function http://git-wip-us.apache.org/repos/asf/spark/blob/1f29d502/examples/src/main/python/streaming/kafka_wordcount.py -- diff --git a/examples/src/main/python/streaming/kafka_wordcount.py b/examples/src/main/python/streaming/kafka_wordcount.py index b178e78..8d697f6 100644 --- a/examples/src/main/python/streaming/kafka_wordcount.py +++ b/examples/src/main/python/streaming/kafka_wordcount.py @@ -23,8 +23,9 @@ http://kafka.apache.org/documentation.html#quickstart and then run the example -`$ bin/spark-submit --jars external/kafka-assembly/target/scala-*/\ -
spark git commit: [SPARK-10125] [STREAMING] Fix a potential deadlock in JobGenerator.stop
Repository: spark Updated Branches: refs/heads/master 1f29d502e -> affc8a887 [SPARK-10125] [STREAMING] Fix a potential deadlock in JobGenerator.stop Because `lazy val` uses `this` lock, if JobGenerator.stop and JobGenerator.doCheckpoint (JobGenerator.shouldCheckpoint has not yet been initialized) run at the same time, it may hang. Here are the stack traces for the deadlock: ```Java "pool-1-thread-1-ScalaTest-running-StreamingListenerSuite" #11 prio=5 os_prio=31 tid=0x7fd35d094800 nid=0x5703 in Object.wait() [0x00012ecaf000] java.lang.Thread.State: WAITING (on object monitor) at java.lang.Object.wait(Native Method) at java.lang.Thread.join(Thread.java:1245) - locked <0x0007b5d8d7f8> (a org.apache.spark.util.EventLoop$$anon$1) at java.lang.Thread.join(Thread.java:1319) at org.apache.spark.util.EventLoop.stop(EventLoop.scala:81) at org.apache.spark.streaming.scheduler.JobGenerator.stop(JobGenerator.scala:155) - locked <0x0007b5d8cea0> (a org.apache.spark.streaming.scheduler.JobGenerator) at org.apache.spark.streaming.scheduler.JobScheduler.stop(JobScheduler.scala:95) - locked <0x0007b5d8ced8> (a org.apache.spark.streaming.scheduler.JobScheduler) at org.apache.spark.streaming.StreamingContext.stop(StreamingContext.scala:687) "JobGenerator" #67 daemon prio=5 os_prio=31 tid=0x7fd35c3b9800 nid=0x9f03 waiting for monitor entry [0x000139e4a000] java.lang.Thread.State: BLOCKED (on object monitor) at org.apache.spark.streaming.scheduler.JobGenerator.shouldCheckpoint$lzycompute(JobGenerator.scala:63) - waiting to lock <0x0007b5d8cea0> (a org.apache.spark.streaming.scheduler.JobGenerator) at org.apache.spark.streaming.scheduler.JobGenerator.shouldCheckpoint(JobGenerator.scala:63) at org.apache.spark.streaming.scheduler.JobGenerator.doCheckpoint(JobGenerator.scala:290) at org.apache.spark.streaming.scheduler.JobGenerator.org$apache$spark$streaming$scheduler$JobGenerator$$processEvent(JobGenerator.scala:182) at org.apache.spark.streaming.scheduler.JobGenerator$$anon$1.onReceive(JobGenerator.scala:83) at org.apache.spark.streaming.scheduler.JobGenerator$$anon$1.onReceive(JobGenerator.scala:82) at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48) ``` I can use this patch to produce this deadlock: https://github.com/zsxwing/spark/commit/8a88f28d1331003a65fabef48ae3d22a7c21f05f And a timeout build in Jenkins due to this deadlock: https://amplab.cs.berkeley.edu/jenkins/job/NewSparkPullRequestBuilder/1654/ This PR initializes `checkpointWriter` before `eventLoop` uses it to avoid this deadlock. Author: zsxwing zsxw...@gmail.com Closes #8326 from zsxwing/SPARK-10125.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/affc8a88 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/affc8a88 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/affc8a88 Branch: refs/heads/master Commit: affc8a887ede9fdc2ca6051833954cd10918c869 Parents: 1f29d50 Author: zsxwing zsxw...@gmail.com Authored: Wed Aug 19 19:43:09 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Wed Aug 19 19:43:09 2015 -0700 -- .../org/apache/spark/streaming/scheduler/JobGenerator.scala | 4 1 file changed, 4 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/affc8a88/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobGenerator.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobGenerator.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobGenerator.scala index 9f2117a..2de035d 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobGenerator.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobGenerator.scala @@ -79,6 +79,10 @@ class JobGenerator(jobScheduler: JobScheduler) extends Logging { def start(): Unit = synchronized { if (eventLoop != null) return // generator has already been started +// Call checkpointWriter here to initialize it before eventLoop uses it to avoid a deadlock. +// See SPARK-10125 +checkpointWriter + eventLoop = new EventLoop[JobGeneratorEvent]("JobGenerator") { override protected def onReceive(event: JobGeneratorEvent): Unit = processEvent(event) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
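The deadlock pattern generalizes beyond JobGenerator, so a minimal reproduction is worth having. This sketch uses assumed simplifications, not the real class: a `lazy val` initializer synchronizes on the owning object's monitor, so if stop() holds that monitor while joining a thread that is about to touch the not-yet-initialized lazy val, both sides block forever. Forcing initialization in start(), before the worker thread exists, breaks the cycle, which is what the one-line `checkpointWriter` call above does.

```scala
// Hypothetical stand-in for JobGenerator, trimmed to the locking structure.
class GeneratorSketch {
  lazy val checkpointWriterSketch: String = "initialized" // init takes this's monitor

  @volatile private var worker: Thread = _

  def start(): Unit = synchronized {
    // SPARK-10125-style fix: touch the lazy val eagerly, before any thread can
    // race on its initialization.
    checkpointWriterSketch
    worker = new Thread(new Runnable {
      override def run(): Unit = {
        val w = checkpointWriterSketch // safe: already initialized in start()
        // ... generate jobs, checkpoint with w ...
      }
    })
    worker.start()
  }

  // Holds the same monitor the lazy val initializer needs; without the eager
  // touch in start(), join() here can deadlock against the worker thread.
  def stop(): Unit = synchronized {
    if (worker != null) worker.join()
  }
}
```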
spark git commit: [SPARK-10125] [STREAMING] Fix a potential deadlock in JobGenerator.stop
Repository: spark Updated Branches: refs/heads/branch-1.5 a3ed2c31e -> 63922fa4d [SPARK-10125] [STREAMING] Fix a potential deadlock in JobGenerator.stop Because `lazy val` uses `this` lock, if JobGenerator.stop and JobGenerator.doCheckpoint (JobGenerator.shouldCheckpoint has not yet been initialized) run at the same time, it may hang. Here are the stack traces for the deadlock: ```Java "pool-1-thread-1-ScalaTest-running-StreamingListenerSuite" #11 prio=5 os_prio=31 tid=0x7fd35d094800 nid=0x5703 in Object.wait() [0x00012ecaf000] java.lang.Thread.State: WAITING (on object monitor) at java.lang.Object.wait(Native Method) at java.lang.Thread.join(Thread.java:1245) - locked <0x0007b5d8d7f8> (a org.apache.spark.util.EventLoop$$anon$1) at java.lang.Thread.join(Thread.java:1319) at org.apache.spark.util.EventLoop.stop(EventLoop.scala:81) at org.apache.spark.streaming.scheduler.JobGenerator.stop(JobGenerator.scala:155) - locked <0x0007b5d8cea0> (a org.apache.spark.streaming.scheduler.JobGenerator) at org.apache.spark.streaming.scheduler.JobScheduler.stop(JobScheduler.scala:95) - locked <0x0007b5d8ced8> (a org.apache.spark.streaming.scheduler.JobScheduler) at org.apache.spark.streaming.StreamingContext.stop(StreamingContext.scala:687) "JobGenerator" #67 daemon prio=5 os_prio=31 tid=0x7fd35c3b9800 nid=0x9f03 waiting for monitor entry [0x000139e4a000] java.lang.Thread.State: BLOCKED (on object monitor) at org.apache.spark.streaming.scheduler.JobGenerator.shouldCheckpoint$lzycompute(JobGenerator.scala:63) - waiting to lock <0x0007b5d8cea0> (a org.apache.spark.streaming.scheduler.JobGenerator) at org.apache.spark.streaming.scheduler.JobGenerator.shouldCheckpoint(JobGenerator.scala:63) at org.apache.spark.streaming.scheduler.JobGenerator.doCheckpoint(JobGenerator.scala:290) at org.apache.spark.streaming.scheduler.JobGenerator.org$apache$spark$streaming$scheduler$JobGenerator$$processEvent(JobGenerator.scala:182) at org.apache.spark.streaming.scheduler.JobGenerator$$anon$1.onReceive(JobGenerator.scala:83) at org.apache.spark.streaming.scheduler.JobGenerator$$anon$1.onReceive(JobGenerator.scala:82) at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48) ``` I can use this patch to produce this deadlock: https://github.com/zsxwing/spark/commit/8a88f28d1331003a65fabef48ae3d22a7c21f05f And a timeout build in Jenkins due to this deadlock: https://amplab.cs.berkeley.edu/jenkins/job/NewSparkPullRequestBuilder/1654/ This PR initializes `checkpointWriter` before `eventLoop` uses it to avoid this deadlock. Author: zsxwing zsxw...@gmail.com Closes #8326 from zsxwing/SPARK-10125.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/63922fa4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/63922fa4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/63922fa4 Branch: refs/heads/branch-1.5 Commit: 63922fa4dd5fb4a24e6f8c984b080698ca3b0a26 Parents: a3ed2c3 Author: zsxwing zsxw...@gmail.com Authored: Wed Aug 19 19:43:09 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Wed Aug 19 19:44:33 2015 -0700 -- .../org/apache/spark/streaming/scheduler/JobGenerator.scala | 4 1 file changed, 4 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/63922fa4/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobGenerator.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobGenerator.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobGenerator.scala index 9f2117a..2de035d 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobGenerator.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobGenerator.scala @@ -79,6 +79,10 @@ class JobGenerator(jobScheduler: JobScheduler) extends Logging { def start(): Unit = synchronized { if (eventLoop != null) return // generator has already been started +// Call checkpointWriter here to initialize it before eventLoop uses it to avoid a deadlock. +// See SPARK-10125 +checkpointWriter + eventLoop = new EventLoop[JobGeneratorEvent]("JobGenerator") { override protected def onReceive(event: JobGeneratorEvent): Unit = processEvent(event) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-9812] [STREAMING] Fix Python 3 compatibility issue in PySpark Streaming and some docs
Repository: spark Updated Branches: refs/heads/branch-1.5 321cb99ca - 16414dae0 [SPARK-9812] [STREAMING] Fix Python 3 compatibility issue in PySpark Streaming and some docs This PR includes the following fixes: 1. Use `range` instead of `xrange` in `queue_stream.py` to support Python 3. 2. Fix the issue that `utf8_decoder` will return `bytes` rather than `str` when receiving an empty `bytes` in Python 3. 3. Fix the commands in docs so that the user can copy them directly to the command line. The previous commands was broken in the middle of a path, so when copying to the command line, the path would be split to two parts by the extra spaces, which forces the user to fix it manually. Author: zsxwing zsxw...@gmail.com Closes #8315 from zsxwing/SPARK-9812. (cherry picked from commit 1f29d502e7ecd6faa185d70dc714f9ea3922fb6d) Signed-off-by: Tathagata Das tathagata.das1...@gmail.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/16414dae Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/16414dae Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/16414dae Branch: refs/heads/branch-1.5 Commit: 16414dae03b427506b2a1ebb7d405e6fa3bdad17 Parents: 321cb99 Author: zsxwing zsxw...@gmail.com Authored: Wed Aug 19 18:36:01 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Wed Aug 19 18:36:10 2015 -0700 -- examples/src/main/python/streaming/direct_kafka_wordcount.py | 6 +++--- examples/src/main/python/streaming/flume_wordcount.py| 5 +++-- examples/src/main/python/streaming/kafka_wordcount.py| 5 +++-- examples/src/main/python/streaming/mqtt_wordcount.py | 5 +++-- examples/src/main/python/streaming/queue_stream.py | 4 ++-- python/pyspark/streaming/flume.py| 4 +++- python/pyspark/streaming/kafka.py| 4 +++- python/pyspark/streaming/kinesis.py | 4 +++- 8 files changed, 23 insertions(+), 14 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/16414dae/examples/src/main/python/streaming/direct_kafka_wordcount.py -- diff --git a/examples/src/main/python/streaming/direct_kafka_wordcount.py b/examples/src/main/python/streaming/direct_kafka_wordcount.py index 6ef188a..ea20678 100644 --- a/examples/src/main/python/streaming/direct_kafka_wordcount.py +++ b/examples/src/main/python/streaming/direct_kafka_wordcount.py @@ -23,8 +23,8 @@ http://kafka.apache.org/documentation.html#quickstart and then run the example -`$ bin/spark-submit --jars external/kafka-assembly/target/scala-*/\ - spark-streaming-kafka-assembly-*.jar \ +`$ bin/spark-submit --jars \ + external/kafka-assembly/target/scala-*/spark-streaming-kafka-assembly-*.jar \ examples/src/main/python/streaming/direct_kafka_wordcount.py \ localhost:9092 test` @@ -37,7 +37,7 @@ from pyspark.streaming.kafka import KafkaUtils if __name__ == __main__: if len(sys.argv) != 3: -print sys.stderr, Usage: direct_kafka_wordcount.py broker_list topic +print(Usage: direct_kafka_wordcount.py broker_list topic, file=sys.stderr) exit(-1) sc = SparkContext(appName=PythonStreamingDirectKafkaWordCount) http://git-wip-us.apache.org/repos/asf/spark/blob/16414dae/examples/src/main/python/streaming/flume_wordcount.py -- diff --git a/examples/src/main/python/streaming/flume_wordcount.py b/examples/src/main/python/streaming/flume_wordcount.py index 091b64d..d75bc6d 100644 --- a/examples/src/main/python/streaming/flume_wordcount.py +++ b/examples/src/main/python/streaming/flume_wordcount.py @@ -23,8 +23,9 @@ https://flume.apache.org/documentation.html and then run the example 
-`$ bin/spark-submit --jars external/flume-assembly/target/scala-*/\ - spark-streaming-flume-assembly-*.jar examples/src/main/python/streaming/flume_wordcount.py \ +`$ bin/spark-submit --jars \ + external/flume-assembly/target/scala-*/spark-streaming-flume-assembly-*.jar \ + examples/src/main/python/streaming/flume_wordcount.py \ localhost 12345 from __future__ import print_function http://git-wip-us.apache.org/repos/asf/spark/blob/16414dae/examples/src/main/python/streaming/kafka_wordcount.py -- diff --git a/examples/src/main/python/streaming/kafka_wordcount.py b/examples/src/main/python/streaming/kafka_wordcount.py index b178e78..8d697f6 100644 --- a/examples/src/main/python/streaming/kafka_wordcount.py +++ b/examples/src/main/python/streaming/kafka_wordcount.py @@ -23,8 +23,9 @@
spark git commit: [SPARK-9967] [SPARK-10099] [STREAMING] Renamed conf spark.streaming.backpressure.{enable -> enabled} and fixed deprecated annotations
Repository: spark Updated Branches: refs/heads/branch-1.5 3ceee5572 - 392bd19d6 [SPARK-9967] [SPARK-10099] [STREAMING] Renamed conf spark.streaming.backpressure.{enable--enabled} and fixed deprecated annotations Small changes - Renamed conf spark.streaming.backpressure.{enable -- enabled} - Change Java Deprecated annotations to Scala deprecated annotation with more information. Author: Tathagata Das tathagata.das1...@gmail.com Closes #8299 from tdas/SPARK-9967. (cherry picked from commit bc9a0e03235865d2ec33372f6400dec8c770778a) Signed-off-by: Tathagata Das tathagata.das1...@gmail.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/392bd19d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/392bd19d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/392bd19d Branch: refs/heads/branch-1.5 Commit: 392bd19d678567751cd3844d9d166a7491c5887e Parents: 3ceee55 Author: Tathagata Das tathagata.das1...@gmail.com Authored: Tue Aug 18 23:37:57 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Tue Aug 18 23:38:13 2015 -0700 -- .../main/scala/org/apache/spark/api/java/JavaRDDLike.scala | 2 +- .../main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala | 2 +- .../apache/spark/streaming/api/java/JavaDStreamLike.scala| 4 ++-- .../apache/spark/streaming/scheduler/RateController.scala| 8 4 files changed, 8 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/392bd19d/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala -- diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala b/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala index 829fae1..c582488 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala @@ -354,7 +354,7 @@ trait JavaRDDLike[T, This : JavaRDDLike[T, This]] extends Serializable { * Return an array that contains all of the elements in this RDD. 
* @deprecated As of Spark 1.0.0, toArray() is deprecated, use {@link #collect()} instead */ - @Deprecated + @deprecated(use collect(), 1.0.0) def toArray(): JList[T] = collect() /** http://git-wip-us.apache.org/repos/asf/spark/blob/392bd19d/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala index 7e9dba4..dda4216 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala @@ -76,7 +76,7 @@ class SparkHadoopUtil extends Logging { } } - @Deprecated + @deprecated(use newConfiguration with SparkConf argument, 1.2.0) def newConfiguration(): Configuration = newConfiguration(null) /** http://git-wip-us.apache.org/repos/asf/spark/blob/392bd19d/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaDStreamLike.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaDStreamLike.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaDStreamLike.scala index 808dcc1..214cd80 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaDStreamLike.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaDStreamLike.scala @@ -291,7 +291,7 @@ trait JavaDStreamLike[T, This : JavaDStreamLike[T, This, R], R : JavaRDDLike[T * * @deprecated As of release 0.9.0, replaced by foreachRDD */ - @Deprecated + @deprecated(Use foreachRDD, 0.9.0) def foreach(foreachFunc: JFunction[R, Void]) { foreachRDD(foreachFunc) } @@ -302,7 +302,7 @@ trait JavaDStreamLike[T, This : JavaDStreamLike[T, This, R], R : JavaRDDLike[T * * @deprecated As of release 0.9.0, replaced by foreachRDD */ - @Deprecated + @deprecated(Use foreachRDD, 0.9.0) def foreach(foreachFunc: JFunction2[R, Time, Void]) { foreachRDD(foreachFunc) } http://git-wip-us.apache.org/repos/asf/spark/blob/392bd19d/streaming/src/main/scala/org/apache/spark/streaming/scheduler/RateController.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/RateController.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/RateController.scala index 882ca06..a46c0c1 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler
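For reference, a small illustrative sketch (made-up names, not Spark code) of what the Scala annotation buys over Java's bare @Deprecated: it carries both a message and a since-version that the compiler prints at the call site:

```scala
object DeprecationSketch {
  @deprecated("use collect() instead", "1.0.0")
  def toArray(xs: List[Int]): Array[Int] = xs.toArray

  def collect(xs: List[Int]): Array[Int] = xs.toArray

  def main(args: Array[String]): Unit = {
    // Compiling this call site emits a warning along the lines of:
    //   method toArray ... is deprecated (since 1.0.0): use collect() instead
    println(toArray(List(1, 2, 3)).mkString(","))
  }
}
```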
spark git commit: [SPARK-10128] [STREAMING] Used correct classloader to deserialize WAL data
Repository: spark Updated Branches: refs/heads/master 73431d8af - b762f9920 [SPARK-10128] [STREAMING] Used correct classloader to deserialize WAL data Recovering Kinesis sequence numbers from WAL leads to classnotfoundexception because the ObjectInputStream does not use the correct classloader and the SequenceNumberRanges class (in streaming-kinesis-asl package) cannot be found (added through spark-submit) while deserializing. The solution is to use `Thread.currentThread().getContextClassLoader` while deserializing. Author: Tathagata Das tathagata.das1...@gmail.com Closes #8328 from tdas/SPARK-10128 and squashes the following commits: f19b1c2 [Tathagata Das] Used correct classloader to deserialize WAL data Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b762f992 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b762f992 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b762f992 Branch: refs/heads/master Commit: b762f9920f7587d3c08493c49dd2fede62110b88 Parents: 73431d8 Author: Tathagata Das tathagata.das1...@gmail.com Authored: Wed Aug 19 21:15:58 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Wed Aug 19 21:15:58 2015 -0700 -- .../apache/spark/streaming/scheduler/ReceivedBlockTracker.scala | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b762f992/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala index 7720259..53b96d5 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala @@ -28,7 +28,7 @@ import org.apache.hadoop.fs.Path import org.apache.spark.streaming.Time import org.apache.spark.streaming.util.{WriteAheadLog, WriteAheadLogUtils} import org.apache.spark.util.{Clock, Utils} -import org.apache.spark.{Logging, SparkConf, SparkException} +import org.apache.spark.{Logging, SparkConf} /** Trait representing any event in the ReceivedBlockTracker that updates its state. */ private[streaming] sealed trait ReceivedBlockTrackerLogEvent @@ -199,7 +199,8 @@ private[streaming] class ReceivedBlockTracker( import scala.collection.JavaConversions._ writeAheadLog.readAll().foreach { byteBuffer = logTrace(Recovering record + byteBuffer) -Utils.deserialize[ReceivedBlockTrackerLogEvent](byteBuffer.array) match { +Utils.deserialize[ReceivedBlockTrackerLogEvent]( + byteBuffer.array, Thread.currentThread().getContextClassLoader) match { case BlockAdditionEvent(receivedBlockInfo) = insertAddedBlock(receivedBlockInfo) case BatchAllocationEvent(time, allocatedBlocks) = - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
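The underlying technique is plain Java serialization with an explicit classloader. A generic, self-contained sketch of the idea (not Spark's Utils.deserialize itself, though that overload follows the same pattern):

```scala
import java.io._

object ClassLoaderDeserializeSketch {
  def deserialize[T](bytes: Array[Byte], loader: ClassLoader): T = {
    val ois = new ObjectInputStream(new ByteArrayInputStream(bytes)) {
      // By default resolveClass consults the classloader that loaded this
      // class; override it to use the caller-supplied loader, e.g. the
      // context classloader that can see spark-submit-added jars.
      override def resolveClass(desc: ObjectStreamClass): Class[_] =
        Class.forName(desc.getName, false, loader)
    }
    try ois.readObject().asInstanceOf[T] finally ois.close()
  }

  def serialize(o: AnyRef): Array[Byte] = {
    val bos = new ByteArrayOutputStream()
    val oos = new ObjectOutputStream(bos)
    oos.writeObject(o)
    oos.close()
    bos.toByteArray
  }

  def main(args: Array[String]): Unit = {
    val bytes = serialize("hello")
    val s = deserialize[String](bytes, Thread.currentThread().getContextClassLoader)
    println(s)
  }
}
```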
spark git commit: [SPARK-10128] [STREAMING] Used correct classloader to deserialize WAL data
Repository: spark Updated Branches: refs/heads/branch-1.5 63922fa4d - 71aa54755 [SPARK-10128] [STREAMING] Used correct classloader to deserialize WAL data Recovering Kinesis sequence numbers from WAL leads to classnotfoundexception because the ObjectInputStream does not use the correct classloader and the SequenceNumberRanges class (in streaming-kinesis-asl package) cannot be found (added through spark-submit) while deserializing. The solution is to use `Thread.currentThread().getContextClassLoader` while deserializing. Author: Tathagata Das tathagata.das1...@gmail.com Closes #8328 from tdas/SPARK-10128 and squashes the following commits: f19b1c2 [Tathagata Das] Used correct classloader to deserialize WAL data (cherry picked from commit b762f9920f7587d3c08493c49dd2fede62110b88) Signed-off-by: Tathagata Das tathagata.das1...@gmail.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/71aa5475 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/71aa5475 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/71aa5475 Branch: refs/heads/branch-1.5 Commit: 71aa5475597f4220e2bab6b42caf9b98f248ac99 Parents: 63922fa Author: Tathagata Das tathagata.das1...@gmail.com Authored: Wed Aug 19 21:15:58 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Wed Aug 19 21:16:17 2015 -0700 -- .../apache/spark/streaming/scheduler/ReceivedBlockTracker.scala | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/71aa5475/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala index 7720259..53b96d5 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala @@ -28,7 +28,7 @@ import org.apache.hadoop.fs.Path import org.apache.spark.streaming.Time import org.apache.spark.streaming.util.{WriteAheadLog, WriteAheadLogUtils} import org.apache.spark.util.{Clock, Utils} -import org.apache.spark.{Logging, SparkConf, SparkException} +import org.apache.spark.{Logging, SparkConf} /** Trait representing any event in the ReceivedBlockTracker that updates its state. */ private[streaming] sealed trait ReceivedBlockTrackerLogEvent @@ -199,7 +199,8 @@ private[streaming] class ReceivedBlockTracker( import scala.collection.JavaConversions._ writeAheadLog.readAll().foreach { byteBuffer = logTrace(Recovering record + byteBuffer) -Utils.deserialize[ReceivedBlockTrackerLogEvent](byteBuffer.array) match { +Utils.deserialize[ReceivedBlockTrackerLogEvent]( + byteBuffer.array, Thread.currentThread().getContextClassLoader) match { case BlockAdditionEvent(receivedBlockInfo) = insertAddedBlock(receivedBlockInfo) case BatchAllocationEvent(time, allocatedBlocks) = - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-10072] [STREAMING] BlockGenerator can deadlock when the queue of generated blocks fills up to capacity
Repository: spark Updated Branches: refs/heads/master b4b35f133 -> 1aeae05bb [SPARK-10072] [STREAMING] BlockGenerator can deadlock when the queue of generated blocks fills up to capacity Generated blocks are inserted into an ArrayBlockingQueue, and another thread pulls blocks from the ArrayBlockingQueue and pushes them into the BlockManager. Now if that queue fills up to capacity (default is 10 blocks), the insertion into the queue (done in the function updateCurrentBuffer) gets blocked inside a synchronized block. However, the thread that is pulling blocks from the queue uses the same lock to check the current state (active or stopped) while pulling. Since the block-generating thread is blocked on the lock (as the queue is full), the thread that is supposed to drain the queue gets blocked as well. Ergo, deadlock. Solution: Moved the blocking call to the ArrayBlockingQueue outside the synchronized block to prevent the deadlock. Author: Tathagata Das tathagata.das1...@gmail.com Closes #8257 from tdas/SPARK-10072. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1aeae05b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1aeae05b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1aeae05b Branch: refs/heads/master Commit: 1aeae05bb20f01ab7ccaa62fe905a63e020074b5 Parents: b4b35f1 Author: Tathagata Das tathagata.das1...@gmail.com Authored: Tue Aug 18 19:26:38 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Tue Aug 18 19:26:38 2015 -0700 -- .../streaming/receiver/BlockGenerator.scala | 29 +--- 1 file changed, 19 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1aeae05b/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala b/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala index 300e820..421d60a 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala @@ -227,16 +227,21 @@ private[streaming] class BlockGenerator( def isStopped(): Boolean = state == StoppedAll /** Change the buffer to which single records are added to.
*/ - private def updateCurrentBuffer(time: Long): Unit = synchronized { + private def updateCurrentBuffer(time: Long): Unit = { try { - val newBlockBuffer = currentBuffer - currentBuffer = new ArrayBuffer[Any] - if (newBlockBuffer.size 0) { -val blockId = StreamBlockId(receiverId, time - blockIntervalMs) -val newBlock = new Block(blockId, newBlockBuffer) -listener.onGenerateBlock(blockId) + var newBlock: Block = null + synchronized { +if (currentBuffer.nonEmpty) { + val newBlockBuffer = currentBuffer + currentBuffer = new ArrayBuffer[Any] + val blockId = StreamBlockId(receiverId, time - blockIntervalMs) + listener.onGenerateBlock(blockId) + newBlock = new Block(blockId, newBlockBuffer) +} + } + + if (newBlock != null) { blocksForPushing.put(newBlock) // put is blocking when queue is full -logDebug(Last element in + blockId + is + newBlockBuffer.last) } } catch { case ie: InterruptedException = @@ -250,9 +255,13 @@ private[streaming] class BlockGenerator( private def keepPushingBlocks() { logInfo(Started block pushing thread) -def isGeneratingBlocks = synchronized { state == Active || state == StoppedAddingData } +def areBlocksBeingGenerated: Boolean = synchronized { + state != StoppedGeneratingBlocks +} + try { - while (isGeneratingBlocks) { + // While blocks are being generated, keep polling for to-be-pushed blocks and push them. + while (areBlocksBeingGenerated) { Option(blocksForPushing.poll(10, TimeUnit.MILLISECONDS)) match { case Some(block) = pushBlock(block) case None = - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
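The shape of the fix, reduced to a self-contained sketch (illustrative names, not the real BlockGenerator): swap the buffer under the lock, but perform the potentially blocking put with no lock held, so the draining thread can always acquire the same lock:

```scala
import java.util.concurrent.{ArrayBlockingQueue, TimeUnit}
import scala.collection.mutable.ArrayBuffer

object ProducerConsumerSketch {
  private val lock = new Object
  private var buffer = new ArrayBuffer[Int]
  private val queue = new ArrayBlockingQueue[ArrayBuffer[Int]](10)
  @volatile private var active = true

  def rollBuffer(): Unit = {
    // Only the cheap buffer swap happens under the lock.
    val block = lock.synchronized {
      if (buffer.nonEmpty) { val b = buffer; buffer = new ArrayBuffer[Int]; b }
      else null
    }
    // The blocking call happens with no lock held: even if the queue is
    // full, the consumer below can still enter lock.synchronized and drain.
    if (block != null) queue.put(block)
  }

  def consume(): Unit = {
    // The consumer checks state under the same lock the producer uses.
    while (lock.synchronized(active)) {
      Option(queue.poll(10, TimeUnit.MILLISECONDS)).foreach(b => println(b.sum))
    }
  }

  def main(args: Array[String]): Unit = {
    val c = new Thread(() => consume())
    c.start()
    (1 to 25).foreach { i =>
      lock.synchronized { buffer += i }
      if (i % 5 == 0) rollBuffer()
    }
    lock.synchronized { active = false }
    c.join()
  }
}
```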
spark git commit: [SPARK-10072] [STREAMING] BlockGenerator can deadlock when the queue of generated blocks fills up to capacity
Repository: spark Updated Branches: refs/heads/branch-1.5 0a1385e31 -> 08c5962a2 [SPARK-10072] [STREAMING] BlockGenerator can deadlock when the queue of generated blocks fills up to capacity Generated blocks are inserted into an ArrayBlockingQueue, and another thread pulls blocks from the ArrayBlockingQueue and pushes them into the BlockManager. Now if that queue fills up to capacity (default is 10 blocks), the insertion into the queue (done in the function updateCurrentBuffer) gets blocked inside a synchronized block. However, the thread that is pulling blocks from the queue uses the same lock to check the current state (active or stopped) while pulling. Since the block-generating thread is blocked on the lock (as the queue is full), the thread that is supposed to drain the queue gets blocked as well. Ergo, deadlock. Solution: Moved the blocking call to the ArrayBlockingQueue outside the synchronized block to prevent the deadlock. Author: Tathagata Das tathagata.das1...@gmail.com Closes #8257 from tdas/SPARK-10072. (cherry picked from commit 1aeae05bb20f01ab7ccaa62fe905a63e020074b5) Signed-off-by: Tathagata Das tathagata.das1...@gmail.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/08c5962a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/08c5962a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/08c5962a Branch: refs/heads/branch-1.5 Commit: 08c5962a251555e7d34460135ab6c32cce584704 Parents: 0a1385e Author: Tathagata Das tathagata.das1...@gmail.com Authored: Tue Aug 18 19:26:38 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Tue Aug 18 19:26:51 2015 -0700 -- .../streaming/receiver/BlockGenerator.scala | 29 +--- 1 file changed, 19 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/08c5962a/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala b/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala index 300e820..421d60a 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala @@ -227,16 +227,21 @@ private[streaming] class BlockGenerator( def isStopped(): Boolean = state == StoppedAll /** Change the buffer to which single records are added to.
*/ - private def updateCurrentBuffer(time: Long): Unit = synchronized { + private def updateCurrentBuffer(time: Long): Unit = { try { - val newBlockBuffer = currentBuffer - currentBuffer = new ArrayBuffer[Any] - if (newBlockBuffer.size 0) { -val blockId = StreamBlockId(receiverId, time - blockIntervalMs) -val newBlock = new Block(blockId, newBlockBuffer) -listener.onGenerateBlock(blockId) + var newBlock: Block = null + synchronized { +if (currentBuffer.nonEmpty) { + val newBlockBuffer = currentBuffer + currentBuffer = new ArrayBuffer[Any] + val blockId = StreamBlockId(receiverId, time - blockIntervalMs) + listener.onGenerateBlock(blockId) + newBlock = new Block(blockId, newBlockBuffer) +} + } + + if (newBlock != null) { blocksForPushing.put(newBlock) // put is blocking when queue is full -logDebug(Last element in + blockId + is + newBlockBuffer.last) } } catch { case ie: InterruptedException = @@ -250,9 +255,13 @@ private[streaming] class BlockGenerator( private def keepPushingBlocks() { logInfo(Started block pushing thread) -def isGeneratingBlocks = synchronized { state == Active || state == StoppedAddingData } +def areBlocksBeingGenerated: Boolean = synchronized { + state != StoppedGeneratingBlocks +} + try { - while (isGeneratingBlocks) { + // While blocks are being generated, keep polling for to-be-pushed blocks and push them. + while (areBlocksBeingGenerated) { Option(blocksForPushing.poll(10, TimeUnit.MILLISECONDS)) match { case Some(block) = pushBlock(block) case None = - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-10102] [STREAMING] Fix a race condition where startReceiver may run before trackerState is set to Started
Repository: spark Updated Branches: refs/heads/branch-1.5 08c5962a2 - a6f8979c8 [SPARK-10102] [STREAMING] Fix a race condition that startReceiver may happen before setting trackerState to Started Test failure: https://amplab.cs.berkeley.edu/jenkins/job/Spark-Master-Maven-with-YARN/HADOOP_PROFILE=hadoop-2.4,label=spark-test/3305/testReport/junit/org.apache.spark.streaming/StreamingContextSuite/stop_gracefully/ There is a race condition that setting `trackerState` to `Started` could happen after calling `startReceiver`. Then `startReceiver` won't start the receivers because it uses `! isTrackerStarted` to check if ReceiverTracker is stopping or stopped. But actually, `trackerState` is `Initialized` and will be changed to `Started` soon. Therefore, we should use `isTrackerStopping || isTrackerStopped`. Author: zsxwing zsxw...@gmail.com Closes #8294 from zsxwing/SPARK-9504. (cherry picked from commit 90273eff9604439a5a5853077e232d34555c67d7) Signed-off-by: Tathagata Das tathagata.das1...@gmail.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a6f8979c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a6f8979c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a6f8979c Branch: refs/heads/branch-1.5 Commit: a6f8979c81c5355759f74e8b3c9eb3cafb6a9c7f Parents: 08c5962 Author: zsxwing zsxw...@gmail.com Authored: Tue Aug 18 20:15:54 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Tue Aug 18 20:16:18 2015 -0700 -- .../spark/streaming/scheduler/ReceiverTracker.scala | 11 --- 1 file changed, 8 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a6f8979c/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala index e076fb5..aae3acf 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala @@ -468,8 +468,13 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false * Start a receiver along with its scheduled executors */ private def startReceiver(receiver: Receiver[_], scheduledExecutors: Seq[String]): Unit = { + def shouldStartReceiver: Boolean = { +// It's okay to start when trackerState is Initialized or Started +!(isTrackerStopping || isTrackerStopped) + } + val receiverId = receiver.streamId - if (!isTrackerStarted) { + if (!shouldStartReceiver) { onReceiverJobFinish(receiverId) return } @@ -494,14 +499,14 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false // We will keep restarting the receiver job until ReceiverTracker is stopped future.onComplete { case Success(_) = - if (!isTrackerStarted) { + if (!shouldStartReceiver) { onReceiverJobFinish(receiverId) } else { logInfo(sRestarting Receiver $receiverId) self.send(RestartReceiver(receiver)) } case Failure(e) = - if (!isTrackerStarted) { + if (!shouldStartReceiver) { onReceiverJobFinish(receiverId) } else { logError(Receiver has been stopped. Try to restart it., e) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-10102] [STREAMING] Fix a race condition where startReceiver may run before trackerState is set to Started
Repository: spark Updated Branches: refs/heads/master 1aeae05bb - 90273eff9 [SPARK-10102] [STREAMING] Fix a race condition that startReceiver may happen before setting trackerState to Started Test failure: https://amplab.cs.berkeley.edu/jenkins/job/Spark-Master-Maven-with-YARN/HADOOP_PROFILE=hadoop-2.4,label=spark-test/3305/testReport/junit/org.apache.spark.streaming/StreamingContextSuite/stop_gracefully/ There is a race condition that setting `trackerState` to `Started` could happen after calling `startReceiver`. Then `startReceiver` won't start the receivers because it uses `! isTrackerStarted` to check if ReceiverTracker is stopping or stopped. But actually, `trackerState` is `Initialized` and will be changed to `Started` soon. Therefore, we should use `isTrackerStopping || isTrackerStopped`. Author: zsxwing zsxw...@gmail.com Closes #8294 from zsxwing/SPARK-9504. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/90273eff Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/90273eff Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/90273eff Branch: refs/heads/master Commit: 90273eff9604439a5a5853077e232d34555c67d7 Parents: 1aeae05 Author: zsxwing zsxw...@gmail.com Authored: Tue Aug 18 20:15:54 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Tue Aug 18 20:15:54 2015 -0700 -- .../spark/streaming/scheduler/ReceiverTracker.scala | 11 --- 1 file changed, 8 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/90273eff/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala index e076fb5..aae3acf 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala @@ -468,8 +468,13 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false * Start a receiver along with its scheduled executors */ private def startReceiver(receiver: Receiver[_], scheduledExecutors: Seq[String]): Unit = { + def shouldStartReceiver: Boolean = { +// It's okay to start when trackerState is Initialized or Started +!(isTrackerStopping || isTrackerStopped) + } + val receiverId = receiver.streamId - if (!isTrackerStarted) { + if (!shouldStartReceiver) { onReceiverJobFinish(receiverId) return } @@ -494,14 +499,14 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false // We will keep restarting the receiver job until ReceiverTracker is stopped future.onComplete { case Success(_) = - if (!isTrackerStarted) { + if (!shouldStartReceiver) { onReceiverJobFinish(receiverId) } else { logInfo(sRestarting Receiver $receiverId) self.send(RestartReceiver(receiver)) } case Failure(e) = - if (!isTrackerStarted) { + if (!shouldStartReceiver) { onReceiverJobFinish(receiverId) } else { logError(Receiver has been stopped. Try to restart it., e) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
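The guard itself is simple to state in isolation. A toy sketch (made-up state machine, not the real ReceiverTracker) contrasting the buggy and fixed checks:

```scala
object TrackerStateSketch {
  sealed trait State
  case object Initialized extends State
  case object Started extends State
  case object Stopping extends State
  case object Stopped extends State

  @volatile private var state: State = Initialized

  // Buggy guard: false while state is still Initialized, so a receiver
  // scheduled just before the tracker finishes starting is never launched.
  def isTrackerStarted: Boolean = state == Started

  // Fixed guard: launching is fine in Initialized or Started; only reject
  // once the tracker is actually shutting down.
  def shouldStartReceiver: Boolean = !(state == Stopping || state == Stopped)

  def main(args: Array[String]): Unit = {
    println(s"Initialized: started=$isTrackerStarted shouldStart=$shouldStartReceiver")
    state = Started
    println(s"Started:     started=$isTrackerStarted shouldStart=$shouldStartReceiver")
    state = Stopping
    println(s"Stopping:    started=$isTrackerStarted shouldStart=$shouldStartReceiver")
  }
}
```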
spark git commit: [SPARK-9574] [STREAMING] Remove unnecessary contents of spark-streaming-XXX-assembly jars
Repository: spark Updated Branches: refs/heads/master 8bae9015b - bf1d6614d [SPARK-9574] [STREAMING] Remove unnecessary contents of spark-streaming-XXX-assembly jars Removed contents already included in Spark assembly jar from spark-streaming-XXX-assembly jars. Author: zsxwing zsxw...@gmail.com Closes #8069 from zsxwing/SPARK-9574. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bf1d6614 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bf1d6614 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bf1d6614 Branch: refs/heads/master Commit: bf1d6614dcb8f5974e62e406d9c0f8aac52556d3 Parents: 8bae901 Author: zsxwing zsxw...@gmail.com Authored: Tue Aug 18 13:35:45 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Tue Aug 18 13:35:45 2015 -0700 -- external/flume-assembly/pom.xml | 11 + external/kafka-assembly/pom.xml | 84 external/mqtt-assembly/pom.xml | 74 extras/kinesis-asl-assembly/pom.xml | 79 ++ pom.xml | 2 +- 5 files changed, 249 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/bf1d6614/external/flume-assembly/pom.xml -- diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index 1318959..e05e431 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -69,6 +69,11 @@ scopeprovided/scope /dependency dependency + groupIdcommons-lang/groupId + artifactIdcommons-lang/artifactId + scopeprovided/scope +/dependency +dependency groupIdcommons-net/groupId artifactIdcommons-net/artifactId scopeprovided/scope @@ -89,6 +94,12 @@ scopeprovided/scope /dependency dependency + groupIdorg.apache.avro/groupId + artifactIdavro-mapred/artifactId + classifier${avro.mapred.classifier}/classifier + scopeprovided/scope +/dependency +dependency groupIdorg.scala-lang/groupId artifactIdscala-library/artifactId scopeprovided/scope http://git-wip-us.apache.org/repos/asf/spark/blob/bf1d6614/external/kafka-assembly/pom.xml -- diff --git a/external/kafka-assembly/pom.xml b/external/kafka-assembly/pom.xml index 977514f..36342f3 100644 --- a/external/kafka-assembly/pom.xml +++ b/external/kafka-assembly/pom.xml @@ -47,6 +47,90 @@ version${project.version}/version scopeprovided/scope /dependency +!-- + Demote already included in the Spark assembly. 
+-- +dependency + groupIdcommons-codec/groupId + artifactIdcommons-codec/artifactId + scopeprovided/scope +/dependency +dependency + groupIdcommons-lang/groupId + artifactIdcommons-lang/artifactId + scopeprovided/scope +/dependency +dependency + groupIdcom.google.protobuf/groupId + artifactIdprotobuf-java/artifactId + scopeprovided/scope +/dependency +dependency + groupIdcom.sun.jersey/groupId + artifactIdjersey-server/artifactId + scopeprovided/scope +/dependency +dependency + groupIdcom.sun.jersey/groupId + artifactIdjersey-core/artifactId + scopeprovided/scope +/dependency +dependency + groupIdnet.jpountz.lz4/groupId + artifactIdlz4/artifactId + scopeprovided/scope +/dependency +dependency + groupIdorg.apache.hadoop/groupId + artifactIdhadoop-client/artifactId + scopeprovided/scope +/dependency +dependency + groupIdorg.apache.avro/groupId + artifactIdavro-mapred/artifactId + classifier${avro.mapred.classifier}/classifier + scopeprovided/scope +/dependency +dependency + groupIdorg.apache.curator/groupId + artifactIdcurator-recipes/artifactId + scopeprovided/scope +/dependency +dependency + groupIdorg.apache.zookeeper/groupId + artifactIdzookeeper/artifactId + scopeprovided/scope +/dependency +dependency + groupIdlog4j/groupId + artifactIdlog4j/artifactId + scopeprovided/scope +/dependency +dependency + groupIdnet.java.dev.jets3t/groupId + artifactIdjets3t/artifactId + scopeprovided/scope +/dependency +dependency + groupIdorg.scala-lang/groupId + artifactIdscala-library/artifactId + scopeprovided/scope +/dependency +dependency + groupIdorg.slf4j/groupId + artifactIdslf4j-api/artifactId + scopeprovided/scope +/dependency +dependency + groupIdorg.slf4j/groupId + artifactIdslf4j-log4j12/artifactId
spark git commit: [SPARK-10098] [STREAMING] [TEST] Cleanup active context after test in FailureSuite
Repository: spark Updated Branches: refs/heads/master c635a16f6 - 9108eff74 [SPARK-10098] [STREAMING] [TEST] Cleanup active context after test in FailureSuite Failures in streaming.FailureSuite can leak StreamingContext and SparkContext which fails all subsequent tests Author: Tathagata Das tathagata.das1...@gmail.com Closes #8289 from tdas/SPARK-10098. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9108eff7 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9108eff7 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9108eff7 Branch: refs/heads/master Commit: 9108eff74a2815986fd067b273c2a344b6315405 Parents: c635a16 Author: Tathagata Das tathagata.das1...@gmail.com Authored: Tue Aug 18 17:00:13 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Tue Aug 18 17:00:13 2015 -0700 -- .../apache/spark/streaming/FailureSuite.scala | 27 1 file changed, 17 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9108eff7/streaming/src/test/scala/org/apache/spark/streaming/FailureSuite.scala -- diff --git a/streaming/src/test/scala/org/apache/spark/streaming/FailureSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/FailureSuite.scala index 0c4c065..e82c2fa 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/FailureSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/FailureSuite.scala @@ -17,25 +17,32 @@ package org.apache.spark.streaming -import org.apache.spark.Logging +import java.io.File + +import org.scalatest.BeforeAndAfter + +import org.apache.spark.{SparkFunSuite, Logging} import org.apache.spark.util.Utils /** * This testsuite tests master failures at random times while the stream is running using * the real clock. */ -class FailureSuite extends TestSuiteBase with Logging { +class FailureSuite extends SparkFunSuite with BeforeAndAfter with Logging { - val directory = Utils.createTempDir() - val numBatches = 30 + private val batchDuration: Duration = Milliseconds(1000) + private val numBatches = 30 + private var directory: File = null - override def batchDuration: Duration = Milliseconds(1000) - - override def useManualClock: Boolean = false + before { +directory = Utils.createTempDir() + } - override def afterFunction() { -Utils.deleteRecursively(directory) -super.afterFunction() + after { +if (directory != null) { + Utils.deleteRecursively(directory) +} +StreamingContext.getActive().foreach { _.stop() } } test(multiple failures with map) { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
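The cleanup pattern generalizes to any suite that allocates per-test resources. A minimal sketch (assumes ScalaTest 3.1+, where FunSuite lives at org.scalatest.funsuite.AnyFunSuite; createTempDirectory and the recursive delete stand in for Spark's Utils helpers):

```scala
import java.io.File

import org.scalatest.BeforeAndAfter
import org.scalatest.funsuite.AnyFunSuite

class CleanupPatternSuite extends AnyFunSuite with BeforeAndAfter {
  private var directory: File = null

  // Allocate a fresh resource per test rather than per suite.
  before {
    directory = java.nio.file.Files.createTempDirectory("failure-test").toFile
  }

  // Always clean up, even when a test fails, so later tests see no leaks.
  after {
    if (directory != null) deleteRecursively(directory)
    // In the real suite the equivalent step is:
    // StreamingContext.getActive().foreach(_.stop())
  }

  private def deleteRecursively(f: File): Unit = {
    Option(f.listFiles()).foreach(_.foreach(deleteRecursively))
    f.delete()
  }

  test("temp dir exists during the test") {
    assert(directory.exists())
  }
}
```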
spark git commit: [SPARK-10098] [STREAMING] [TEST] Cleanup active context after test in FailureSuite
Repository: spark Updated Branches: refs/heads/branch-1.5 fb207b245 - e1b50c7d2 [SPARK-10098] [STREAMING] [TEST] Cleanup active context after test in FailureSuite Failures in streaming.FailureSuite can leak StreamingContext and SparkContext which fails all subsequent tests Author: Tathagata Das tathagata.das1...@gmail.com Closes #8289 from tdas/SPARK-10098. (cherry picked from commit 9108eff74a2815986fd067b273c2a344b6315405) Signed-off-by: Tathagata Das tathagata.das1...@gmail.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e1b50c7d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e1b50c7d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e1b50c7d Branch: refs/heads/branch-1.5 Commit: e1b50c7d2a604f785e5fe9af5d60c426a6ff01c2 Parents: fb207b2 Author: Tathagata Das tathagata.das1...@gmail.com Authored: Tue Aug 18 17:00:13 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Tue Aug 18 17:00:21 2015 -0700 -- .../apache/spark/streaming/FailureSuite.scala | 27 1 file changed, 17 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e1b50c7d/streaming/src/test/scala/org/apache/spark/streaming/FailureSuite.scala -- diff --git a/streaming/src/test/scala/org/apache/spark/streaming/FailureSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/FailureSuite.scala index 0c4c065..e82c2fa 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/FailureSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/FailureSuite.scala @@ -17,25 +17,32 @@ package org.apache.spark.streaming -import org.apache.spark.Logging +import java.io.File + +import org.scalatest.BeforeAndAfter + +import org.apache.spark.{SparkFunSuite, Logging} import org.apache.spark.util.Utils /** * This testsuite tests master failures at random times while the stream is running using * the real clock. */ -class FailureSuite extends TestSuiteBase with Logging { +class FailureSuite extends SparkFunSuite with BeforeAndAfter with Logging { - val directory = Utils.createTempDir() - val numBatches = 30 + private val batchDuration: Duration = Milliseconds(1000) + private val numBatches = 30 + private var directory: File = null - override def batchDuration: Duration = Milliseconds(1000) - - override def useManualClock: Boolean = false + before { +directory = Utils.createTempDir() + } - override def afterFunction() { -Utils.deleteRecursively(directory) -super.afterFunction() + after { +if (directory != null) { + Utils.deleteRecursively(directory) +} +StreamingContext.getActive().foreach { _.stop() } } test(multiple failures with map) { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-9574] [STREAMING] Remove unnecessary contents of spark-streaming-XXX-assembly jars
Repository: spark Updated Branches: refs/heads/branch-1.5 9bd2e6f7c - 2bccd918f [SPARK-9574] [STREAMING] Remove unnecessary contents of spark-streaming-XXX-assembly jars Removed contents already included in Spark assembly jar from spark-streaming-XXX-assembly jars. Author: zsxwing zsxw...@gmail.com Closes #8069 from zsxwing/SPARK-9574. (cherry picked from commit bf1d6614dcb8f5974e62e406d9c0f8aac52556d3) Signed-off-by: Tathagata Das tathagata.das1...@gmail.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2bccd918 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2bccd918 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2bccd918 Branch: refs/heads/branch-1.5 Commit: 2bccd918fcd278f0b544e61b9675ecdf2d6974b3 Parents: 9bd2e6f Author: zsxwing zsxw...@gmail.com Authored: Tue Aug 18 13:35:45 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Tue Aug 18 13:36:25 2015 -0700 -- external/flume-assembly/pom.xml | 11 + external/kafka-assembly/pom.xml | 84 external/mqtt-assembly/pom.xml | 74 extras/kinesis-asl-assembly/pom.xml | 79 ++ pom.xml | 2 +- 5 files changed, 249 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2bccd918/external/flume-assembly/pom.xml -- diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index 1318959..e05e431 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -69,6 +69,11 @@ scopeprovided/scope /dependency dependency + groupIdcommons-lang/groupId + artifactIdcommons-lang/artifactId + scopeprovided/scope +/dependency +dependency groupIdcommons-net/groupId artifactIdcommons-net/artifactId scopeprovided/scope @@ -89,6 +94,12 @@ scopeprovided/scope /dependency dependency + groupIdorg.apache.avro/groupId + artifactIdavro-mapred/artifactId + classifier${avro.mapred.classifier}/classifier + scopeprovided/scope +/dependency +dependency groupIdorg.scala-lang/groupId artifactIdscala-library/artifactId scopeprovided/scope http://git-wip-us.apache.org/repos/asf/spark/blob/2bccd918/external/kafka-assembly/pom.xml -- diff --git a/external/kafka-assembly/pom.xml b/external/kafka-assembly/pom.xml index 977514f..36342f3 100644 --- a/external/kafka-assembly/pom.xml +++ b/external/kafka-assembly/pom.xml @@ -47,6 +47,90 @@ version${project.version}/version scopeprovided/scope /dependency +!-- + Demote already included in the Spark assembly. 
+-- +dependency + groupIdcommons-codec/groupId + artifactIdcommons-codec/artifactId + scopeprovided/scope +/dependency +dependency + groupIdcommons-lang/groupId + artifactIdcommons-lang/artifactId + scopeprovided/scope +/dependency +dependency + groupIdcom.google.protobuf/groupId + artifactIdprotobuf-java/artifactId + scopeprovided/scope +/dependency +dependency + groupIdcom.sun.jersey/groupId + artifactIdjersey-server/artifactId + scopeprovided/scope +/dependency +dependency + groupIdcom.sun.jersey/groupId + artifactIdjersey-core/artifactId + scopeprovided/scope +/dependency +dependency + groupIdnet.jpountz.lz4/groupId + artifactIdlz4/artifactId + scopeprovided/scope +/dependency +dependency + groupIdorg.apache.hadoop/groupId + artifactIdhadoop-client/artifactId + scopeprovided/scope +/dependency +dependency + groupIdorg.apache.avro/groupId + artifactIdavro-mapred/artifactId + classifier${avro.mapred.classifier}/classifier + scopeprovided/scope +/dependency +dependency + groupIdorg.apache.curator/groupId + artifactIdcurator-recipes/artifactId + scopeprovided/scope +/dependency +dependency + groupIdorg.apache.zookeeper/groupId + artifactIdzookeeper/artifactId + scopeprovided/scope +/dependency +dependency + groupIdlog4j/groupId + artifactIdlog4j/artifactId + scopeprovided/scope +/dependency +dependency + groupIdnet.java.dev.jets3t/groupId + artifactIdjets3t/artifactId + scopeprovided/scope +/dependency +dependency + groupIdorg.scala-lang/groupId + artifactIdscala-library/artifactId + scopeprovided/scope +/dependency +dependency + groupIdorg.slf4j/groupId +
spark git commit: [SPARK-9966] [STREAMING] Handle a couple of corner cases in PIDRateEstimator
Repository: spark Updated Branches: refs/heads/branch-1.5 5bbb2d327 -> 612b4609b [SPARK-9966] [STREAMING] Handle a couple of corner cases in PIDRateEstimator 1. The rate estimator should not estimate any rate when there are no records in the batch, as there is no data to estimate the rate. In the current state, it estimates and sets the rate to zero. That is incorrect. 2. The rate estimator should never set the rate to zero under any circumstances. Otherwise the system will stop receiving data, and stop generating useful estimates (see reason 1). So the fix is to define a parameter that sets a lower bound on the estimated rate, so that the system always receives some data. Author: Tathagata Das tathagata.das1...@gmail.com Closes #8199 from tdas/SPARK-9966 and squashes the following commits: 829f793 [Tathagata Das] Fixed unit test and added comments 3a994db [Tathagata Das] Added min rate and updated tests in PIDRateEstimator (cherry picked from commit f3bfb711c1742d0915e43bda8230b4d1d22b4190) Signed-off-by: Tathagata Das tathagata.das1...@gmail.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/612b4609 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/612b4609 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/612b4609 Branch: refs/heads/branch-1.5 Commit: 612b4609bdd38763725ae07d77c2176aa6756e64 Parents: 5bbb2d3 Author: Tathagata Das tathagata.das1...@gmail.com Authored: Fri Aug 14 15:10:01 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Fri Aug 14 15:10:19 2015 -0700 -- .../scheduler/rate/PIDRateEstimator.scala | 46 +--- .../scheduler/rate/RateEstimator.scala | 4 +- .../scheduler/rate/PIDRateEstimatorSuite.scala | 79 +--- 3 files changed, 87 insertions(+), 42 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/612b4609/streaming/src/main/scala/org/apache/spark/streaming/scheduler/rate/PIDRateEstimator.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/rate/PIDRateEstimator.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/rate/PIDRateEstimator.scala index 6ae56a6..84a3ca9 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/rate/PIDRateEstimator.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/rate/PIDRateEstimator.scala @@ -17,6 +17,8 @@ package org.apache.spark.streaming.scheduler.rate +import org.apache.spark.Logging + /** * Implements a proportional-integral-derivative (PID) controller which acts on * the speed of ingestion of elements into Spark Streaming. A PID controller works @@ -26,7 +28,7 @@ package org.apache.spark.streaming.scheduler.rate * * @see https://en.wikipedia.org/wiki/PID_controller * - * @param batchDurationMillis the batch duration, in milliseconds + * @param batchIntervalMillis the batch duration, in milliseconds * @param proportional how much the correction should depend on the current *error. This term usually provides the bulk of correction and should be positive or zero. *A value too large would make the controller overshoot the setpoint, while a small value @@ -39,13 +41,17 @@ package org.apache.spark.streaming.scheduler.rate *of future errors, based on current rate of change. This value should be positive or 0. *This term is not used very often, as it impacts stability of the system. The default *value is 0. + * @param minRate what is the minimum rate that can be estimated.
+ *This must be greater than zero, so that the system always receives some data for rate + *estimation to work. */ private[streaming] class PIDRateEstimator( batchIntervalMillis: Long, -proportional: Double = 1D, -integral: Double = .2D, -derivative: Double = 0D) - extends RateEstimator { +proportional: Double, +integral: Double, +derivative: Double, +minRate: Double + ) extends RateEstimator with Logging { private var firstRun: Boolean = true private var latestTime: Long = -1L @@ -64,16 +70,23 @@ private[streaming] class PIDRateEstimator( require( derivative = 0, sDerivative term $derivative in PIDRateEstimator should be = 0.) + require( +minRate 0, +sMinimum rate in PIDRateEstimator should be 0) + logInfo(sCreated PIDRateEstimator with proportional = $proportional, integral = $integral, + +sderivative = $derivative, min rate = $minRate) - def compute(time: Long, // in milliseconds + def compute( + time: Long, // in milliseconds numElements: Long, processingDelay: Long, // in milliseconds schedulingDelay: Long
spark git commit: [SPARK-9966] [STREAMING] Handle a couple of corner cases in PIDRateEstimator
Repository: spark Updated Branches: refs/heads/master 1150a19b1 -> f3bfb711c [SPARK-9966] [STREAMING] Handle a couple of corner cases in PIDRateEstimator 1. The rate estimator should not estimate any rate when there are no records in the batch, as there is no data to estimate the rate. In the current state, it estimates and sets the rate to zero. That is incorrect. 2. The rate estimator should never set the rate to zero under any circumstances. Otherwise the system will stop receiving data, and stop generating useful estimates (see reason 1). So the fix is to define a parameter that sets a lower bound on the estimated rate, so that the system always receives some data. Author: Tathagata Das tathagata.das1...@gmail.com Closes #8199 from tdas/SPARK-9966 and squashes the following commits: 829f793 [Tathagata Das] Fixed unit test and added comments 3a994db [Tathagata Das] Added min rate and updated tests in PIDRateEstimator Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f3bfb711 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f3bfb711 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f3bfb711 Branch: refs/heads/master Commit: f3bfb711c1742d0915e43bda8230b4d1d22b4190 Parents: 1150a19 Author: Tathagata Das tathagata.das1...@gmail.com Authored: Fri Aug 14 15:10:01 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Fri Aug 14 15:10:01 2015 -0700 -- .../scheduler/rate/PIDRateEstimator.scala | 46 +--- .../scheduler/rate/RateEstimator.scala | 4 +- .../scheduler/rate/PIDRateEstimatorSuite.scala | 79 +--- 3 files changed, 87 insertions(+), 42 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f3bfb711/streaming/src/main/scala/org/apache/spark/streaming/scheduler/rate/PIDRateEstimator.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/rate/PIDRateEstimator.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/rate/PIDRateEstimator.scala index 6ae56a6..84a3ca9 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/rate/PIDRateEstimator.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/rate/PIDRateEstimator.scala @@ -17,6 +17,8 @@ package org.apache.spark.streaming.scheduler.rate +import org.apache.spark.Logging + /** * Implements a proportional-integral-derivative (PID) controller which acts on * the speed of ingestion of elements into Spark Streaming. A PID controller works @@ -26,7 +28,7 @@ package org.apache.spark.streaming.scheduler.rate * * @see https://en.wikipedia.org/wiki/PID_controller * - * @param batchDurationMillis the batch duration, in milliseconds + * @param batchIntervalMillis the batch duration, in milliseconds * @param proportional how much the correction should depend on the current *error. This term usually provides the bulk of correction and should be positive or zero. *A value too large would make the controller overshoot the setpoint, while a small value @@ -39,13 +41,17 @@ package org.apache.spark.streaming.scheduler.rate *of future errors, based on current rate of change. This value should be positive or 0. *This term is not used very often, as it impacts stability of the system. The default *value is 0. + * @param minRate what is the minimum rate that can be estimated. + *This must be greater than zero, so that the system always receives some data for rate + *estimation to work.
*/ private[streaming] class PIDRateEstimator( batchIntervalMillis: Long, -proportional: Double = 1D, -integral: Double = .2D, -derivative: Double = 0D) - extends RateEstimator { +proportional: Double, +integral: Double, +derivative: Double, +minRate: Double + ) extends RateEstimator with Logging { private var firstRun: Boolean = true private var latestTime: Long = -1L @@ -64,16 +70,23 @@ private[streaming] class PIDRateEstimator( require( derivative = 0, sDerivative term $derivative in PIDRateEstimator should be = 0.) + require( +minRate 0, +sMinimum rate in PIDRateEstimator should be 0) + logInfo(sCreated PIDRateEstimator with proportional = $proportional, integral = $integral, + +sderivative = $derivative, min rate = $minRate) - def compute(time: Long, // in milliseconds + def compute( + time: Long, // in milliseconds numElements: Long, processingDelay: Long, // in milliseconds schedulingDelay: Long // in milliseconds ): Option[Double] = { - +logTrace(s\ntime = $time, # records = $numElements, + + sprocessing time
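A compact sketch of the two guards, simplified relative to the real PIDRateEstimator (derivative term omitted, structure condensed; constants are illustrative): skip estimation for empty batches instead of estimating zero, and clamp the output to a strictly positive floor:

```scala
class PidSketch(batchIntervalMillis: Long, proportional: Double,
                integral: Double, minRate: Double) {
  require(minRate > 0, "minRate must be > 0 so the stream keeps receiving data")

  private var firstRun = true
  private var latestTime = -1L
  private var latestRate = -1.0

  def compute(time: Long, numElements: Long,
              processingDelayMs: Long, schedulingDelayMs: Long): Option[Double] =
    this.synchronized {
      if (time <= latestTime || numElements == 0 || processingDelayMs == 0) {
        // Empty batch: no signal, so make no estimate (estimating zero here
        // was corner case 1 above).
        None
      } else {
        val processingRate = numElements.toDouble / processingDelayMs * 1000
        val error = latestRate - processingRate
        // Backlog accumulated during scheduling delay, per batch interval.
        val historicalError =
          schedulingDelayMs.toDouble * processingRate / batchIntervalMillis
        latestTime = time
        if (firstRun) {
          firstRun = false
          latestRate = processingRate
          None
        } else {
          // Corner case 2: clamp so the estimate can never reach zero.
          latestRate = math.max(
            latestRate - proportional * error - integral * historicalError, minRate)
          Some(latestRate)
        }
      }
    }
}
```

A caller would invoke compute once per completed batch with (time, record count, processing delay, scheduling delay) and apply any Some(rate) as the new receiver ingestion limit.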
spark git commit: [SPARK-9968] [STREAMING] Reduced time spent within synchronized block to prevent lock starvation
Repository: spark Updated Branches: refs/heads/branch-1.5 612b4609b - 8d2624790 [SPARK-9968] [STREAMING] Reduced time spent within synchronized block to prevent lock starvation When the rate limiter is actually limiting the rate at which data is inserted into the buffer, the synchronized block of BlockGenerator.addData stays blocked for long time. This causes the thread switching the buffer and generating blocks (synchronized with addData) to starve and not generate blocks for seconds. The correct solution is to not block on the rate limiter within the synchronized block for adding data to the buffer. Author: Tathagata Das tathagata.das1...@gmail.com Closes #8204 from tdas/SPARK-9968 and squashes the following commits: 8cbcc1b [Tathagata Das] Removed unused val a73b645 [Tathagata Das] Reduced time spent within synchronized block (cherry picked from commit 18a761ef7a01a4dfa1dd91abe78cd68f2f8fdb67) Signed-off-by: Tathagata Das tathagata.das1...@gmail.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8d262479 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8d262479 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8d262479 Branch: refs/heads/branch-1.5 Commit: 8d26247903a1b594df6e202f0834ed165f47bbdc Parents: 612b460 Author: Tathagata Das tathagata.das1...@gmail.com Authored: Fri Aug 14 15:54:14 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Fri Aug 14 15:54:33 2015 -0700 -- .../streaming/receiver/BlockGenerator.scala | 40 1 file changed, 32 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8d262479/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala b/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala index 794dece..300e820 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala @@ -155,10 +155,17 @@ private[streaming] class BlockGenerator( /** * Push a single data item into the buffer. */ - def addData(data: Any): Unit = synchronized { + def addData(data: Any): Unit = { if (state == Active) { waitToPush() - currentBuffer += data + synchronized { +if (state == Active) { + currentBuffer += data +} else { + throw new SparkException( +Cannot add data as BlockGenerator has not been started or has been stopped) +} + } } else { throw new SparkException( Cannot add data as BlockGenerator has not been started or has been stopped) @@ -169,11 +176,18 @@ private[streaming] class BlockGenerator( * Push a single data item into the buffer. After buffering the data, the * `BlockGeneratorListener.onAddData` callback will be called. 
*/ - def addDataWithCallback(data: Any, metadata: Any): Unit = synchronized { + def addDataWithCallback(data: Any, metadata: Any): Unit = { if (state == Active) { waitToPush() - currentBuffer += data - listener.onAddData(data, metadata) + synchronized { +if (state == Active) { + currentBuffer += data + listener.onAddData(data, metadata) +} else { + throw new SparkException( +"Cannot add data as BlockGenerator has not been started or has been stopped") +} + } } else { throw new SparkException( "Cannot add data as BlockGenerator has not been started or has been stopped") @@ -185,13 +199,23 @@ private[streaming] class BlockGenerator( * `BlockGeneratorListener.onAddData` callback will be called. Note that all the data items * are atomically added to the buffer, and are hence guaranteed to be present in a single block. */ - def addMultipleDataWithCallback(dataIterator: Iterator[Any], metadata: Any): Unit = synchronized { + def addMultipleDataWithCallback(dataIterator: Iterator[Any], metadata: Any): Unit = { if (state == Active) { + // Unroll iterator into a temp buffer, and wait for pushing in the process + val tempBuffer = new ArrayBuffer[Any] dataIterator.foreach { data => waitToPush() -currentBuffer += data +tempBuffer += data + } + synchronized { +if (state == Active) { + currentBuffer ++= tempBuffer + listener.onAddData(tempBuffer, metadata) +} else { + throw new SparkException( +"Cannot add data as BlockGenerator has not been started or has been
spark git commit: [SPARK-9968] [STREAMING] Reduced time spent within synchronized block to prevent lock starvation
Repository: spark Updated Branches: refs/heads/master f3bfb711c -> 18a761ef7 [SPARK-9968] [STREAMING] Reduced time spent within synchronized block to prevent lock starvation When the rate limiter is actually limiting the rate at which data is inserted into the buffer, the synchronized block of BlockGenerator.addData stays blocked for a long time. This causes the thread switching the buffer and generating blocks (synchronized with addData) to starve and not generate blocks for seconds. The correct solution is to not block on the rate limiter within the synchronized block for adding data to the buffer. Author: Tathagata Das tathagata.das1...@gmail.com Closes #8204 from tdas/SPARK-9968 and squashes the following commits: 8cbcc1b [Tathagata Das] Removed unused val a73b645 [Tathagata Das] Reduced time spent within synchronized block Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/18a761ef Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/18a761ef Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/18a761ef Branch: refs/heads/master Commit: 18a761ef7a01a4dfa1dd91abe78cd68f2f8fdb67 Parents: f3bfb71 Author: Tathagata Das tathagata.das1...@gmail.com Authored: Fri Aug 14 15:54:14 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Fri Aug 14 15:54:14 2015 -0700 -- .../streaming/receiver/BlockGenerator.scala | 40 1 file changed, 32 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/18a761ef/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala b/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala index 794dece..300e820 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala @@ -155,10 +155,17 @@ private[streaming] class BlockGenerator( /** * Push a single data item into the buffer. */ - def addData(data: Any): Unit = synchronized { + def addData(data: Any): Unit = { if (state == Active) { waitToPush() - currentBuffer += data + synchronized { +if (state == Active) { + currentBuffer += data +} else { + throw new SparkException( +"Cannot add data as BlockGenerator has not been started or has been stopped") +} + } } else { throw new SparkException( "Cannot add data as BlockGenerator has not been started or has been stopped") @@ -169,11 +176,18 @@ private[streaming] class BlockGenerator( * Push a single data item into the buffer. After buffering the data, the * `BlockGeneratorListener.onAddData` callback will be called. */ - def addDataWithCallback(data: Any, metadata: Any): Unit = synchronized { + def addDataWithCallback(data: Any, metadata: Any): Unit = { if (state == Active) { waitToPush() - currentBuffer += data - listener.onAddData(data, metadata) + synchronized { +if (state == Active) { + currentBuffer += data + listener.onAddData(data, metadata) +} else { + throw new SparkException( +"Cannot add data as BlockGenerator has not been started or has been stopped") +} + } } else { throw new SparkException( "Cannot add data as BlockGenerator has not been started or has been stopped") @@ -185,13 +199,23 @@ private[streaming] class BlockGenerator( * `BlockGeneratorListener.onAddData` callback will be called.
Note that all the data items * are atomically added to the buffer, and are hence guaranteed to be present in a single block. */ - def addMultipleDataWithCallback(dataIterator: Iterator[Any], metadata: Any): Unit = synchronized { + def addMultipleDataWithCallback(dataIterator: Iterator[Any], metadata: Any): Unit = { if (state == Active) { + // Unroll iterator into a temp buffer, and wait for pushing in the process + val tempBuffer = new ArrayBuffer[Any] dataIterator.foreach { data => waitToPush() -currentBuffer += data +tempBuffer += data + } + synchronized { +if (state == Active) { + currentBuffer ++= tempBuffer + listener.onAddData(tempBuffer, metadata) +} else { + throw new SparkException( +"Cannot add data as BlockGenerator has not been started or has been stopped") +} } - listener.onAddData(dataIterator, metadata) } else { throw new SparkException
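The pattern behind this fix generalizes: perform the potentially blocking call outside the lock, then re-check shared state inside a short critical section before mutating. A minimal sketch, with a sleep-based stand-in for the real rate limiter; the class and member names are illustrative, not Spark's.

    import scala.collection.mutable.ArrayBuffer

    class ThrottledBuffer(maxPerSecond: Long) {
      private val buffer = new ArrayBuffer[Any]
      @volatile private var active = true

      private def waitToPush(): Unit =
        Thread.sleep(1000L / maxPerSecond) // stand-in for a real rate limiter

      def add(data: Any): Unit = {
        if (!active) throw new IllegalStateException("buffer is stopped")
        waitToPush()          // may block for a long time; no lock held here
        synchronized {        // short critical section
          if (!active) {      // state may have changed while we waited
            throw new IllegalStateException("buffer is stopped")
          }
          buffer += data
        }
      }

      // the block-generating side contends only for the short section above
      def drain(): Seq[Any] = synchronized {
        val out = buffer.toList
        buffer.clear()
        out
      }

      def stop(): Unit = synchronized { active = false }
    }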
spark git commit: [SPARK-9780] [STREAMING] [KAFKA] prevent NPE if KafkaRDD instantiation …
Repository: spark Updated Branches: refs/heads/master 660e6dcff -> 8ce60963c [SPARK-9780] [STREAMING] [KAFKA] prevent NPE if KafkaRDD instantiation … …fails Author: cody koeninger c...@koeninger.org Closes #8133 from koeninger/SPARK-9780 and squashes the following commits: 406259d [cody koeninger] [SPARK-9780][Streaming][Kafka] prevent NPE if KafkaRDD instantiation fails Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8ce60963 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8ce60963 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8ce60963 Branch: refs/heads/master Commit: 8ce60963cb0928058ef7b6e29ff94eb69d1143af Parents: 660e6dc Author: cody koeninger c...@koeninger.org Authored: Wed Aug 12 17:44:16 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Wed Aug 12 17:44:16 2015 -0700 -- .../main/scala/org/apache/spark/streaming/kafka/KafkaRDD.scala | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8ce60963/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaRDD.scala -- diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaRDD.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaRDD.scala index 1a9d78c..ea5f842 100644 --- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaRDD.scala +++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaRDD.scala @@ -197,7 +197,11 @@ class KafkaRDD[ .dropWhile(_.offset < requestOffset) } -override def close(): Unit = consumer.close() +override def close(): Unit = { + if (consumer != null) { +consumer.close() + } +} override def getNext(): R = { if (iter == null || !iter.hasNext) {
spark git commit: [SPARK-9780] [STREAMING] [KAFKA] prevent NPE if KafkaRDD instantiation …
Repository: spark Updated Branches: refs/heads/branch-1.5 3298fb69f -> 62ab2a4c6 [SPARK-9780] [STREAMING] [KAFKA] prevent NPE if KafkaRDD instantiation … …fails Author: cody koeninger c...@koeninger.org Closes #8133 from koeninger/SPARK-9780 and squashes the following commits: 406259d [cody koeninger] [SPARK-9780][Streaming][Kafka] prevent NPE if KafkaRDD instantiation fails (cherry picked from commit 8ce60963cb0928058ef7b6e29ff94eb69d1143af) Signed-off-by: Tathagata Das tathagata.das1...@gmail.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/62ab2a4c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/62ab2a4c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/62ab2a4c Branch: refs/heads/branch-1.5 Commit: 62ab2a4c6b4b0cf4875ac1291562660b4b77cac4 Parents: 3298fb6 Author: cody koeninger c...@koeninger.org Authored: Wed Aug 12 17:44:16 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Wed Aug 12 17:44:27 2015 -0700 -- .../main/scala/org/apache/spark/streaming/kafka/KafkaRDD.scala | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/62ab2a4c/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaRDD.scala -- diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaRDD.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaRDD.scala index 1a9d78c..ea5f842 100644 --- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaRDD.scala +++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaRDD.scala @@ -197,7 +197,11 @@ class KafkaRDD[ .dropWhile(_.offset < requestOffset) } -override def close(): Unit = consumer.close() +override def close(): Unit = { + if (consumer != null) { +consumer.close() + } +} override def getNext(): R = { if (iter == null || !iter.hasNext) {
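The hazard the commit title points at: if iterator construction fails before the consumer is assigned, the cleanup path runs against a null field and itself throws. A minimal sketch of the same defensive-close pattern; the class and members are illustrative names, not Spark's types.

    import java.io.Closeable

    // A resource holder whose setup can fail partway through construction.
    class PartitionReader(connect: () => Closeable) extends Closeable {
      private var consumer: Closeable = _ // remains null if connect() throws

      def open(): Unit = { consumer = connect() }

      override def close(): Unit = {
        if (consumer != null) { // the guard the fix above adds
          consumer.close()
        }
      }
    }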
spark git commit: [SPARK-9640] [STREAMING] [TEST] Do not run Python Kinesis tests when the Kinesis assembly JAR has not been generated
Repository: spark Updated Branches: refs/heads/branch-1.5 f9beef998 -> c7f009040 [SPARK-9640] [STREAMING] [TEST] Do not run Python Kinesis tests when the Kinesis assembly JAR has not been generated Author: Tathagata Das tathagata.das1...@gmail.com Closes #7961 from tdas/SPARK-9640 and squashes the following commits: 974ce19 [Tathagata Das] Undo changes related to SPARK-9727 004ae26 [Tathagata Das] style fixes 9bbb97d [Tathagata Das] Minor style fies e6a677e [Tathagata Das] Merge remote-tracking branch 'apache-github/master' into SPARK-9640 ca90719 [Tathagata Das] Removed extra line ba9cfc7 [Tathagata Das] Improved kinesis test selection logic 88d59bd [Tathagata Das] updated test modules 871fcc8 [Tathagata Das] Fixed SparkBuild 94be631 [Tathagata Das] Fixed style b858196 [Tathagata Das] Fixed conditions and few other things based on PR comments. e292e64 [Tathagata Das] Added filters for Kinesis python tests (cherry picked from commit 0f90d6055e5bea9ceb1d454db84f4aa1d59b284d) Signed-off-by: Tathagata Das tathagata.das1...@gmail.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c7f00904 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c7f00904 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c7f00904 Branch: refs/heads/branch-1.5 Commit: c7f0090409c2a94a43404271730beded421a0f2f Parents: f9beef9 Author: Tathagata Das tathagata.das1...@gmail.com Authored: Mon Aug 10 23:41:53 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Mon Aug 10 23:42:44 2015 -0700 -- python/pyspark/streaming/tests.py | 56 ++ 1 file changed, 44 insertions(+), 12 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c7f00904/python/pyspark/streaming/tests.py -- diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py index 66ae334..f0ed415 100644 --- a/python/pyspark/streaming/tests.py +++ b/python/pyspark/streaming/tests.py @@ -971,8 +971,10 @@ class KinesisStreamTests(PySparkStreamingTestCase): awsAccessKey, awsSecretKey) def test_kinesis_stream(self): -if os.environ.get('ENABLE_KINESIS_TESTS') != '1': -print("Skip test_kinesis_stream") +if not are_kinesis_tests_enabled: +sys.stderr.write( +"Skipped test_kinesis_stream (enable by setting environment variable %s=1" +% kinesis_test_environ_var) return import random @@ -1013,6 +1015,7 @@ class KinesisStreamTests(PySparkStreamingTestCase): traceback.print_exc() raise finally: +self.ssc.stop(False) kinesisTestUtils.deleteStream() kinesisTestUtils.deleteDynamoDBTable(kinesisAppName) @@ -1027,7 +1030,7 @@ def search_kafka_assembly_jar(): ("Failed to find Spark Streaming kafka assembly jar in %s. " % kafka_assembly_dir) + "You need to build Spark with 'build/sbt assembly/assembly streaming-kafka-assembly/assembly' or " -"'build/mvn package' before running this test") +"'build/mvn package' before running this test.") elif len(jars) > 1: raise Exception(("Found multiple Spark Streaming Kafka assembly JARs in %s; please remove all but one") % kafka_assembly_dir) @@ -1045,7 +1048,7 @@ def search_flume_assembly_jar(): ("Failed to find Spark Streaming Flume assembly jar in %s. " % flume_assembly_dir) + "You need to build Spark with 'build/sbt assembly/assembly streaming-flume-assembly/assembly' or " -"'build/mvn package' before running this test") +"'build/mvn package' before running this test.")
elif len(jars) > 1: raise Exception(("Found multiple Spark Streaming Flume assembly JARs in %s; please remove all but one") % flume_assembly_dir) @@ -1095,11 +1098,7 @@ def search_kinesis_asl_assembly_jar(): os.path.join(kinesis_asl_assembly_dir, "target/scala-*/spark-streaming-kinesis-asl-assembly-*.jar")) if not jars: -raise Exception( -("Failed to find Spark Streaming Kinesis ASL assembly jar in %s. " % - kinesis_asl_assembly_dir) + "You need to build Spark with " -"'build/sbt -Pkinesis-asl assembly/assembly streaming-kinesis-asl-assembly/assembly' " -"or 'build/mvn -Pkinesis-asl package' before running this test") +return None elif len(jars) > 1: raise Exception(("Found multiple Spark Streaming Kinesis ASL assembly JARs in %s; please remove all but one") % kinesis_asl_assembly_dir) @@ -1107,6 +1106,10
spark git commit: [SPARK-9640] [STREAMING] [TEST] Do not run Python Kinesis tests when the Kinesis assembly JAR has not been generated
Repository: spark Updated Branches: refs/heads/master 91e9389f3 -> 0f90d6055 [SPARK-9640] [STREAMING] [TEST] Do not run Python Kinesis tests when the Kinesis assembly JAR has not been generated Author: Tathagata Das tathagata.das1...@gmail.com Closes #7961 from tdas/SPARK-9640 and squashes the following commits: 974ce19 [Tathagata Das] Undo changes related to SPARK-9727 004ae26 [Tathagata Das] style fixes 9bbb97d [Tathagata Das] Minor style fies e6a677e [Tathagata Das] Merge remote-tracking branch 'apache-github/master' into SPARK-9640 ca90719 [Tathagata Das] Removed extra line ba9cfc7 [Tathagata Das] Improved kinesis test selection logic 88d59bd [Tathagata Das] updated test modules 871fcc8 [Tathagata Das] Fixed SparkBuild 94be631 [Tathagata Das] Fixed style b858196 [Tathagata Das] Fixed conditions and few other things based on PR comments. e292e64 [Tathagata Das] Added filters for Kinesis python tests Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0f90d605 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0f90d605 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0f90d605 Branch: refs/heads/master Commit: 0f90d6055e5bea9ceb1d454db84f4aa1d59b284d Parents: 91e9389 Author: Tathagata Das tathagata.das1...@gmail.com Authored: Mon Aug 10 23:41:53 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Mon Aug 10 23:41:53 2015 -0700 -- python/pyspark/streaming/tests.py | 56 ++ 1 file changed, 44 insertions(+), 12 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0f90d605/python/pyspark/streaming/tests.py -- diff --git a/python/pyspark/streaming/tests.py b/python/pyspark/streaming/tests.py index 66ae334..f0ed415 100644 --- a/python/pyspark/streaming/tests.py +++ b/python/pyspark/streaming/tests.py @@ -971,8 +971,10 @@ class KinesisStreamTests(PySparkStreamingTestCase): awsAccessKey, awsSecretKey) def test_kinesis_stream(self): -if os.environ.get('ENABLE_KINESIS_TESTS') != '1': -print("Skip test_kinesis_stream") +if not are_kinesis_tests_enabled: +sys.stderr.write( +"Skipped test_kinesis_stream (enable by setting environment variable %s=1" +% kinesis_test_environ_var) return import random @@ -1013,6 +1015,7 @@ class KinesisStreamTests(PySparkStreamingTestCase): traceback.print_exc() raise finally: +self.ssc.stop(False) kinesisTestUtils.deleteStream() kinesisTestUtils.deleteDynamoDBTable(kinesisAppName) @@ -1027,7 +1030,7 @@ def search_kafka_assembly_jar(): ("Failed to find Spark Streaming kafka assembly jar in %s. " % kafka_assembly_dir) + "You need to build Spark with 'build/sbt assembly/assembly streaming-kafka-assembly/assembly' or " -"'build/mvn package' before running this test") +"'build/mvn package' before running this test.") elif len(jars) > 1: raise Exception(("Found multiple Spark Streaming Kafka assembly JARs in %s; please remove all but one") % kafka_assembly_dir) @@ -1045,7 +1048,7 @@ def search_flume_assembly_jar(): ("Failed to find Spark Streaming Flume assembly jar in %s. " % flume_assembly_dir) + "You need to build Spark with 'build/sbt assembly/assembly streaming-flume-assembly/assembly' or " -"'build/mvn package' before running this test") +"'build/mvn package' before running this test.")
elif len(jars) > 1: raise Exception(("Found multiple Spark Streaming Flume assembly JARs in %s; please remove all but one") % flume_assembly_dir) @@ -1095,11 +1098,7 @@ def search_kinesis_asl_assembly_jar(): os.path.join(kinesis_asl_assembly_dir, "target/scala-*/spark-streaming-kinesis-asl-assembly-*.jar")) if not jars: -raise Exception( -("Failed to find Spark Streaming Kinesis ASL assembly jar in %s. " % - kinesis_asl_assembly_dir) + "You need to build Spark with " -"'build/sbt -Pkinesis-asl assembly/assembly streaming-kinesis-asl-assembly/assembly' " -"or 'build/mvn -Pkinesis-asl package' before running this test") +return None elif len(jars) > 1: raise Exception(("Found multiple Spark Streaming Kinesis ASL assembly JARs in %s; please remove all but one") % kinesis_asl_assembly_dir) @@ -1107,6 +1106,10 @@ def search_kinesis_asl_assembly_jar(): return jars[0] +# Must be same as the variable and condition defined
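The policy change, in miniature: a missing optional assembly jar now disables the dependent tests rather than failing the run, while ambiguous matches remain an error. A sketch of that discovery logic under illustrative assumptions about directory layout and naming:

    import java.io.File

    def findOptionalAssembly(dir: File, namePattern: String): Option[File] = {
      val jars = Option(dir.listFiles()).getOrElse(Array.empty[File])
        .filter(_.getName.matches(namePattern))
      jars.length match {
        case 0 => None // caller skips the Kinesis tests instead of raising
        case 1 => Some(jars.head)
        case n => sys.error(s"Found $n matching assembly JARs in $dir; remove all but one")
      }
    }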
spark git commit: [SPARK-9727] [STREAMING] [BUILD] Updated streaming kinesis SBT project name to be more consistent
Repository: spark Updated Branches: refs/heads/master 55752d883 -> 600031ebe [SPARK-9727] [STREAMING] [BUILD] Updated streaming kinesis SBT project name to be more consistent Author: Tathagata Das tathagata.das1...@gmail.com Closes #8092 from tdas/SPARK-9727 and squashes the following commits: b1b01fd [Tathagata Das] Updated streaming kinesis project name Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/600031eb Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/600031eb Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/600031eb Branch: refs/heads/master Commit: 600031ebe27473d8fffe6ea436c2149223b82896 Parents: 55752d8 Author: Tathagata Das tathagata.das1...@gmail.com Authored: Tue Aug 11 02:41:03 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Tue Aug 11 02:41:03 2015 -0700 -- dev/sparktestsupport/modules.py | 4 ++-- extras/kinesis-asl/pom.xml | 2 +- project/SparkBuild.scala| 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/600031eb/dev/sparktestsupport/modules.py -- diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index d82c0cc..346452f 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -134,7 +134,7 @@ streaming = Module( # files in streaming_kinesis_asl are changed, so that if Kinesis experiences an outage, we don't # fail other PRs. streaming_kinesis_asl = Module( -name="kinesis-asl", +name="streaming-kinesis-asl", dependencies=[], source_file_regexes=[ "extras/kinesis-asl/", @@ -147,7 +147,7 @@ streaming_kinesis_asl = Module( "ENABLE_KINESIS_TESTS": "1" }, sbt_test_goals=[ -"kinesis-asl/test", +"streaming-kinesis-asl/test", ] ) http://git-wip-us.apache.org/repos/asf/spark/blob/600031eb/extras/kinesis-asl/pom.xml -- diff --git a/extras/kinesis-asl/pom.xml b/extras/kinesis-asl/pom.xml index c242e7a..521b53e 100644 --- a/extras/kinesis-asl/pom.xml +++ b/extras/kinesis-asl/pom.xml @@ -31,7 +31,7 @@ <name>Spark Kinesis Integration</name> <properties> -<sbt.project.name>kinesis-asl</sbt.project.name> +<sbt.project.name>streaming-kinesis-asl</sbt.project.name> </properties> <dependencies> http://git-wip-us.apache.org/repos/asf/spark/blob/600031eb/project/SparkBuild.scala -- diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 41a85fa..cad7067 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -42,8 +42,8 @@ object BuildCommons { "streaming-zeromq", "launcher", "unsafe").map(ProjectRef(buildLocation, _)) val optionallyEnabledProjects@Seq(yarn, yarnStable, java8Tests, sparkGangliaLgpl, -sparkKinesisAsl) = Seq("yarn", "yarn-stable", "java8-tests", "ganglia-lgpl", -"kinesis-asl").map(ProjectRef(buildLocation, _)) +streamingKinesisAsl) = Seq("yarn", "yarn-stable", "java8-tests", "ganglia-lgpl", +"streaming-kinesis-asl").map(ProjectRef(buildLocation, _)) val assemblyProjects@Seq(assembly, examples, networkYarn, streamingFlumeAssembly, streamingKafkaAssembly, streamingMqttAssembly, streamingKinesisAslAssembly) = Seq("assembly", "examples", "network-yarn", "streaming-flume-assembly", "streaming-kafka-assembly", "streaming-mqtt-assembly", "streaming-kinesis-asl-assembly")
spark git commit: [SPARK-9727] [STREAMING] [BUILD] Updated streaming kinesis SBT project name to be more consistent
Repository: spark Updated Branches: refs/heads/branch-1.5 c7f009040 -> ebbd3b616 [SPARK-9727] [STREAMING] [BUILD] Updated streaming kinesis SBT project name to be more consistent Author: Tathagata Das tathagata.das1...@gmail.com Closes #8092 from tdas/SPARK-9727 and squashes the following commits: b1b01fd [Tathagata Das] Updated streaming kinesis project name (cherry picked from commit 600031ebe27473d8fffe6ea436c2149223b82896) Signed-off-by: Tathagata Das tathagata.das1...@gmail.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ebbd3b61 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ebbd3b61 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ebbd3b61 Branch: refs/heads/branch-1.5 Commit: ebbd3b616bf49701c2466bde5193241f69cf3e30 Parents: c7f0090 Author: Tathagata Das tathagata.das1...@gmail.com Authored: Tue Aug 11 02:41:03 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Tue Aug 11 02:41:25 2015 -0700 -- dev/sparktestsupport/modules.py | 4 ++-- extras/kinesis-asl/pom.xml | 2 +- project/SparkBuild.scala| 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ebbd3b61/dev/sparktestsupport/modules.py -- diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index d82c0cc..346452f 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -134,7 +134,7 @@ streaming = Module( # files in streaming_kinesis_asl are changed, so that if Kinesis experiences an outage, we don't # fail other PRs. streaming_kinesis_asl = Module( -name="kinesis-asl", +name="streaming-kinesis-asl", dependencies=[], source_file_regexes=[ "extras/kinesis-asl/", @@ -147,7 +147,7 @@ streaming_kinesis_asl = Module( "ENABLE_KINESIS_TESTS": "1" }, sbt_test_goals=[ -"kinesis-asl/test", +"streaming-kinesis-asl/test", ] ) http://git-wip-us.apache.org/repos/asf/spark/blob/ebbd3b61/extras/kinesis-asl/pom.xml -- diff --git a/extras/kinesis-asl/pom.xml b/extras/kinesis-asl/pom.xml index c242e7a..521b53e 100644 --- a/extras/kinesis-asl/pom.xml +++ b/extras/kinesis-asl/pom.xml @@ -31,7 +31,7 @@ <name>Spark Kinesis Integration</name> <properties> -<sbt.project.name>kinesis-asl</sbt.project.name> +<sbt.project.name>streaming-kinesis-asl</sbt.project.name> </properties> <dependencies> http://git-wip-us.apache.org/repos/asf/spark/blob/ebbd3b61/project/SparkBuild.scala -- diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 41a85fa..cad7067 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -42,8 +42,8 @@ object BuildCommons { "streaming-zeromq", "launcher", "unsafe").map(ProjectRef(buildLocation, _)) val optionallyEnabledProjects@Seq(yarn, yarnStable, java8Tests, sparkGangliaLgpl, -sparkKinesisAsl) = Seq("yarn", "yarn-stable", "java8-tests", "ganglia-lgpl", -"kinesis-asl").map(ProjectRef(buildLocation, _)) +streamingKinesisAsl) = Seq("yarn", "yarn-stable", "java8-tests", "ganglia-lgpl", +"streaming-kinesis-asl").map(ProjectRef(buildLocation, _)) val assemblyProjects@Seq(assembly, examples, networkYarn, streamingFlumeAssembly, streamingKafkaAssembly, streamingMqttAssembly, streamingKinesisAslAssembly) = Seq("assembly", "examples", "network-yarn", "streaming-flume-assembly", "streaming-kafka-assembly", "streaming-mqtt-assembly", "streaming-kinesis-asl-assembly")
spark git commit: [SPARK-9572] [STREAMING] [PYSPARK] Added StreamingContext.getActiveOrCreate() in Python
Repository: spark Updated Branches: refs/heads/master dbd778d84 -> 5b8bb1b21 [SPARK-9572] [STREAMING] [PYSPARK] Added StreamingContext.getActiveOrCreate() in Python Author: Tathagata Das tathagata.das1...@gmail.com Closes #8080 from tdas/SPARK-9572 and squashes the following commits: 64a231d [Tathagata Das] Fix based on comments 741a0d0 [Tathagata Das] Fixed style f4f094c [Tathagata Das] Tweaked test 9afcdbe [Tathagata Das] Merge remote-tracking branch 'apache-github/master' into SPARK-9572 e21488d [Tathagata Das] Minor update 1a371d9 [Tathagata Das] Addressed comments. 60479da [Tathagata Das] Fixed indent 9c2da9c [Tathagata Das] Fixed bugs b5bd32c [Tathagata Das] Merge remote-tracking branch 'apache-github/master' into SPARK-9572 b55b348 [Tathagata Das] Removed prints 5781728 [Tathagata Das] Fix style issues b711214 [Tathagata Das] Reverted run-tests.py 643b59d [Tathagata Das] Revert unnecessary change 150e58c [Tathagata Das] Added StreamingContext.getActiveOrCreate() in Python Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5b8bb1b2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5b8bb1b2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5b8bb1b2 Branch: refs/heads/master Commit: 5b8bb1b213b8738f563fcd00747604410fbb3087 Parents: dbd778d Author: Tathagata Das tathagata.das1...@gmail.com Authored: Tue Aug 11 12:02:28 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Tue Aug 11 12:02:28 2015 -0700 -- python/pyspark/streaming/context.py | 57 - python/pyspark/streaming/tests.py | 133 --- python/run-tests.py | 2 +- 3 files changed, 177 insertions(+), 15 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5b8bb1b2/python/pyspark/streaming/context.py -- diff --git a/python/pyspark/streaming/context.py b/python/pyspark/streaming/context.py index ac5ba69..e3ba70e 100644 --- a/python/pyspark/streaming/context.py +++ b/python/pyspark/streaming/context.py @@ -86,6 +86,9 @@ class StreamingContext(object): _transformerSerializer = None +# Reference to a currently active StreamingContext +_activeContext = None + def __init__(self, sparkContext, batchDuration=None, jssc=None): """ Create a new StreamingContext. @@ -142,10 +145,10 @@ class StreamingContext(object): Either recreate a StreamingContext from checkpoint data or create a new StreamingContext. If checkpoint data exists in the provided `checkpointPath`, then StreamingContext will be recreated from the checkpoint data. If the data does not exist, then the provided setupFunc -will be used to create a JavaStreamingContext. +will be used to create a new context. -@param checkpointPath: Checkpoint directory used in an earlier JavaStreamingContext program -@param setupFunc: Function to create a new JavaStreamingContext and setup DStreams +@param checkpointPath: Checkpoint directory used in an earlier streaming program +@param setupFunc: Function to create a new context and setup DStreams # TODO: support checkpoint in HDFS if not os.path.exists(checkpointPath) or not os.listdir(checkpointPath): @@ -170,6 +173,52 @@ class StreamingContext(object): cls._transformerSerializer.ctx = sc return StreamingContext(sc, None, jssc) +@classmethod +def getActive(cls): +""" +Return either the currently active StreamingContext (i.e., if there is a context started +but not stopped) or None.
+""" +activePythonContext = cls._activeContext +if activePythonContext is not None: +# Verify that the current running Java StreamingContext is active and is the same one +# backing the supposedly active Python context +activePythonContextJavaId = activePythonContext._jssc.ssc().hashCode() +activeJvmContextOption = activePythonContext._jvm.StreamingContext.getActive() + +if activeJvmContextOption.isEmpty(): +cls._activeContext = None +elif activeJvmContextOption.get().hashCode() != activePythonContextJavaId: +cls._activeContext = None +raise Exception("JVM's active JavaStreamingContext is not the JavaStreamingContext " +"backing the action Python StreamingContext. This is unexpected.") +return cls._activeContext + +@classmethod +def getActiveOrCreate(cls, checkpointPath, setupFunc): +""" +Either return the active StreamingContext (i.e. currently started but not stopped
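The Scala side exposes the same pattern, which may help when reading the Python port above. A sketch of typical usage, assuming StreamingContext.getActiveOrCreate(checkpointPath, creatingFunc) as available in Spark 1.5; the checkpoint directory, app name, and batch interval are placeholders.

    import org.apache.spark.SparkConf
    import org.apache.spark.streaming.{Seconds, StreamingContext}

    object GetActiveOrCreateExample {
      val checkpointDir = "/tmp/streaming-checkpoint" // example path

      def createContext(): StreamingContext = {
        val conf = new SparkConf().setAppName("GetActiveOrCreateExample")
        val ssc = new StreamingContext(conf, Seconds(1))
        ssc.checkpoint(checkpointDir)
        // ... set up DStreams here before returning ...
        ssc
      }

      def main(args: Array[String]): Unit = {
        // Returns the active context if one is running, otherwise recreates it
        // from checkpoint data, otherwise calls createContext().
        val ssc = StreamingContext.getActiveOrCreate(checkpointDir, createContext _)
        ssc.start()
        ssc.awaitTermination()
      }
    }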
spark git commit: [SPARK-9572] [STREAMING] [PYSPARK] Added StreamingContext.getActiveOrCreate() in Python
Repository: spark Updated Branches: refs/heads/branch-1.5 b077f36ea -> 71460b889 [SPARK-9572] [STREAMING] [PYSPARK] Added StreamingContext.getActiveOrCreate() in Python Author: Tathagata Das tathagata.das1...@gmail.com Closes #8080 from tdas/SPARK-9572 and squashes the following commits: 64a231d [Tathagata Das] Fix based on comments 741a0d0 [Tathagata Das] Fixed style f4f094c [Tathagata Das] Tweaked test 9afcdbe [Tathagata Das] Merge remote-tracking branch 'apache-github/master' into SPARK-9572 e21488d [Tathagata Das] Minor update 1a371d9 [Tathagata Das] Addressed comments. 60479da [Tathagata Das] Fixed indent 9c2da9c [Tathagata Das] Fixed bugs b5bd32c [Tathagata Das] Merge remote-tracking branch 'apache-github/master' into SPARK-9572 b55b348 [Tathagata Das] Removed prints 5781728 [Tathagata Das] Fix style issues b711214 [Tathagata Das] Reverted run-tests.py 643b59d [Tathagata Das] Revert unnecessary change 150e58c [Tathagata Das] Added StreamingContext.getActiveOrCreate() in Python (cherry picked from commit 5b8bb1b213b8738f563fcd00747604410fbb3087) Signed-off-by: Tathagata Das tathagata.das1...@gmail.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/71460b88 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/71460b88 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/71460b88 Branch: refs/heads/branch-1.5 Commit: 71460b889b4fd7345706a84e26132c216625df95 Parents: b077f36 Author: Tathagata Das tathagata.das1...@gmail.com Authored: Tue Aug 11 12:02:28 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Tue Aug 11 12:02:44 2015 -0700 -- python/pyspark/streaming/context.py | 57 - python/pyspark/streaming/tests.py | 133 --- python/run-tests.py | 2 +- 3 files changed, 177 insertions(+), 15 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/71460b88/python/pyspark/streaming/context.py -- diff --git a/python/pyspark/streaming/context.py b/python/pyspark/streaming/context.py index ac5ba69..e3ba70e 100644 --- a/python/pyspark/streaming/context.py +++ b/python/pyspark/streaming/context.py @@ -86,6 +86,9 @@ class StreamingContext(object): _transformerSerializer = None +# Reference to a currently active StreamingContext +_activeContext = None + def __init__(self, sparkContext, batchDuration=None, jssc=None): """ Create a new StreamingContext. @@ -142,10 +145,10 @@ class StreamingContext(object): Either recreate a StreamingContext from checkpoint data or create a new StreamingContext. If checkpoint data exists in the provided `checkpointPath`, then StreamingContext will be recreated from the checkpoint data. If the data does not exist, then the provided setupFunc -will be used to create a JavaStreamingContext. +will be used to create a new context. -@param checkpointPath: Checkpoint directory used in an earlier JavaStreamingContext program -@param setupFunc: Function to create a new JavaStreamingContext and setup DStreams +@param checkpointPath: Checkpoint directory used in an earlier streaming program +@param setupFunc: Function to create a new context and setup DStreams # TODO: support checkpoint in HDFS if not os.path.exists(checkpointPath) or not os.listdir(checkpointPath): @@ -170,6 +173,52 @@ class StreamingContext(object): cls._transformerSerializer.ctx = sc return StreamingContext(sc, None, jssc) +@classmethod +def getActive(cls): +""" +Return either the currently active StreamingContext (i.e., if there is a context started +but not stopped) or None.
+""" +activePythonContext = cls._activeContext +if activePythonContext is not None: +# Verify that the current running Java StreamingContext is active and is the same one +# backing the supposedly active Python context +activePythonContextJavaId = activePythonContext._jssc.ssc().hashCode() +activeJvmContextOption = activePythonContext._jvm.StreamingContext.getActive() + +if activeJvmContextOption.isEmpty(): +cls._activeContext = None +elif activeJvmContextOption.get().hashCode() != activePythonContextJavaId: +cls._activeContext = None +raise Exception("JVM's active JavaStreamingContext is not the JavaStreamingContext " +"backing the action Python StreamingContext. This is unexpected.") +return cls._activeContext + +@classmethod +def getActiveOrCreate(cls
spark git commit: [SPARK-5155] [PYSPARK] [STREAMING] Mqtt streaming support in Python
Repository: spark Updated Branches: refs/heads/master c4fd2a242 -> 853809e94 [SPARK-5155] [PYSPARK] [STREAMING] Mqtt streaming support in Python This PR is based on #4229, thanks prabeesh. Closes #4229 Author: Prabeesh K prabsma...@gmail.com Author: zsxwing zsxw...@gmail.com Author: prabs prabsma...@gmail.com Author: Prabeesh K prabees...@namshi.com Closes #7833 from zsxwing/pr4229 and squashes the following commits: 9570bec [zsxwing] Fix the variable name and check null in finally 4a9c79e [zsxwing] Fix pom.xml indentation abf5f18 [zsxwing] Merge branch 'master' into pr4229 935615c [zsxwing] Fix the flaky MQTT tests 47278c5 [zsxwing] Include the project class files 478f844 [zsxwing] Add unpack 5f8a1d4 [zsxwing] Make the maven build generate the test jar for Python MQTT tests 734db99 [zsxwing] Merge branch 'master' into pr4229 126608a [Prabeesh K] address the comments b90b709 [Prabeesh K] Merge pull request #1 from zsxwing/pr4229 d07f454 [zsxwing] Register StreamingListerner before starting StreamingContext; Revert unncessary changes; fix the python unit test a6747cb [Prabeesh K] wait for starting the receiver before publishing data 87fc677 [Prabeesh K] address the comments: 97244ec [zsxwing] Make sbt build the assembly test jar for streaming mqtt 80474d1 [Prabeesh K] fix 1f0cfe9 [Prabeesh K] python style fix e1ee016 [Prabeesh K] scala style fix a5a8f9f [Prabeesh K] added Python test 9767d82 [Prabeesh K] implemented Python-friendly class a11968b [Prabeesh K] fixed python style 795ec27 [Prabeesh K] address comments ee387ae [Prabeesh K] Fix assembly jar location of mqtt-assembly 3f4df12 [Prabeesh K] updated version b34c3c1 [prabs] adress comments 3aa7fff [prabs] Added Python streaming mqtt word count example b7d42ff [prabs] Mqtt streaming support in Python Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/853809e9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/853809e9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/853809e9 Branch: refs/heads/master Commit: 853809e948e7c5092643587a30738115b6591a59 Parents: c4fd2a2 Author: Prabeesh K prabsma...@gmail.com Authored: Mon Aug 10 16:33:23 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Mon Aug 10 16:33:23 2015 -0700 -- dev/run-tests.py| 2 + dev/sparktestsupport/modules.py | 2 + docs/streaming-programming-guide.md | 2 +- .../src/main/python/streaming/mqtt_wordcount.py | 58 + external/mqtt-assembly/pom.xml | 102 external/mqtt/pom.xml | 28 + external/mqtt/src/main/assembly/assembly.xml| 44 +++ .../apache/spark/streaming/mqtt/MQTTUtils.scala | 16 +++ .../spark/streaming/mqtt/MQTTStreamSuite.scala | 118 +++ .../spark/streaming/mqtt/MQTTTestUtils.scala| 111 + pom.xml | 1 + project/SparkBuild.scala| 12 +- python/pyspark/streaming/mqtt.py| 72 +++ python/pyspark/streaming/tests.py | 106 - 14 files changed, 565 insertions(+), 109 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/853809e9/dev/run-tests.py -- diff --git a/dev/run-tests.py b/dev/run-tests.py index d1852b9..f689425 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -303,6 +303,8 @@ def build_spark_sbt(hadoop_version): "assembly/assembly", "streaming-kafka-assembly/assembly", "streaming-flume-assembly/assembly", + "streaming-mqtt-assembly/assembly", + "streaming-mqtt/test:assembly", "streaming-kinesis-asl-assembly/assembly"] profiles_and_goals = build_profiles + sbt_goals http://git-wip-us.apache.org/repos/asf/spark/blob/853809e9/dev/sparktestsupport/modules.py -- diff --git
a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index a9717ff..d82c0cc 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -181,6 +181,7 @@ streaming_mqtt = Module( dependencies=[streaming], source_file_regexes=[ "external/mqtt", +"external/mqtt-assembly", ], sbt_test_goals=[ "streaming-mqtt/test", @@ -306,6 +307,7 @@ pyspark_streaming = Module( streaming, streaming_kafka, streaming_flume_assembly, +streaming_mqtt, streaming_kinesis_asl ], source_file_regexes=[ http://git-wip-us.apache.org/repos/asf/spark/blob/853809e9/docs/streaming-programming-guide.md
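For context, the Scala equivalent of the word count being ported to Python here is short. A sketch using MQTTUtils.createStream from the same external/mqtt module; the broker URL and topic are placeholders.

    import org.apache.spark.SparkConf
    import org.apache.spark.streaming.{Seconds, StreamingContext}
    import org.apache.spark.streaming.mqtt.MQTTUtils

    object MQTTWordCountSketch {
      def main(args: Array[String]): Unit = {
        val conf = new SparkConf().setAppName("MQTTWordCountSketch")
        val ssc = new StreamingContext(conf, Seconds(2))
        // Each record is the payload of one MQTT message on the topic
        val lines = MQTTUtils.createStream(ssc, "tcp://localhost:1883", "foo")
        val counts = lines.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)
        counts.print()
        ssc.start()
        ssc.awaitTermination()
      }
    }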
spark git commit: [SPARK-5155] [PYSPARK] [STREAMING] Mqtt streaming support in Python
Repository: spark Updated Branches: refs/heads/branch-1.5 51406becc -> 8f4014fda [SPARK-5155] [PYSPARK] [STREAMING] Mqtt streaming support in Python This PR is based on #4229, thanks prabeesh. Closes #4229 Author: Prabeesh K prabsma...@gmail.com Author: zsxwing zsxw...@gmail.com Author: prabs prabsma...@gmail.com Author: Prabeesh K prabees...@namshi.com Closes #7833 from zsxwing/pr4229 and squashes the following commits: 9570bec [zsxwing] Fix the variable name and check null in finally 4a9c79e [zsxwing] Fix pom.xml indentation abf5f18 [zsxwing] Merge branch 'master' into pr4229 935615c [zsxwing] Fix the flaky MQTT tests 47278c5 [zsxwing] Include the project class files 478f844 [zsxwing] Add unpack 5f8a1d4 [zsxwing] Make the maven build generate the test jar for Python MQTT tests 734db99 [zsxwing] Merge branch 'master' into pr4229 126608a [Prabeesh K] address the comments b90b709 [Prabeesh K] Merge pull request #1 from zsxwing/pr4229 d07f454 [zsxwing] Register StreamingListerner before starting StreamingContext; Revert unncessary changes; fix the python unit test a6747cb [Prabeesh K] wait for starting the receiver before publishing data 87fc677 [Prabeesh K] address the comments: 97244ec [zsxwing] Make sbt build the assembly test jar for streaming mqtt 80474d1 [Prabeesh K] fix 1f0cfe9 [Prabeesh K] python style fix e1ee016 [Prabeesh K] scala style fix a5a8f9f [Prabeesh K] added Python test 9767d82 [Prabeesh K] implemented Python-friendly class a11968b [Prabeesh K] fixed python style 795ec27 [Prabeesh K] address comments ee387ae [Prabeesh K] Fix assembly jar location of mqtt-assembly 3f4df12 [Prabeesh K] updated version b34c3c1 [prabs] adress comments 3aa7fff [prabs] Added Python streaming mqtt word count example b7d42ff [prabs] Mqtt streaming support in Python (cherry picked from commit 853809e948e7c5092643587a30738115b6591a59) Signed-off-by: Tathagata Das tathagata.das1...@gmail.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8f4014fd Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8f4014fd Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8f4014fd Branch: refs/heads/branch-1.5 Commit: 8f4014fdaf22dd8a3bd4728987c76c11d79e07d9 Parents: 51406be Author: Prabeesh K prabsma...@gmail.com Authored: Mon Aug 10 16:33:23 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Mon Aug 10 16:33:34 2015 -0700 -- dev/run-tests.py| 2 + dev/sparktestsupport/modules.py | 2 + docs/streaming-programming-guide.md | 2 +- .../src/main/python/streaming/mqtt_wordcount.py | 58 + external/mqtt-assembly/pom.xml | 102 external/mqtt/pom.xml | 28 + external/mqtt/src/main/assembly/assembly.xml| 44 +++ .../apache/spark/streaming/mqtt/MQTTUtils.scala | 16 +++ .../spark/streaming/mqtt/MQTTStreamSuite.scala | 118 +++ .../spark/streaming/mqtt/MQTTTestUtils.scala| 111 + pom.xml | 1 + project/SparkBuild.scala| 12 +- python/pyspark/streaming/mqtt.py| 72 +++ python/pyspark/streaming/tests.py | 106 - 14 files changed, 565 insertions(+), 109 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8f4014fd/dev/run-tests.py -- diff --git a/dev/run-tests.py b/dev/run-tests.py index d1852b9..f689425 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -303,6 +303,8 @@ def build_spark_sbt(hadoop_version): "assembly/assembly", "streaming-kafka-assembly/assembly", "streaming-flume-assembly/assembly", + "streaming-mqtt-assembly/assembly", + "streaming-mqtt/test:assembly", "streaming-kinesis-asl-assembly/assembly"] profiles_and_goals =
build_profiles + sbt_goals http://git-wip-us.apache.org/repos/asf/spark/blob/8f4014fd/dev/sparktestsupport/modules.py -- diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index a9717ff..d82c0cc 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -181,6 +181,7 @@ streaming_mqtt = Module( dependencies=[streaming], source_file_regexes=[ "external/mqtt", +"external/mqtt-assembly", ], sbt_test_goals=[ "streaming-mqtt/test", @@ -306,6 +307,7 @@ pyspark_streaming = Module( streaming, streaming_kafka, streaming_flume_assembly, +streaming_mqtt, streaming_kinesis_asl
spark git commit: [SPARK-9801] [STREAMING] Check if file exists before deleting temporary files.
Repository: spark Updated Branches: refs/heads/branch-1.4 4b5bbc589 -> 6dde38026 [SPARK-9801] [STREAMING] Check if file exists before deleting temporary files. Spark streaming deletes the temp file and backup files without checking if they exist or not Author: Hao Zhu viadea...@gmail.com Closes #8082 from viadea/master and squashes the following commits: 242d05f [Hao Zhu] [SPARK-9801][Streaming]No need to check the existence of those files fd143f2 [Hao Zhu] [SPARK-9801][Streaming]Check if backupFile exists before deleting backupFile files. 087daf0 [Hao Zhu] SPARK-9801 (cherry picked from commit 3c9802d9400bea802984456683b2736a450ee17e) Signed-off-by: Tathagata Das tathagata.das1...@gmail.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6dde3802 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6dde3802 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6dde3802 Branch: refs/heads/branch-1.4 Commit: 6dde38026113d8f83190e801a0f889c53bbc316d Parents: 4b5bbc5 Author: Hao Zhu viadea...@gmail.com Authored: Mon Aug 10 17:17:22 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Mon Aug 10 17:17:47 2015 -0700 -- .../main/scala/org/apache/spark/streaming/Checkpoint.scala | 8 ++-- 1 file changed, 6 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6dde3802/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala b/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala index 5279331..bd117ed 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala @@ -189,7 +189,9 @@ class CheckpointWriter( + "'") // Write checkpoint to temp file - fs.delete(tempFile, true) // just in case it exists + if (fs.exists(tempFile)) { +fs.delete(tempFile, true) // just in case it exists + } val fos = fs.create(tempFile) Utils.tryWithSafeFinally { fos.write(bytes) @@ -200,7 +202,9 @@ class CheckpointWriter( // If the checkpoint file exists, back it up // If the backup exists as well, just delete it, otherwise rename will fail if (fs.exists(checkpointFile)) { -fs.delete(backupFile, true) // just in case it exists +if (fs.exists(backupFile)){ + fs.delete(backupFile, true) // just in case it exists +} if (!fs.rename(checkpointFile, backupFile)) { logWarning("Could not rename " + checkpointFile + " to " + backupFile) }
spark git commit: [SPARK-9801] [STREAMING] Check if file exists before deleting temporary files.
Repository: spark Updated Branches: refs/heads/branch-1.5 8f4014fda -> 94692bb14 [SPARK-9801] [STREAMING] Check if file exists before deleting temporary files. Spark streaming deletes the temp file and backup files without checking if they exist or not Author: Hao Zhu viadea...@gmail.com Closes #8082 from viadea/master and squashes the following commits: 242d05f [Hao Zhu] [SPARK-9801][Streaming]No need to check the existence of those files fd143f2 [Hao Zhu] [SPARK-9801][Streaming]Check if backupFile exists before deleting backupFile files. 087daf0 [Hao Zhu] SPARK-9801 (cherry picked from commit 3c9802d9400bea802984456683b2736a450ee17e) Signed-off-by: Tathagata Das tathagata.das1...@gmail.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/94692bb1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/94692bb1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/94692bb1 Branch: refs/heads/branch-1.5 Commit: 94692bb14f75b814aab00bc43f15550e26ada6f1 Parents: 8f4014f Author: Hao Zhu viadea...@gmail.com Authored: Mon Aug 10 17:17:22 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Mon Aug 10 17:17:33 2015 -0700 -- .../main/scala/org/apache/spark/streaming/Checkpoint.scala | 8 ++-- 1 file changed, 6 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/94692bb1/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala b/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala index 2780d5b..6f6b449 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala @@ -192,7 +192,9 @@ class CheckpointWriter( + "'") // Write checkpoint to temp file - fs.delete(tempFile, true) // just in case it exists + if (fs.exists(tempFile)) { +fs.delete(tempFile, true) // just in case it exists + } val fos = fs.create(tempFile) Utils.tryWithSafeFinally { fos.write(bytes) @@ -203,7 +205,9 @@ class CheckpointWriter( // If the checkpoint file exists, back it up // If the backup exists as well, just delete it, otherwise rename will fail if (fs.exists(checkpointFile)) { -fs.delete(backupFile, true) // just in case it exists +if (fs.exists(backupFile)){ + fs.delete(backupFile, true) // just in case it exists +} if (!fs.rename(checkpointFile, backupFile)) { logWarning("Could not rename " + checkpointFile + " to " + backupFile) }
spark git commit: [SPARK-9801] [STREAMING] Check if file exists before deleting temporary files.
Repository: spark Updated Branches: refs/heads/master 853809e94 -> 3c9802d94 [SPARK-9801] [STREAMING] Check if file exists before deleting temporary files. Spark streaming deletes the temp file and backup files without checking if they exist or not Author: Hao Zhu viadea...@gmail.com Closes #8082 from viadea/master and squashes the following commits: 242d05f [Hao Zhu] [SPARK-9801][Streaming]No need to check the existence of those files fd143f2 [Hao Zhu] [SPARK-9801][Streaming]Check if backupFile exists before deleting backupFile files. 087daf0 [Hao Zhu] SPARK-9801 Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3c9802d9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3c9802d9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3c9802d9 Branch: refs/heads/master Commit: 3c9802d9400bea802984456683b2736a450ee17e Parents: 853809e Author: Hao Zhu viadea...@gmail.com Authored: Mon Aug 10 17:17:22 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Mon Aug 10 17:17:22 2015 -0700 -- .../main/scala/org/apache/spark/streaming/Checkpoint.scala | 8 ++-- 1 file changed, 6 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3c9802d9/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala b/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala index 2780d5b..6f6b449 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala @@ -192,7 +192,9 @@ class CheckpointWriter( + "'") // Write checkpoint to temp file - fs.delete(tempFile, true) // just in case it exists + if (fs.exists(tempFile)) { +fs.delete(tempFile, true) // just in case it exists + } val fos = fs.create(tempFile) Utils.tryWithSafeFinally { fos.write(bytes) @@ -203,7 +205,9 @@ class CheckpointWriter( // If the checkpoint file exists, back it up // If the backup exists as well, just delete it, otherwise rename will fail if (fs.exists(checkpointFile)) { -fs.delete(backupFile, true) // just in case it exists +if (fs.exists(backupFile)){ + fs.delete(backupFile, true) // just in case it exists +} if (!fs.rename(checkpointFile, backupFile)) { logWarning("Could not rename " + checkpointFile + " to " + backupFile) }
spark git commit: [SPARK-9801] [STREAMING] Check if file exists before deleting temporary files.
Repository: spark Updated Branches: refs/heads/branch-1.3 b104501d3 -> a98603f8c [SPARK-9801] [STREAMING] Check if file exists before deleting temporary files. Spark streaming deletes the temp file and backup files without checking if they exist or not Author: Hao Zhu viadea...@gmail.com Closes #8082 from viadea/master and squashes the following commits: 242d05f [Hao Zhu] [SPARK-9801][Streaming]No need to check the existence of those files fd143f2 [Hao Zhu] [SPARK-9801][Streaming]Check if backupFile exists before deleting backupFile files. 087daf0 [Hao Zhu] SPARK-9801 (cherry picked from commit 3c9802d9400bea802984456683b2736a450ee17e) Signed-off-by: Tathagata Das tathagata.das1...@gmail.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a98603f8 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a98603f8 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a98603f8 Branch: refs/heads/branch-1.3 Commit: a98603f8c118fcd23efe80ebaa120e47e9785d46 Parents: b104501 Author: Hao Zhu viadea...@gmail.com Authored: Mon Aug 10 17:17:22 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Mon Aug 10 17:18:03 2015 -0700 -- .../main/scala/org/apache/spark/streaming/Checkpoint.scala | 8 ++-- 1 file changed, 6 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a98603f8/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala b/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala index 832ce78..c1d0fe4 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala @@ -137,7 +137,9 @@ class CheckpointWriter( + "'") // Write checkpoint to temp file - fs.delete(tempFile, true) // just in case it exists + if (fs.exists(tempFile)) { +fs.delete(tempFile, true) // just in case it exists + } val fos = fs.create(tempFile) fos.write(bytes) fos.close() @@ -145,7 +147,9 @@ class CheckpointWriter( // If the checkpoint file exists, back it up // If the backup exists as well, just delete it, otherwise rename will fail if (fs.exists(checkpointFile)) { -fs.delete(backupFile, true) // just in case it exists +if (fs.exists(backupFile)){ + fs.delete(backupFile, true) // just in case it exists +} if (!fs.rename(checkpointFile, backupFile)) { logWarning("Could not rename " + checkpointFile + " to " + backupFile) }
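Taken together, the guarded write-and-backup sequence looks like the sketch below, using the Hadoop FileSystem API directly. The helper and paths are illustrative; the point of the guards is simply that delete and rename are only attempted on paths known to exist.

    import org.apache.hadoop.fs.{FileSystem, Path}

    object CheckpointBackupSketch {
      def backupAndWrite(fs: FileSystem, checkpointFile: Path, backupFile: Path,
                         tempFile: Path, bytes: Array[Byte]): Unit = {
        if (fs.exists(tempFile)) {
          fs.delete(tempFile, true)          // clear a stale temp file, if any
        }
        val out = fs.create(tempFile)
        try out.write(bytes) finally out.close()

        if (fs.exists(checkpointFile)) {     // back up the previous checkpoint
          if (fs.exists(backupFile)) {
            fs.delete(backupFile, true)      // rename fails if the target exists
          }
          if (!fs.rename(checkpointFile, backupFile)) {
            println(s"Could not rename $checkpointFile to $backupFile")
          }
        }
        fs.rename(tempFile, checkpointFile)  // publish the new checkpoint
      }
    }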
spark git commit: [SPARK-8978] [STREAMING] Implements the DirectKafkaRateController
Repository: spark Updated Branches: refs/heads/branch-1.5 8a7956283 -> 8b00c0690 [SPARK-8978] [STREAMING] Implements the DirectKafkaRateController Author: Dean Wampler d...@concurrentthought.com Author: Nilanjan Raychaudhuri nraychaudh...@gmail.com Author: François Garillot franc...@garillot.net Closes #7796 from dragos/topic/streaming-bp/kafka-direct and squashes the following commits: 50d1f21 [Nilanjan Raychaudhuri] Taking care of the remaining nits 648c8b1 [Dean Wampler] Refactored rate controller test to be more predictable and run faster. e43f678 [Nilanjan Raychaudhuri] fixing doc and nits ce19d2a [Dean Wampler] Removing an unreliable assertion. 9615320 [Dean Wampler] Give me a break... 6372478 [Dean Wampler] Found a few ways to make this test more robust... 9e69e37 [Dean Wampler] Attempt to fix flakey test that fails in CI, but not locally :( d3db1ea [Dean Wampler] Fixing stylecheck errors. d04a288 [Nilanjan Raychaudhuri] adding test to make sure rate controller is used to calculate maxMessagesPerPartition b6ecb67 [Nilanjan Raychaudhuri] Fixed styling issue 3110267 [Nilanjan Raychaudhuri] [SPARK-8978][Streaming] Implements the DirectKafkaRateController 393c580 [François Garillot] [SPARK-8978][Streaming] Implements the DirectKafkaRateController 51e78c6 [Nilanjan Raychaudhuri] Rename and fix build failure 2795509 [Nilanjan Raychaudhuri] Added missing RateController 19200f5 [Dean Wampler] Removed usage of infix notation. Changed a private variable name to be more consistent with usage. aa4a70b [François Garillot] [SPARK-8978][Streaming] Implements the DirectKafkaController (cherry picked from commit a1bbf1bc5c51cd796015ac159799cf024de6fa07) Signed-off-by: Tathagata Das tathagata.das1...@gmail.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8b00c069 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8b00c069 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8b00c069 Branch: refs/heads/branch-1.5 Commit: 8b00c06907edfbe57801edcc1035ea0422f78ee3 Parents: 8a79562 Author: Nilanjan Raychaudhuri nraychaudh...@gmail.com Authored: Thu Aug 6 12:50:08 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Thu Aug 6 12:53:25 2015 -0700 -- .../kafka/DirectKafkaInputDStream.scala | 47 +-- .../kafka/DirectKafkaStreamSuite.scala | 89 2 files changed, 127 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8b00c069/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala -- diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala index 48a1933..8a17707 100644 --- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala +++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala @@ -29,7 +29,8 @@ import org.apache.spark.{Logging, SparkException} import org.apache.spark.streaming.{StreamingContext, Time} import org.apache.spark.streaming.dstream._ import org.apache.spark.streaming.kafka.KafkaCluster.LeaderOffset -import org.apache.spark.streaming.scheduler.StreamInputInfo +import org.apache.spark.streaming.scheduler.{RateController, StreamInputInfo} +import org.apache.spark.streaming.scheduler.rate.RateEstimator /** * A stream of {@link org.apache.spark.streaming.kafka.KafkaRDD} where @@ -61,7 +62,7 @@ class
DirectKafkaInputDStream[ val kafkaParams: Map[String, String], val fromOffsets: Map[TopicAndPartition, Long], messageHandler: MessageAndMetadata[K, V] = R -) extends InputDStream[R](ssc_) with Logging { + ) extends InputDStream[R](ssc_) with Logging { val maxRetries = context.sparkContext.getConf.getInt( spark.streaming.kafka.maxRetries, 1) @@ -71,14 +72,35 @@ class DirectKafkaInputDStream[ protected[streaming] override val checkpointData = new DirectKafkaInputDStreamCheckpointData + + /** + * Asynchronously maintains sends new rate limits to the receiver through the receiver tracker. + */ + override protected[streaming] val rateController: Option[RateController] = { +if (RateController.isBackPressureEnabled(ssc.conf)) { + Some(new DirectKafkaRateController(id, +RateEstimator.create(ssc.conf, ssc_.graph.batchDuration))) +} else { + None +} + } + protected val kc = new KafkaCluster(kafkaParams) - protected val maxMessagesPerPartition: Option[Long] = { -val ratePerSec = context.sparkContext.getConf.getInt( + private val maxRateLimitPerPartition: Int =
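The controller's output feeds `maxMessagesPerPartition`: the latest rate estimate is split across partitions, clamped by the static `spark.streaming.kafka.maxRatePerPartition` setting, and scaled by the batch duration. A standalone sketch of that clamping arithmetic (hypothetical names and structure; not the actual Spark code):

```
// Sketch of the per-partition cap computation. All rates are records/second.
object RateCapSketch {
  def maxMessagesPerPartition(
      estimatedRate: Option[Long], // latest estimate from the rate controller
      maxRatePerPartition: Int,    // spark.streaming.kafka.maxRatePerPartition (0 = unlimited)
      numPartitions: Int,
      batchDurationMs: Long): Option[Long] = {
    // Per-partition budget: the estimate split across partitions, clamped by
    // the static limit; fall back to the static limit if there is no estimate.
    val perPartitionRate: Long = estimatedRate.filter(_ > 0) match {
      case Some(estimate) if maxRatePerPartition > 0 =>
        math.min(maxRatePerPartition.toLong, estimate / numPartitions)
      case Some(estimate) => estimate / numPartitions
      case None => maxRatePerPartition.toLong
    }
    if (perPartitionRate > 0) {
      Some(perPartitionRate * batchDurationMs / 1000) // messages per partition per batch
    } else {
      None // neither a limit nor an estimate: leave the batch unbounded
    }
  }

  def main(args: Array[String]): Unit = {
    // 2s batches, 4 partitions, 10000 msg/s/partition cap, 12000 msg/s estimated total.
    println(maxMessagesPerPartition(Some(12000L), 10000, 4, 2000)) // Some(6000)
  }
}
```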
spark git commit: [SPARK-8978] [STREAMING] Implements the DirectKafkaRateController
Repository: spark Updated Branches: refs/heads/master 0d7aac99d -> a1bbf1bc5 [SPARK-8978] [STREAMING] Implements the DirectKafkaRateController Author: Dean Wampler d...@concurrentthought.com Author: Nilanjan Raychaudhuri nraychaudh...@gmail.com Author: François Garillot franc...@garillot.net Closes #7796 from dragos/topic/streaming-bp/kafka-direct and squashes the following commits: 50d1f21 [Nilanjan Raychaudhuri] Taking care of the remaining nits 648c8b1 [Dean Wampler] Refactored rate controller test to be more predictable and run faster. e43f678 [Nilanjan Raychaudhuri] fixing doc and nits ce19d2a [Dean Wampler] Removing an unreliable assertion. 9615320 [Dean Wampler] Give me a break... 6372478 [Dean Wampler] Found a few ways to make this test more robust... 9e69e37 [Dean Wampler] Attempt to fix flakey test that fails in CI, but not locally :( d3db1ea [Dean Wampler] Fixing stylecheck errors. d04a288 [Nilanjan Raychaudhuri] adding test to make sure rate controller is used to calculate maxMessagesPerPartition b6ecb67 [Nilanjan Raychaudhuri] Fixed styling issue 3110267 [Nilanjan Raychaudhuri] [SPARK-8978][Streaming] Implements the DirectKafkaRateController 393c580 [François Garillot] [SPARK-8978][Streaming] Implements the DirectKafkaRateController 51e78c6 [Nilanjan Raychaudhuri] Rename and fix build failure 2795509 [Nilanjan Raychaudhuri] Added missing RateController 19200f5 [Dean Wampler] Removed usage of infix notation. Changed a private variable name to be more consistent with usage. aa4a70b [François Garillot] [SPARK-8978][Streaming] Implements the DirectKafkaController Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a1bbf1bc Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a1bbf1bc Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a1bbf1bc Branch: refs/heads/master Commit: a1bbf1bc5c51cd796015ac159799cf024de6fa07 Parents: 0d7aac9 Author: Nilanjan Raychaudhuri nraychaudh...@gmail.com Authored: Thu Aug 6 12:50:08 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Thu Aug 6 12:50:08 2015 -0700 -- .../kafka/DirectKafkaInputDStream.scala | 47 +-- .../kafka/DirectKafkaStreamSuite.scala | 89 2 files changed, 127 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a1bbf1bc/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala -- diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala index 48a1933..8a17707 100644 --- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala +++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala @@ -29,7 +29,8 @@ import org.apache.spark.{Logging, SparkException} import org.apache.spark.streaming.{StreamingContext, Time} import org.apache.spark.streaming.dstream._ import org.apache.spark.streaming.kafka.KafkaCluster.LeaderOffset -import org.apache.spark.streaming.scheduler.StreamInputInfo +import org.apache.spark.streaming.scheduler.{RateController, StreamInputInfo} +import org.apache.spark.streaming.scheduler.rate.RateEstimator /** * A stream of {@link org.apache.spark.streaming.kafka.KafkaRDD} where @@ -61,7 +62,7 @@ class DirectKafkaInputDStream[ val kafkaParams: Map[String, String], val fromOffsets: Map[TopicAndPartition, Long], messageHandler:
MessageAndMetadata[K, V] => R -) extends InputDStream[R](ssc_) with Logging { + ) extends InputDStream[R](ssc_) with Logging { val maxRetries = context.sparkContext.getConf.getInt( "spark.streaming.kafka.maxRetries", 1) @@ -71,14 +72,35 @@ class DirectKafkaInputDStream[ protected[streaming] override val checkpointData = new DirectKafkaInputDStreamCheckpointData + + /** + * Asynchronously maintains & sends new rate limits to the receiver through the receiver tracker. + */ + override protected[streaming] val rateController: Option[RateController] = { +if (RateController.isBackPressureEnabled(ssc.conf)) { + Some(new DirectKafkaRateController(id, +RateEstimator.create(ssc.conf, ssc_.graph.batchDuration))) +} else { + None +} + } + protected val kc = new KafkaCluster(kafkaParams) - protected val maxMessagesPerPartition: Option[Long] = { -val ratePerSec = context.sparkContext.getConf.getInt( + private val maxRateLimitPerPartition: Int = context.sparkContext.getConf.getInt( "spark.streaming.kafka.maxRatePerPartition", 0) -if (ratePerSec > 0) { + protected def
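For users, the feature is driven entirely by configuration. A minimal sketch of the two settings involved, assuming a Spark 1.5-style deployment (the values are illustrative, not recommendations):

```
import org.apache.spark.SparkConf

// Back-pressure is opt-in; these two settings together give the direct Kafka
// stream a dynamic rate with a hard per-partition ceiling.
object BackpressureConf {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      // enables the RateController machinery this commit hooks into
      .set("spark.streaming.backpressure.enabled", "true")
      // static upper bound; the estimator can only push the rate below it
      .set("spark.streaming.kafka.maxRatePerPartition", "10000")
    conf.getAll.filter(_._1.startsWith("spark.streaming")).foreach(println)
  }
}
```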
spark git commit: [DOCS] [STREAMING] make the existing parameter docs for OffsetRange ac…
Repository: spark Updated Branches: refs/heads/branch-1.5 3997dd3fd -> 8ecfb05e3 [DOCS] [STREAMING] make the existing parameter docs for OffsetRange ac… …tually visible Author: cody koeninger c...@koeninger.org Closes #7995 from koeninger/doc-fixes and squashes the following commits: 87af9ea [cody koeninger] [Docs][Streaming] make the existing parameter docs for OffsetRange actually visible (cherry picked from commit 1723e34893f9b087727ea0e5c8b335645f42c295) Signed-off-by: Tathagata Das tathagata.das1...@gmail.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8ecfb05e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8ecfb05e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8ecfb05e Branch: refs/heads/branch-1.5 Commit: 8ecfb05e370bc80cd74bb99dc8dd9459a855312f Parents: 3997dd3 Author: cody koeninger c...@koeninger.org Authored: Thu Aug 6 14:37:25 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Thu Aug 6 14:37:33 2015 -0700 -- .../scala/org/apache/spark/streaming/kafka/OffsetRange.scala | 8 1 file changed, 4 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8ecfb05e/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/OffsetRange.scala -- diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/OffsetRange.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/OffsetRange.scala index f326e7f..2f8981d 100644 --- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/OffsetRange.scala +++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/OffsetRange.scala @@ -42,16 +42,16 @@ trait HasOffsetRanges { * :: Experimental :: * Represents a range of offsets from a single Kafka TopicAndPartition. Instances of this class * can be created with `OffsetRange.create()`. + * @param topic Kafka topic name + * @param partition Kafka partition id + * @param fromOffset Inclusive starting offset + * @param untilOffset Exclusive ending offset */ @Experimental final class OffsetRange private( -/** Kafka topic name */ val topic: String, -/** Kafka partition id */ val partition: Int, -/** inclusive starting offset */ val fromOffset: Long, -/** exclusive ending offset */ val untilOffset: Long) extends Serializable { import OffsetRange.OffsetRangeTuple
spark git commit: [DOCS] [STREAMING] make the existing parameter docs for OffsetRange ac…
Repository: spark Updated Branches: refs/heads/master 0a078303d -> 1723e3489 [DOCS] [STREAMING] make the existing parameter docs for OffsetRange ac… …tually visible Author: cody koeninger c...@koeninger.org Closes #7995 from koeninger/doc-fixes and squashes the following commits: 87af9ea [cody koeninger] [Docs][Streaming] make the existing parameter docs for OffsetRange actually visible Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1723e348 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1723e348 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1723e348 Branch: refs/heads/master Commit: 1723e34893f9b087727ea0e5c8b335645f42c295 Parents: 0a07830 Author: cody koeninger c...@koeninger.org Authored: Thu Aug 6 14:37:25 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Thu Aug 6 14:37:25 2015 -0700 -- .../scala/org/apache/spark/streaming/kafka/OffsetRange.scala | 8 1 file changed, 4 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1723e348/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/OffsetRange.scala -- diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/OffsetRange.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/OffsetRange.scala index f326e7f..2f8981d 100644 --- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/OffsetRange.scala +++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/OffsetRange.scala @@ -42,16 +42,16 @@ trait HasOffsetRanges { * :: Experimental :: * Represents a range of offsets from a single Kafka TopicAndPartition. Instances of this class * can be created with `OffsetRange.create()`. + * @param topic Kafka topic name + * @param partition Kafka partition id + * @param fromOffset Inclusive starting offset + * @param untilOffset Exclusive ending offset */ @Experimental final class OffsetRange private( -/** Kafka topic name */ val topic: String, -/** Kafka partition id */ val partition: Int, -/** inclusive starting offset */ val fromOffset: Long, -/** exclusive ending offset */ val untilOffset: Long) extends Serializable { import OffsetRange.OffsetRangeTuple
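For reference, the now-visible parameters line up with the `OffsetRange.create()` factory named in the scaladoc. A small sketch (hypothetical topic name) showing the inclusive/exclusive convention:

```
import org.apache.spark.streaming.kafka.OffsetRange

object OffsetRangeSketch {
  def main(args: Array[String]): Unit = {
    // fromOffset is inclusive and untilOffset exclusive, so this range covers
    // offsets 100..109 of partition 0 of a hypothetical "events" topic.
    val range = OffsetRange.create("events", 0, 100L, 110L)
    println(s"${range.topic}/${range.partition}: [${range.fromOffset}, ${range.untilOffset}) " +
      s"records=${range.untilOffset - range.fromOffset}")
  }
}
```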
spark git commit: [SPARK-9556] [SPARK-9619] [SPARK-9624] [STREAMING] Make BlockGenerator more robust and make all BlockGenerators subscribe to rate limit updates
Repository: spark Updated Branches: refs/heads/branch-1.5 3137628bc -> 3997dd3fd [SPARK-9556] [SPARK-9619] [SPARK-9624] [STREAMING] Make BlockGenerator more robust and make all BlockGenerators subscribe to rate limit updates In some receivers, instead of using the default `BlockGenerator` in `ReceiverSupervisorImpl`, custom generators with their custom listeners are used for reliability (see [`ReliableKafkaReceiver`](https://github.com/apache/spark/blob/master/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/ReliableKafkaReceiver.scala#L99) and [updated `KinesisReceiver`](https://github.com/apache/spark/pull/7825/files)). These custom generators do not receive rate updates. This PR modifies the code to allow custom `BlockGenerator`s to be created through the `ReceiverSupervisorImpl` so that they can be kept track of and rate updates can be applied. In the process, I did some simplification, and de-flaki-fication of some rate controller related tests. In particular: - Renamed `Receiver.executor` to `Receiver.supervisor` (to match `ReceiverSupervisor`) - Made `RateControllerSuite` faster (by increasing batch interval) and less flaky - Changed a few internal APIs to return the current rate of block generators as Long instead of Option\[Long\] (was inconsistent in places). - Updated existing `ReceiverTrackerSuite` to test that custom block generators get rate updates as well. Author: Tathagata Das tathagata.das1...@gmail.com Closes #7913 from tdas/SPARK-9556 and squashes the following commits: 41d4461 [Tathagata Das] fix scala style eb9fd59 [Tathagata Das] Updated kinesis receiver d24994d [Tathagata Das] Updated BlockGeneratorSuite to use manual clock in BlockGenerator d70608b [Tathagata Das] Updated BlockGenerator with states and proper synchronization f6bd47e [Tathagata Das] Merge remote-tracking branch 'apache-github/master' into SPARK-9556 31da173 [Tathagata Das] Fix bug 12116df [Tathagata Das] Add BlockGeneratorSuite 74bd069 [Tathagata Das] Fix style 989bb5c [Tathagata Das] Made BlockGenerator fail if used after stop, and added better unit tests for it 3ff618c [Tathagata Das] Fix test b40eff8 [Tathagata Das] slight refactoring f0df0f1 [Tathagata Das] Scala style fixes 51759cb [Tathagata Das] Refactored rate controller tests and added the ability to update rate of any custom block generator (cherry picked from commit 0a078303d08ad2bb92b9a8a6969563d75b512290) Signed-off-by: Tathagata Das tathagata.das1...@gmail.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3997dd3f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3997dd3f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3997dd3f Branch: refs/heads/branch-1.5 Commit: 3997dd3fde0f1f67ddc4941921a8ce1449bb44f0 Parents: 3137628 Author: Tathagata Das tathagata.das1...@gmail.com Authored: Thu Aug 6 14:35:30 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Thu Aug 6 14:36:55 2015 -0700 -- .../org/apache/spark/util/ManualClock.scala | 2 +- .../streaming/kafka/ReliableKafkaReceiver.scala | 2 +- .../streaming/kinesis/KinesisReceiver.scala | 2 +- .../streaming/receiver/ActorReceiver.scala | 8 +- .../streaming/receiver/BlockGenerator.scala | 131 +++--- .../spark/streaming/receiver/RateLimiter.scala | 3 +- .../spark/streaming/receiver/Receiver.scala | 52 ++-- .../streaming/receiver/ReceiverSupervisor.scala | 27 +- .../receiver/ReceiverSupervisorImpl.scala | 33 ++- .../spark/streaming/CheckpointSuite.scala | 16 +-
.../apache/spark/streaming/ReceiverSuite.scala | 31 +-- .../receiver/BlockGeneratorSuite.scala | 253 +++ .../scheduler/RateControllerSuite.scala | 64 ++--- .../scheduler/ReceiverTrackerSuite.scala| 129 +- 14 files changed, 534 insertions(+), 219 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3997dd3f/core/src/main/scala/org/apache/spark/util/ManualClock.scala -- diff --git a/core/src/main/scala/org/apache/spark/util/ManualClock.scala b/core/src/main/scala/org/apache/spark/util/ManualClock.scala index 1718554..e7a65d7 100644 --- a/core/src/main/scala/org/apache/spark/util/ManualClock.scala +++ b/core/src/main/scala/org/apache/spark/util/ManualClock.scala @@ -58,7 +58,7 @@ private[spark] class ManualClock(private var time: Long) extends Clock { */ def waitTillTime(targetTime: Long): Long = synchronized { while (time < targetTime) { - wait(100) + wait(10) } getTimeMillis() } http://git-wip-us.apache.org/repos/asf/spark/blob/3997dd3f/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/ReliableKafkaReceiver.scala
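The reliability trick these custom generators enable is the listener callbacks: metadata is collected per record and sealed together with each generated block. The real `BlockGeneratorListener` is `private[streaming]`, so the sketch below mirrors its callback shape with a stand-in trait rather than the actual interface:

```
import scala.collection.mutable

// Stand-in for the private[streaming] BlockGeneratorListener contract.
trait BlockListenerSketch {
  def onAddData(data: Any, metadata: Any): Unit
  def onGenerateBlock(blockId: Long): Unit
  def onPushBlock(blockId: Long, buffer: Seq[Any]): Unit
  def onError(message: String, throwable: Throwable): Unit
}

// Tracks which metadata (e.g. Kafka offsets or Kinesis sequence numbers)
// went into which block: what the reliable receivers use these hooks for.
class TrackingListener extends BlockListenerSketch {
  private val pending = mutable.ArrayBuffer.empty[Any]    // metadata of the open block
  private val byBlock = mutable.Map.empty[Long, Seq[Any]] // sealed block id to its metadata

  def onAddData(data: Any, metadata: Any): Unit =
    synchronized { pending += metadata } // called as each record is buffered

  def onGenerateBlock(blockId: Long): Unit = synchronized {
    byBlock(blockId) = pending.toList    // seal the collected metadata with this block
    pending.clear()
  }

  def onPushBlock(blockId: Long, buffer: Seq[Any]): Unit =
    // here a real receiver would store the block and report the metadata ranges
    println(s"stored block $blockId with metadata ${byBlock.get(blockId)}")

  def onError(message: String, throwable: Throwable): Unit =
    Console.err.println(s"generator error: $message")
}
```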
spark git commit: [SPARK-9556] [SPARK-9619] [SPARK-9624] [STREAMING] Make BlockGenerator more robust and make all BlockGenerators subscribe to rate limit updates
Repository: spark Updated Branches: refs/heads/master 21fdfd7d6 -> 0a078303d [SPARK-9556] [SPARK-9619] [SPARK-9624] [STREAMING] Make BlockGenerator more robust and make all BlockGenerators subscribe to rate limit updates In some receivers, instead of using the default `BlockGenerator` in `ReceiverSupervisorImpl`, custom generators with their custom listeners are used for reliability (see [`ReliableKafkaReceiver`](https://github.com/apache/spark/blob/master/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/ReliableKafkaReceiver.scala#L99) and [updated `KinesisReceiver`](https://github.com/apache/spark/pull/7825/files)). These custom generators do not receive rate updates. This PR modifies the code to allow custom `BlockGenerator`s to be created through the `ReceiverSupervisorImpl` so that they can be kept track of and rate updates can be applied. In the process, I did some simplification, and de-flaki-fication of some rate controller related tests. In particular: - Renamed `Receiver.executor` to `Receiver.supervisor` (to match `ReceiverSupervisor`) - Made `RateControllerSuite` faster (by increasing batch interval) and less flaky - Changed a few internal APIs to return the current rate of block generators as Long instead of Option\[Long\] (was inconsistent in places). - Updated existing `ReceiverTrackerSuite` to test that custom block generators get rate updates as well. Author: Tathagata Das tathagata.das1...@gmail.com Closes #7913 from tdas/SPARK-9556 and squashes the following commits: 41d4461 [Tathagata Das] fix scala style eb9fd59 [Tathagata Das] Updated kinesis receiver d24994d [Tathagata Das] Updated BlockGeneratorSuite to use manual clock in BlockGenerator d70608b [Tathagata Das] Updated BlockGenerator with states and proper synchronization f6bd47e [Tathagata Das] Merge remote-tracking branch 'apache-github/master' into SPARK-9556 31da173 [Tathagata Das] Fix bug 12116df [Tathagata Das] Add BlockGeneratorSuite 74bd069 [Tathagata Das] Fix style 989bb5c [Tathagata Das] Made BlockGenerator fail if used after stop, and added better unit tests for it 3ff618c [Tathagata Das] Fix test b40eff8 [Tathagata Das] slight refactoring f0df0f1 [Tathagata Das] Scala style fixes 51759cb [Tathagata Das] Refactored rate controller tests and added the ability to update rate of any custom block generator Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0a078303 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0a078303 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0a078303 Branch: refs/heads/master Commit: 0a078303d08ad2bb92b9a8a6969563d75b512290 Parents: 21fdfd7 Author: Tathagata Das tathagata.das1...@gmail.com Authored: Thu Aug 6 14:35:30 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Thu Aug 6 14:35:30 2015 -0700 -- .../org/apache/spark/util/ManualClock.scala | 2 +- .../streaming/kafka/ReliableKafkaReceiver.scala | 2 +- .../streaming/kinesis/KinesisReceiver.scala | 2 +- .../streaming/receiver/ActorReceiver.scala | 8 +- .../streaming/receiver/BlockGenerator.scala | 131 +++--- .../spark/streaming/receiver/RateLimiter.scala | 3 +- .../spark/streaming/receiver/Receiver.scala | 52 ++-- .../streaming/receiver/ReceiverSupervisor.scala | 27 +- .../receiver/ReceiverSupervisorImpl.scala | 33 ++- .../spark/streaming/CheckpointSuite.scala | 16 +- .../apache/spark/streaming/ReceiverSuite.scala | 31 +-- .../receiver/BlockGeneratorSuite.scala | 253 +++ .../scheduler/RateControllerSuite.scala | 64
++--- .../scheduler/ReceiverTrackerSuite.scala| 129 +- 14 files changed, 534 insertions(+), 219 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0a078303/core/src/main/scala/org/apache/spark/util/ManualClock.scala -- diff --git a/core/src/main/scala/org/apache/spark/util/ManualClock.scala b/core/src/main/scala/org/apache/spark/util/ManualClock.scala index 1718554..e7a65d7 100644 --- a/core/src/main/scala/org/apache/spark/util/ManualClock.scala +++ b/core/src/main/scala/org/apache/spark/util/ManualClock.scala @@ -58,7 +58,7 @@ private[spark] class ManualClock(private var time: Long) extends Clock { */ def waitTillTime(targetTime: Long): Long = synchronized { while (time < targetTime) { - wait(100) + wait(10) } getTimeMillis() } http://git-wip-us.apache.org/repos/asf/spark/blob/0a078303/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/ReliableKafkaReceiver.scala -- diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka
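The `ManualClock` tweak shortens the polling interval in `waitTillTime`. A self-contained sketch of that clock pattern, with the same poll-under-lock loop the diff touches (hypothetical class names):

```
// Minimal manual clock in the style of org.apache.spark.util.ManualClock:
// time only moves when the test advances it, and waiters poll under the lock.
class ManualClockSketch(private var time: Long = 0L) {
  def getTimeMillis(): Long = synchronized { time }

  def advance(timeToAdd: Long): Unit = synchronized {
    time += timeToAdd
    notifyAll() // wake any thread blocked in waitTillTime
  }

  // Blocks until the clock reaches targetTime; the short wait() bound is a
  // safety net against missed notifications.
  def waitTillTime(targetTime: Long): Long = synchronized {
    while (time < targetTime) {
      wait(10)
    }
    time
  }
}

object ManualClockDemo {
  def main(args: Array[String]): Unit = {
    val clock = new ManualClockSketch()
    val waiter = new Thread(new Runnable {
      def run(): Unit = println("reached " + clock.waitTillTime(100))
    })
    waiter.start()
    Thread.sleep(50)   // let the waiter block first
    clock.advance(100) // releases the waiter
    waiter.join()
  }
}
```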
spark git commit: [SPARK-9639] [STREAMING] Fix a potential NPE in Streaming JobScheduler
Repository: spark Updated Branches: refs/heads/branch-1.5 8ecfb05e3 -> 980687206 [SPARK-9639] [STREAMING] Fix a potential NPE in Streaming JobScheduler Because `JobScheduler.stop(false)` may set `eventLoop` to null while `JobHandler` is running, it's possible that when `post` is called, `eventLoop` happens to be null. This PR fixed this bug and also set the threads in `jobExecutor` to daemon. Author: zsxwing zsxw...@gmail.com Closes #7960 from zsxwing/fix-npe and squashes the following commits: b0864c4 [zsxwing] Fix a potential NPE in Streaming JobScheduler (cherry picked from commit 346209097e88fe79015359e40b49c32cc0bdc439) Signed-off-by: Tathagata Das tathagata.das1...@gmail.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/98068720 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/98068720 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/98068720 Branch: refs/heads/branch-1.5 Commit: 9806872065b97df524bd631467105219b37f79f3 Parents: 8ecfb05 Author: zsxwing zsxw...@gmail.com Authored: Thu Aug 6 14:39:36 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Thu Aug 6 14:39:48 2015 -0700 -- .../streaming/scheduler/JobScheduler.scala | 32 ++-- 1 file changed, 22 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/98068720/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala index 7e73556..6d4cdc4 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala @@ -17,7 +17,7 @@ package org.apache.spark.streaming.scheduler -import java.util.concurrent.{TimeUnit, ConcurrentHashMap, Executors} +import java.util.concurrent.{ConcurrentHashMap, TimeUnit} import scala.collection.JavaConversions._ import scala.util.{Failure, Success} @@ -25,7 +25,7 @@ import scala.util.{Failure, Success} import org.apache.spark.Logging import org.apache.spark.rdd.PairRDDFunctions import org.apache.spark.streaming._ -import org.apache.spark.util.EventLoop +import org.apache.spark.util.{EventLoop, ThreadUtils} private[scheduler] sealed trait JobSchedulerEvent @@ -44,7 +44,8 @@ class JobScheduler(val ssc: StreamingContext) extends Logging { // https://gist.github.com/AlainODea/1375759b8720a3f9f094 private val jobSets: java.util.Map[Time, JobSet] = new ConcurrentHashMap[Time, JobSet] private val numConcurrentJobs = ssc.conf.getInt("spark.streaming.concurrentJobs", 1) - private val jobExecutor = Executors.newFixedThreadPool(numConcurrentJobs) + private val jobExecutor = +ThreadUtils.newDaemonFixedThreadPool(numConcurrentJobs, "streaming-job-executor") private val jobGenerator = new JobGenerator(this) val clock = jobGenerator.clock val listenerBus = new StreamingListenerBus() @@ -193,14 +194,25 @@ class JobScheduler(val ssc: StreamingContext) extends Logging { ssc.sc.setLocalProperty(JobScheduler.BATCH_TIME_PROPERTY_KEY, job.time.milliseconds.toString) ssc.sc.setLocalProperty(JobScheduler.OUTPUT_OP_ID_PROPERTY_KEY, job.outputOpId.toString) try { -eventLoop.post(JobStarted(job)) -// Disable checks for existing output directories in jobs launched by the streaming -// scheduler, since we may need to write output to an existing directory during checkpoint -// recovery; see
SPARK-4835 for more details. -PairRDDFunctions.disableOutputSpecValidation.withValue(true) { - job.run() +// We need to assign `eventLoop` to a temp variable. Otherwise, because +// `JobScheduler.stop(false)` may set `eventLoop` to null when this method is running, then +// it's possible that when `post` is called, `eventLoop` happens to be null. +var _eventLoop = eventLoop +if (_eventLoop != null) { + _eventLoop.post(JobStarted(job)) + // Disable checks for existing output directories in jobs launched by the streaming + // scheduler, since we may need to write output to an existing directory during checkpoint + // recovery; see SPARK-4835 for more details. + PairRDDFunctions.disableOutputSpecValidation.withValue(true) { +job.run() + } + _eventLoop = eventLoop + if (_eventLoop != null) { +_eventLoop.post(JobCompleted(job)) + } +} else { + // JobScheduler has been stopped. } -
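The fix is a classic pattern for `@volatile` fields that another thread may null out: read the field once into a local variable and use only the local. A minimal sketch of the pattern outside Spark (hypothetical class; a plain function stands in for Spark's `EventLoop`):

```
// Hypothetical holder; the point is the single read of the volatile field
// before each use, so a concurrent stop() cannot NPE us between check and call.
class EventLoopHolder {
  @volatile private var eventLoop: String => Unit = null

  def start(): Unit = { eventLoop = msg => println(s"posted: $msg") }
  def stop(): Unit = { eventLoop = null } // may run concurrently with runJob

  def runJob(body: => Unit): Unit = {
    var loop = eventLoop            // read the volatile field exactly once
    if (loop != null) {
      loop("JobStarted")
      body
      loop = eventLoop              // re-read: stop() may have run meanwhile
      if (loop != null) loop("JobCompleted")
    }
    // else: the scheduler was already stopped, so skip the notifications
  }
}
```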
spark git commit: [SPARK-9217] [STREAMING] Make the kinesis receiver reliable by recording sequence numbers
Repository: spark Updated Branches: refs/heads/branch-1.5 b6e8446a4 -> ea23e54ff [SPARK-9217] [STREAMING] Make the kinesis receiver reliable by recording sequence numbers This PR is the second one in the larger issue of making the Kinesis integration reliable and providing a WAL-free at-least-once guarantee. It is based on the design doc - https://docs.google.com/document/d/1k0dl270EnK7uExrsCE7jYw7PYx0YC935uBcxn3p0f58/edit In this PR, I have updated the Kinesis Receiver to do the following. - Control the block generation, by creating its own BlockGenerator with its own callback methods and using it to keep track of the ranges of sequence numbers that go into each block. - More specifically, as the KinesisRecordProcessor provides small batches of records, the records are atomically inserted into the block (that is, either the whole batch is in the block, or not). Accordingly, the sequence number range of the batch is recorded. Since there may be many batches added to a block, the receiver tracks all the ranges of sequence numbers that are added to a block. - When the block is ready to be pushed, the block is pushed and the ranges are reported as metadata of the block. In addition, the ranges are used to find out the latest sequence number for each shard that can be checkpointed through DynamoDB. - Periodically, each KinesisRecordProcessor checkpoints the latest successfully stored sequence number for its own shard. - The array of ranges in the block metadata is used to create KinesisBackedBlockRDDs. The ReceiverInputDStream has been slightly refactored to allow the creation of KinesisBackedBlockRDDs instead of the WALBackedBlockRDDs. Things to be done - [x] Add new test to verify that the sequence numbers are recovered. Author: Tathagata Das tathagata.das1...@gmail.com Closes #7825 from tdas/kinesis-receiver and squashes the following commits: 2159be9 [Tathagata Das] Fixed bug 569be83 [Tathagata Das] Fix scala style issue bf31e22 [Tathagata Das] Added more documentation to make the kinesis test endpoint more configurable 3ad8361 [Tathagata Das] Merge remote-tracking branch 'apache-github/master' into kinesis-receiver c693a63 [Tathagata Das] Removed unnecessary constructor params from KinesisTestUtils e1f1d0a [Tathagata Das] Addressed PR comments b9fa6bf [Tathagata Das] Fix serialization issues f8b7680 [Tathagata Das] Updated doc 33fe43a [Tathagata Das] Added more tests 7997138 [Tathagata Das] Fix style errors a806710 [Tathagata Das] Fixed unit test and use KinesisInputDStream 40a1709 [Tathagata Das] Fixed KinesisReceiverSuite tests 7e44df6 [Tathagata Das] Added documentation and fixed checkpointing 096383f [Tathagata Das] Added test, and addressed some of the comments.
84a7892 [Tathagata Das] fixed scala style issue e19e37d [Tathagata Das] Added license 1cd7b66 [Tathagata Das] Updated kinesis receiver (cherry picked from commit c2a71f0714b7a6ab30c1c4998f606f782428971c) Signed-off-by: Tathagata Das tathagata.das1...@gmail.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ea23e54f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ea23e54f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ea23e54f Branch: refs/heads/branch-1.5 Commit: ea23e54ff65fa871b4da61859254ba4980250752 Parents: b6e8446 Author: Tathagata Das tathagata.das1...@gmail.com Authored: Wed Aug 5 00:20:26 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Wed Aug 5 00:20:44 2015 -0700 -- .../kinesis/KinesisBackedBlockRDD.scala | 20 +- .../streaming/kinesis/KinesisInputDStream.scala | 71 ++ .../streaming/kinesis/KinesisReceiver.scala | 195 ++-- .../kinesis/KinesisRecordProcessor.scala| 76 +++ .../streaming/kinesis/KinesisTestUtils.scala| 63 -- .../spark/streaming/kinesis/KinesisUtils.scala | 21 +- .../kinesis/KinesisBackedBlockRDDSuite.scala| 18 +- .../streaming/kinesis/KinesisFunSuite.scala | 4 +- .../kinesis/KinesisReceiverSuite.scala | 41 ++-- .../streaming/kinesis/KinesisStreamSuite.scala | 222 +++ .../dstream/ReceiverInputDStream.scala | 71 +++--- 11 files changed, 605 insertions(+), 197 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ea23e54f/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDD.scala -- diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDD.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDD.scala index 8f144a4..a003ddf 100644 --- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis
spark git commit: [SPARK-9217] [STREAMING] Make the kinesis receiver reliable by recording sequence numbers
Repository: spark Updated Branches: refs/heads/master 781c8d71a -> c2a71f071 [SPARK-9217] [STREAMING] Make the kinesis receiver reliable by recording sequence numbers This PR is the second one in the larger issue of making the Kinesis integration reliable and providing a WAL-free at-least-once guarantee. It is based on the design doc - https://docs.google.com/document/d/1k0dl270EnK7uExrsCE7jYw7PYx0YC935uBcxn3p0f58/edit In this PR, I have updated the Kinesis Receiver to do the following. - Control the block generation, by creating its own BlockGenerator with its own callback methods and using it to keep track of the ranges of sequence numbers that go into each block. - More specifically, as the KinesisRecordProcessor provides small batches of records, the records are atomically inserted into the block (that is, either the whole batch is in the block, or not). Accordingly, the sequence number range of the batch is recorded. Since there may be many batches added to a block, the receiver tracks all the ranges of sequence numbers that are added to a block. - When the block is ready to be pushed, the block is pushed and the ranges are reported as metadata of the block. In addition, the ranges are used to find out the latest sequence number for each shard that can be checkpointed through DynamoDB. - Periodically, each KinesisRecordProcessor checkpoints the latest successfully stored sequence number for its own shard. - The array of ranges in the block metadata is used to create KinesisBackedBlockRDDs. The ReceiverInputDStream has been slightly refactored to allow the creation of KinesisBackedBlockRDDs instead of the WALBackedBlockRDDs. Things to be done - [x] Add new test to verify that the sequence numbers are recovered. Author: Tathagata Das tathagata.das1...@gmail.com Closes #7825 from tdas/kinesis-receiver and squashes the following commits: 2159be9 [Tathagata Das] Fixed bug 569be83 [Tathagata Das] Fix scala style issue bf31e22 [Tathagata Das] Added more documentation to make the kinesis test endpoint more configurable 3ad8361 [Tathagata Das] Merge remote-tracking branch 'apache-github/master' into kinesis-receiver c693a63 [Tathagata Das] Removed unnecessary constructor params from KinesisTestUtils e1f1d0a [Tathagata Das] Addressed PR comments b9fa6bf [Tathagata Das] Fix serialization issues f8b7680 [Tathagata Das] Updated doc 33fe43a [Tathagata Das] Added more tests 7997138 [Tathagata Das] Fix style errors a806710 [Tathagata Das] Fixed unit test and use KinesisInputDStream 40a1709 [Tathagata Das] Fixed KinesisReceiverSuite tests 7e44df6 [Tathagata Das] Added documentation and fixed checkpointing 096383f [Tathagata Das] Added test, and addressed some of the comments.
84a7892 [Tathagata Das] fixed scala style issue e19e37d [Tathagata Das] Added license 1cd7b66 [Tathagata Das] Updated kinesis receiver Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c2a71f07 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c2a71f07 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c2a71f07 Branch: refs/heads/master Commit: c2a71f0714b7a6ab30c1c4998f606f782428971c Parents: 781c8d7 Author: Tathagata Das tathagata.das1...@gmail.com Authored: Wed Aug 5 00:20:26 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Wed Aug 5 00:20:26 2015 -0700 -- .../kinesis/KinesisBackedBlockRDD.scala | 20 +- .../streaming/kinesis/KinesisInputDStream.scala | 71 ++ .../streaming/kinesis/KinesisReceiver.scala | 195 ++-- .../kinesis/KinesisRecordProcessor.scala| 76 +++ .../streaming/kinesis/KinesisTestUtils.scala| 63 -- .../spark/streaming/kinesis/KinesisUtils.scala | 21 +- .../kinesis/KinesisBackedBlockRDDSuite.scala| 18 +- .../streaming/kinesis/KinesisFunSuite.scala | 4 +- .../kinesis/KinesisReceiverSuite.scala | 41 ++-- .../streaming/kinesis/KinesisStreamSuite.scala | 222 +++ .../dstream/ReceiverInputDStream.scala | 71 +++--- 11 files changed, 605 insertions(+), 197 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c2a71f07/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDD.scala -- diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDD.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDD.scala index 8f144a4..a003ddf 100644 --- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDD.scala +++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDD.scala @@ -37,16 +37,18 @@ case
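On the consuming side the stream is still created the same way; the sequence-number recording happens inside the receiver. A sketch of creating the stream, assuming Spark 1.5's `KinesisUtils.createStream` signature, placeholder app/stream names and region, and credentials from the default AWS provider chain:

```
import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream
import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kinesis.KinesisUtils

object KinesisSketch {
  def main(args: Array[String]): Unit = {
    val ssc = new StreamingContext(
      new SparkConf().setMaster("local[4]").setAppName("kinesis-sketch"), Seconds(2))
    // The app name doubles as the DynamoDB table where the KCL checkpoints
    // the per-shard sequence numbers this commit starts recording.
    val stream = KinesisUtils.createStream(
      ssc, "my-app", "my-stream",
      "https://kinesis.us-east-1.amazonaws.com", "us-east-1",
      InitialPositionInStream.LATEST, Seconds(10),
      StorageLevel.MEMORY_AND_DISK_2)
    stream.map(bytes => new String(bytes, "UTF-8")).print()
    ssc.start()
    ssc.awaitTermination()
  }
}
```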
spark git commit: [SPARK-9601] [DOCS] Fix JavaPairDStream signature for stream-stream and windowed join in streaming guide doc
Repository: spark Updated Branches: refs/heads/master 6d8a6e416 -> 1bf608b5e [SPARK-9601] [DOCS] Fix JavaPairDStream signature for stream-stream and windowed join in streaming guide doc Author: Namit Katariya katariya.na...@gmail.com Closes #7935 from namitk/SPARK-9601 and squashes the following commits: 03b5784 [Namit Katariya] [SPARK-9601] Fix signature of JavaPairDStream for stream-stream and windowed join in streaming guide doc Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1bf608b5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1bf608b5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1bf608b5 Branch: refs/heads/master Commit: 1bf608b5ef1e6e2ae4325e13c2bd5e34db62450f Parents: 6d8a6e4 Author: Namit Katariya katariya.na...@gmail.com Authored: Wed Aug 5 01:07:33 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Wed Aug 5 01:07:33 2015 -0700 -- docs/streaming-programming-guide.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1bf608b5/docs/streaming-programming-guide.md -- diff --git a/docs/streaming-programming-guide.md b/docs/streaming-programming-guide.md index 4663b3f..dbfdb61 100644 --- a/docs/streaming-programming-guide.md +++ b/docs/streaming-programming-guide.md @@ -1141,7 +1141,7 @@ val joinedStream = stream1.join(stream2) {% highlight java %} JavaPairDStream<String, String> stream1 = ... JavaPairDStream<String, String> stream2 = ... -JavaPairDStream<String, String> joinedStream = stream1.join(stream2); +JavaPairDStream<String, Tuple2<String, String>> joinedStream = stream1.join(stream2); {% endhighlight %} </div> <div data-lang="python" markdown="1"> @@ -1166,7 +1166,7 @@ val joinedStream = windowedStream1.join(windowedStream2) {% highlight java %} JavaPairDStream<String, String> windowedStream1 = stream1.window(Durations.seconds(20)); JavaPairDStream<String, String> windowedStream2 = stream2.window(Durations.minutes(1)); -JavaPairDStream<String, String> joinedStream = windowedStream1.join(windowedStream2); +JavaPairDStream<String, Tuple2<String, String>> joinedStream = windowedStream1.join(windowedStream2); {% endhighlight %} </div> <div data-lang="python" markdown="1">
spark git commit: [SPARK-9601] [DOCS] Fix JavaPairDStream signature for stream-stream and windowed join in streaming guide doc
Repository: spark Updated Branches: refs/heads/branch-1.5 7fa419535 -> 6306019ff [SPARK-9601] [DOCS] Fix JavaPairDStream signature for stream-stream and windowed join in streaming guide doc Author: Namit Katariya katariya.na...@gmail.com Closes #7935 from namitk/SPARK-9601 and squashes the following commits: 03b5784 [Namit Katariya] [SPARK-9601] Fix signature of JavaPairDStream for stream-stream and windowed join in streaming guide doc (cherry picked from commit 1bf608b5ef1e6e2ae4325e13c2bd5e34db62450f) Signed-off-by: Tathagata Das tathagata.das1...@gmail.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6306019f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6306019f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6306019f Branch: refs/heads/branch-1.5 Commit: 6306019ff126ba064919e70d244fa43dd6349cdf Parents: 7fa4195 Author: Namit Katariya katariya.na...@gmail.com Authored: Wed Aug 5 01:07:33 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Wed Aug 5 01:07:51 2015 -0700 -- docs/streaming-programming-guide.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6306019f/docs/streaming-programming-guide.md -- diff --git a/docs/streaming-programming-guide.md b/docs/streaming-programming-guide.md index 4663b3f..dbfdb61 100644 --- a/docs/streaming-programming-guide.md +++ b/docs/streaming-programming-guide.md @@ -1141,7 +1141,7 @@ val joinedStream = stream1.join(stream2) {% highlight java %} JavaPairDStream<String, String> stream1 = ... JavaPairDStream<String, String> stream2 = ... -JavaPairDStream<String, String> joinedStream = stream1.join(stream2); +JavaPairDStream<String, Tuple2<String, String>> joinedStream = stream1.join(stream2); {% endhighlight %} </div> <div data-lang="python" markdown="1"> @@ -1166,7 +1166,7 @@ val joinedStream = windowedStream1.join(windowedStream2) {% highlight java %} JavaPairDStream<String, String> windowedStream1 = stream1.window(Durations.seconds(20)); JavaPairDStream<String, String> windowedStream2 = stream2.window(Durations.minutes(1)); -JavaPairDStream<String, String> joinedStream = windowedStream1.join(windowedStream2); +JavaPairDStream<String, Tuple2<String, String>> joinedStream = windowedStream1.join(windowedStream2); {% endhighlight %} </div> <div data-lang="python" markdown="1">
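The reason for the doc change is visible in the Scala types: joining two keyed streams pairs the values per key, so the Java result type needs the nested `Tuple2`. A sketch:

```
import org.apache.spark.streaming.dstream.DStream

object JoinTypesSketch {
  // Joining two (K, V) streams yields elements of type (K, (V1, V2)); in the
  // Java API that is JavaPairDStream<K, Tuple2<V1, V2>>, which is exactly the
  // signature the guide now shows.
  def joined(stream1: DStream[(String, String)],
             stream2: DStream[(String, String)]): DStream[(String, (String, String))] =
    stream1.join(stream2)
}
```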
spark git commit: [SPARK-9504] [STREAMING] [TESTS] Fix o.a.s.streaming.StreamingContextSuite.stop gracefully again
Repository: spark Updated Branches: refs/heads/branch-1.5 d196d3607 -> 6e72d24e2 [SPARK-9504] [STREAMING] [TESTS] Fix o.a.s.streaming.StreamingContextSuite.stop gracefully again The test failure is here: https://amplab.cs.berkeley.edu/jenkins/job/Spark-Master-SBT/3150/AMPLAB_JENKINS_BUILD_PROFILE=hadoop1.0,label=centos/testReport/junit/org.apache.spark.streaming/StreamingContextSuite/stop_gracefully/ There is a race condition in TestReceiver: it may add one more record and increase `TestReceiver.counter` after `BlockGenerator` has been stopped. This PR just adds `join` to wait for the pushing thread. Author: zsxwing zsxw...@gmail.com Closes #7934 from zsxwing/SPARK-9504-2 and squashes the following commits: cfd7973 [zsxwing] Wait for the thread to make sure we won't change TestReceiver.counter after stopping BlockGenerator (cherry picked from commit d34bac0e156432ca6a260db73dbe1318060e309c) Signed-off-by: Tathagata Das tathagata.das1...@gmail.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6e72d24e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6e72d24e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6e72d24e Branch: refs/heads/branch-1.5 Commit: 6e72d24e24d0e6d140e1b25597edae3f3054f98a Parents: d196d36 Author: zsxwing zsxw...@gmail.com Authored: Tue Aug 4 20:09:15 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Tue Aug 4 20:09:27 2015 -0700 -- .../scala/org/apache/spark/streaming/StreamingContextSuite.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6e72d24e/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala -- diff --git a/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala index b7db280..7423ef6 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala @@ -789,7 +789,8 @@ class TestReceiver extends Receiver[Int](StorageLevel.MEMORY_ONLY) with Logging } def onStop() { -// no cleanup to be done, the receiving thread should stop on its own +// no cleanup to be done, the receiving thread should stop on its own, so just wait for it. +receivingThreadOption.foreach(_.join()) } }
spark git commit: [SPARK-9504] [STREAMING] [TESTS] Fix o.a.s.streaming.StreamingContextSuite.stop gracefully again
Repository: spark Updated Branches: refs/heads/master 2b67fdb60 -> d34bac0e1 [SPARK-9504] [STREAMING] [TESTS] Fix o.a.s.streaming.StreamingContextSuite.stop gracefully again The test failure is here: https://amplab.cs.berkeley.edu/jenkins/job/Spark-Master-SBT/3150/AMPLAB_JENKINS_BUILD_PROFILE=hadoop1.0,label=centos/testReport/junit/org.apache.spark.streaming/StreamingContextSuite/stop_gracefully/ There is a race condition in TestReceiver: it may add one more record and increase `TestReceiver.counter` after `BlockGenerator` has been stopped. This PR just adds `join` to wait for the pushing thread. Author: zsxwing zsxw...@gmail.com Closes #7934 from zsxwing/SPARK-9504-2 and squashes the following commits: cfd7973 [zsxwing] Wait for the thread to make sure we won't change TestReceiver.counter after stopping BlockGenerator Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d34bac0e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d34bac0e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d34bac0e Branch: refs/heads/master Commit: d34bac0e156432ca6a260db73dbe1318060e309c Parents: 2b67fdb Author: zsxwing zsxw...@gmail.com Authored: Tue Aug 4 20:09:15 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Tue Aug 4 20:09:15 2015 -0700 -- .../scala/org/apache/spark/streaming/StreamingContextSuite.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d34bac0e/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala -- diff --git a/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala index b7db280..7423ef6 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala @@ -789,7 +789,8 @@ class TestReceiver extends Receiver[Int](StorageLevel.MEMORY_ONLY) with Logging } def onStop() { -// no cleanup to be done, the receiving thread should stop on its own +// no cleanup to be done, the receiving thread should stop on its own, so just wait for it. +receivingThreadOption.foreach(_.join()) } }
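The one-line fix encodes a general shutdown rule: a stop method should join its worker thread before returning, so no work can land after the component is considered stopped. A standalone sketch of the pattern (hypothetical class; the real `TestReceiver` stores records into Spark where the placeholder comment sits):

```
// Without the join() in stop(), the worker may still push one more record
// after stop() returns: exactly the race this commit closes.
class PollingWorker {
  @volatile private var stopped = false
  private var thread: Option[Thread] = None

  def start(): Unit = {
    val t = new Thread(new Runnable {
      def run(): Unit = while (!stopped) { /* receive and store one record */ }
    })
    t.start()
    thread = Some(t)
  }

  def stop(): Unit = {
    stopped = true
    thread.foreach(_.join()) // wait until the last iteration has finished
  }
}
```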
spark git commit: [SPARK-1855] Local checkpointing
Repository: spark Updated Branches: refs/heads/master 69f5a7c93 -> b41a32718 [SPARK-1855] Local checkpointing Certain use cases of Spark involve RDDs with long lineages that must be truncated periodically (e.g. GraphX). The existing way of doing it is through `rdd.checkpoint()`, which is expensive because it writes to HDFS. This patch provides an alternative that truncates lineages cheaply *without providing the same level of fault tolerance*. **Local checkpointing** writes checkpointed data to the local file system through the block manager. It is much faster than replicating to reliable storage and provides the same semantics as long as executors do not fail. It is accessible through a new operator `rdd.localCheckpoint()` and leaves the old one unchanged. Users may even decide to combine the two and call the reliable one less frequently. The bulk of this patch involves refactoring the checkpointing interface to accept custom implementations of checkpointing. [Design doc](https://issues.apache.org/jira/secure/attachment/12741708/SPARK-7292-design.pdf). Author: Andrew Or and...@databricks.com Closes #7279 from andrewor14/local-checkpoint and squashes the following commits: 729600f [Andrew Or] Oops, fix tests 34bc059 [Andrew Or] Avoid computing all partitions in local checkpoint e43bbb6 [Andrew Or] Merge branch 'master' of github.com:apache/spark into local-checkpoint 3be5aea [Andrew Or] Address comments bf846a6 [Andrew Or] Merge branch 'master' of github.com:apache/spark into local-checkpoint ab003a3 [Andrew Or] Fix compile c2e111b [Andrew Or] Address comments 33f167a [Andrew Or] Merge branch 'master' of github.com:apache/spark into local-checkpoint e908a42 [Andrew Or] Fix tests f5be0f3 [Andrew Or] Use MEMORY_AND_DISK as the default local checkpoint level a92657d [Andrew Or] Update a few comments e58e3e3 [Andrew Or] Merge branch 'master' of github.com:apache/spark into local-checkpoint 4eb6eb1 [Andrew Or] Merge branch 'master' of github.com:apache/spark into local-checkpoint 1bbe154 [Andrew Or] Simplify LocalCheckpointRDD 48a9996 [Andrew Or] Avoid traversing dependency tree + rewrite tests 62aba3f [Andrew Or] Merge branch 'master' of github.com:apache/spark into local-checkpoint db70dc2 [Andrew Or] Express local checkpointing through caching the original RDD 87d43c6 [Andrew Or] Merge branch 'master' of github.com:apache/spark into local-checkpoint c449b38 [Andrew Or] Fix style 4a182f3 [Andrew Or] Add fine-grained tests for local checkpointing 53b363b [Andrew Or] Rename a few more awkwardly named methods (minor) e4cf071 [Andrew Or] Simplify LocalCheckpointRDD + docs + clean ups 4880deb [Andrew Or] Fix style d096c67 [Andrew Or] Fix mima 172cb66 [Andrew Or] Fix mima?
e53d964 [Andrew Or] Fix style 56831c5 [Andrew Or] Add a few warnings and clear exception messages 2e59646 [Andrew Or] Add local checkpoint clean up tests 4dbbab1 [Andrew Or] Refactor CheckpointSuite to test local checkpointing 4514dc9 [Andrew Or] Clean local checkpoint files through RDD cleanups 0477eec [Andrew Or] Rename a few methods with awkward names (minor) 2e902e5 [Andrew Or] First implementation of local checkpointing 8447454 [Andrew Or] Fix tests 4ac1896 [Andrew Or] Refactor checkpoint interface for modularity Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b41a3271 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b41a3271 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b41a3271 Branch: refs/heads/master Commit: b41a32718d615b304efba146bf97be0229779b01 Parents: 69f5a7c Author: Andrew Or and...@databricks.com Authored: Mon Aug 3 10:58:37 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Mon Aug 3 10:58:37 2015 -0700 -- .../scala/org/apache/spark/ContextCleaner.scala | 9 +- .../scala/org/apache/spark/SparkContext.scala | 2 +- .../scala/org/apache/spark/TaskContext.scala| 8 + .../org/apache/spark/rdd/CheckpointRDD.scala| 153 + .../apache/spark/rdd/LocalCheckpointRDD.scala | 67 .../spark/rdd/LocalRDDCheckpointData.scala | 83 + .../main/scala/org/apache/spark/rdd/RDD.scala | 128 +-- .../apache/spark/rdd/RDDCheckpointData.scala| 106 ++ .../spark/rdd/ReliableCheckpointRDD.scala | 172 ++ .../spark/rdd/ReliableRDDCheckpointData.scala | 108 ++ .../org/apache/spark/CheckpointSuite.scala | 164 + .../org/apache/spark/ContextCleanerSuite.scala | 61 +++- .../apache/spark/rdd/LocalCheckpointSuite.scala | 330 +++ project/MimaExcludes.scala | 9 +- 14 files changed, 1085 insertions(+), 315 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b41a3271/core/src/main/scala/org/apache/spark/ContextCleaner.scala
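Typical usage is sketched below for an iterative job that would otherwise accumulate an unbounded lineage; unlike `checkpoint()`, no checkpoint directory is needed because the data stays in the block manager:

```
import org.apache.spark.{SparkConf, SparkContext}

object LocalCheckpointSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setMaster("local[2]").setAppName("local-ckpt"))
    var rdd = sc.parallelize(1 to 1000, 4)
    // Iterative workload: each pass adds a stage to the lineage.
    for (_ <- 1 to 50) {
      rdd = rdd.map(_ + 1)
    }
    // Truncate the lineage via the block manager: fast, but lost if an
    // executor dies, unlike a reliable checkpoint() to HDFS.
    rdd.localCheckpoint()
    println(rdd.count()) // materializes the RDD and the local checkpoint
    sc.stop()
  }
}
```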
spark git commit: [SPARK-9056] [STREAMING] Rename configuration `spark.streaming.minRememberDuration` to `spark.streaming.fileStream.minRememberDuration`
Repository: spark Updated Branches: refs/heads/master 3c0d2e552 -> 060c79aab [SPARK-9056] [STREAMING] Rename configuration `spark.streaming.minRememberDuration` to `spark.streaming.fileStream.minRememberDuration` Rename configuration `spark.streaming.minRememberDuration` to `spark.streaming.fileStream.minRememberDuration` Author: Sameer Abhyankar sabhyankar@sabhyankar-MBP.local Author: Sameer Abhyankar sabhyankar@sabhyankar-MBP.Samavihome Closes #7740 from sabhyankar/spark_branch_9056 and squashes the following commits: d5b2f1f [Sameer Abhyankar] Correct deprecated version to 1.5 1268133 [Sameer Abhyankar] Add {} and indentation ddf9844 [Sameer Abhyankar] Change 4 space indentation to 2 space indentation 1819b5f [Sameer Abhyankar] Use spark.streaming.fileStream.minRememberDuration property in lieu of spark.streaming.minRememberDuration Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/060c79aa Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/060c79aa Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/060c79aa Branch: refs/heads/master Commit: 060c79aab58efd4ce7353a1b00534de0d9e1de0b Parents: 3c0d2e5 Author: Sameer Abhyankar sabhyankar@sabhyankar-MBP.local Authored: Fri Jul 31 13:08:55 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Fri Jul 31 13:08:55 2015 -0700 -- core/src/main/scala/org/apache/spark/SparkConf.scala | 4 +++- .../org/apache/spark/streaming/dstream/FileInputDStream.scala | 6 +++--- 2 files changed, 7 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/060c79aa/core/src/main/scala/org/apache/spark/SparkConf.scala -- diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala index 4161792..08bab4b 100644 --- a/core/src/main/scala/org/apache/spark/SparkConf.scala +++ b/core/src/main/scala/org/apache/spark/SparkConf.scala @@ -548,7 +548,9 @@ private[spark] object SparkConf extends Logging { "spark.rpc.askTimeout" -> Seq( AlternateConfig("spark.akka.askTimeout", "1.4")), "spark.rpc.lookupTimeout" -> Seq( - AlternateConfig("spark.akka.lookupTimeout", "1.4")) + AlternateConfig("spark.akka.lookupTimeout", "1.4")), +"spark.streaming.fileStream.minRememberDuration" -> Seq( + AlternateConfig("spark.streaming.minRememberDuration", "1.5")) ) /** http://git-wip-us.apache.org/repos/asf/spark/blob/060c79aa/streaming/src/main/scala/org/apache/spark/streaming/dstream/FileInputDStream.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/FileInputDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/FileInputDStream.scala index dd4da9d..c358f5b 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/FileInputDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/FileInputDStream.scala @@ -86,8 +86,10 @@ class FileInputDStream[K, V, F : NewInputFormat[K, V]]( * Files with mod times older than this window of remembering will be ignored. So if new * files are visible within this window, then the file will get selected in the next batch.
*/ - private val minRememberDurationS = -Seconds(ssc.conf.getTimeAsSeconds("spark.streaming.minRememberDuration", "60s")) + private val minRememberDurationS = { + Seconds(ssc.conf.getTimeAsSeconds("spark.streaming.fileStream.minRememberDuration", + ssc.conf.get("spark.streaming.minRememberDuration", "60s"))) + } // This is a def so that it works during checkpoint recovery: private def clock = ssc.scheduler.clock
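Usage after the rename, sketched with the new key; the old key keeps working through the alternate-config table added above, just with a deprecation warning:

```
import org.apache.spark.SparkConf

object MinRememberDurationSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    // New name: fileStream ignores files with mod times older than this window.
    conf.set("spark.streaming.fileStream.minRememberDuration", "120s")
    // The deprecated spark.streaming.minRememberDuration would still be
    // translated for you, but logs a warning as of 1.5.
    println(conf.get("spark.streaming.fileStream.minRememberDuration"))
  }
}
```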
spark git commit: [STREAMING] [TEST] [HOTFIX] Fixed Kinesis test to not throw weird errors when Kinesis tests are enabled without AWS keys
Repository: spark Updated Branches: refs/heads/master 04c840910 -> 1afdeb7b4 [STREAMING] [TEST] [HOTFIX] Fixed Kinesis test to not throw weird errors when Kinesis tests are enabled without AWS keys If Kinesis tests are enabled by env ENABLE_KINESIS_TESTS = 1 but no AWS credentials are found, the desired behavior is to fail the test with ``` Exception encountered when attempting to run a suite with class name: org.apache.spark.streaming.kinesis.KinesisBackedBlockRDDSuite *** ABORTED *** (3 seconds, 5 milliseconds) [info] java.lang.Exception: Kinesis tests enabled, but could get not AWS credentials ``` Instead, KinesisStreamSuite fails with ``` [info] - basic operation *** FAILED *** (3 seconds, 35 milliseconds) [info] java.lang.IllegalArgumentException: requirement failed: Stream not yet created, call createStream() to create one [info] at scala.Predef$.require(Predef.scala:233) [info] at org.apache.spark.streaming.kinesis.KinesisTestUtils.streamName(KinesisTestUtils.scala:77) [info] at org.apache.spark.streaming.kinesis.KinesisTestUtils$$anonfun$deleteStream$1.apply(KinesisTestUtils.scala:150) [info] at org.apache.spark.streaming.kinesis.KinesisTestUtils$$anonfun$deleteStream$1.apply(KinesisTestUtils.scala:150) [info] at org.apache.spark.Logging$class.logWarning(Logging.scala:71) [info] at org.apache.spark.streaming.kinesis.KinesisTestUtils.logWarning(KinesisTestUtils.scala:39) [info] at org.apache.spark.streaming.kinesis.KinesisTestUtils.deleteStream(KinesisTestUtils.scala:150) [info] at org.apache.spark.streaming.kinesis.KinesisStreamSuite$$anonfun$3.apply$mcV$sp(KinesisStreamSuite.scala:111) [info] at org.apache.spark.streaming.kinesis.KinesisStreamSuite$$anonfun$3.apply(KinesisStreamSuite.scala:86) [info] at org.apache.spark.streaming.kinesis.KinesisStreamSuite$$anonfun$3.apply(KinesisStreamSuite.scala:86) ``` This is because attempting to delete a non-existent Kinesis stream throws an uncaught exception. This PR fixes it.
Author: Tathagata Das tathagata.das1...@gmail.com Closes #7809 from tdas/kinesis-test-hotfix and squashes the following commits: 7c372e6 [Tathagata Das] Fixed test Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1afdeb7b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1afdeb7b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1afdeb7b Branch: refs/heads/master Commit: 1afdeb7b458f86e2641f062fb9ddc00e9c5c7531 Parents: 04c8409 Author: Tathagata Das tathagata.das1...@gmail.com Authored: Thu Jul 30 16:44:02 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Thu Jul 30 16:44:02 2015 -0700 -- .../streaming/kinesis/KinesisTestUtils.scala| 27 ++-- .../streaming/kinesis/KinesisStreamSuite.scala | 4 +-- 2 files changed, 16 insertions(+), 15 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1afdeb7b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisTestUtils.scala -- diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisTestUtils.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisTestUtils.scala index 0ff1b7e..ca39358 100644 --- a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisTestUtils.scala +++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisTestUtils.scala @@ -53,6 +53,8 @@ private class KinesisTestUtils( @volatile private var streamCreated = false + + @volatile private var _streamName: String = _ private lazy val kinesisClient = { @@ -115,21 +117,9 @@ private class KinesisTestUtils( shardIdToSeqNumbers.toMap } - def describeStream(streamNameToDescribe: String = streamName): Option[StreamDescription] = { -try { - val describeStreamRequest = new DescribeStreamRequest().withStreamName(streamNameToDescribe) - val desc = kinesisClient.describeStream(describeStreamRequest).getStreamDescription() - Some(desc) -} catch { - case rnfe: ResourceNotFoundException => -None -} - } - def deleteStream(): Unit = { try { - if (describeStream().nonEmpty) { -val deleteStreamRequest = new DeleteStreamRequest() + if (streamCreated) { kinesisClient.deleteStream(streamName) } } catch { @@ -149,6 +139,17 @@ private class KinesisTestUtils( } } + private def describeStream(streamNameToDescribe: String): Option[StreamDescription] = { +try { + val describeStreamRequest = new DescribeStreamRequest().withStreamName(streamNameToDescribe
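The shape of the fix is a created-by-us flag consulted at teardown, instead of probing the service for a stream that may never have existed. A standalone sketch of that guard (hypothetical fixture; a plain function stands in for the Kinesis client):

```
// Only delete what this run actually created, and never let a failed delete
// abort teardown: log a warning instead, mirroring the fixed KinesisTestUtils.
class StreamFixture(deleteStream: String => Unit) {
  @volatile private var streamCreated = false
  @volatile private var streamName: String = _

  def create(name: String): Unit = {
    streamName = name
    streamCreated = true // set only after a successful create
  }

  def cleanUp(): Unit = {
    try {
      if (streamCreated) deleteStream(streamName)
    } catch {
      case e: Exception =>
        Console.err.println(s"Could not delete stream $streamName: ${e.getMessage}")
    }
  }
}
```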
spark git commit: [SPARK-9479] [STREAMING] [TESTS] Fix ReceiverTrackerSuite failure for maven build and other potential test failures in Streaming
Repository: spark Updated Branches: refs/heads/master 89cda69ec - > 0dbd6963d [SPARK-9479] [STREAMING] [TESTS] Fix ReceiverTrackerSuite failure for maven build and other potential test failures in Streaming See https://issues.apache.org/jira/browse/SPARK-9479 for the failure cause. The PR includes the following changes: 1. Make ReceiverTrackerSuite create StreamingContext in the test body. 2. Fix places that don't stop StreamingContext. I verified locally that no SparkContext was stopped in the shutdown hook after this fix. 3. Fix an issue that `ReceiverTracker.endpoint` may be null. 4. Make sure stopping SparkContext in non-main thread won't fail other tests. Author: zsxwing zsxw...@gmail.com Closes #7797 from zsxwing/fix-ReceiverTrackerSuite and squashes the following commits: 3a4bb98 [zsxwing] Fix another potential NPE d7497df [zsxwing] Fix ReceiverTrackerSuite; make sure StreamingContext in tests is closed Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0dbd6963 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0dbd6963 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0dbd6963 Branch: refs/heads/master Commit: 0dbd6963d589a8f6ad344273f3da7df680ada515 Parents: 89cda69 Author: zsxwing zsxw...@gmail.com Authored: Thu Jul 30 15:39:46 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Thu Jul 30 15:39:46 2015 -0700 -- .../StreamingLogisticRegressionSuite.scala | 21 +-- .../mllib/clustering/StreamingKMeansSuite.scala | 17 -- .../StreamingLinearRegressionSuite.scala| 21 +-- .../streaming/scheduler/ReceiverTracker.scala | 12 +++- .../apache/spark/streaming/JavaAPISuite.java| 1 + .../spark/streaming/BasicOperationsSuite.scala | 58 ++-- .../spark/streaming/InputStreamsSuite.scala | 38 +++-- .../spark/streaming/MasterFailureTest.scala | 8 ++- .../spark/streaming/StreamingContextSuite.scala | 22 ++-- .../streaming/StreamingListenerSuite.scala | 13 - .../scheduler/ReceiverTrackerSuite.scala| 56 ++- .../ui/StreamingJobProgressListenerSuite.scala | 19 +-- 12 files changed, 183 insertions(+), 103 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0dbd6963/mllib/src/test/scala/org/apache/spark/mllib/classification/StreamingLogisticRegressionSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/mllib/classification/StreamingLogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/classification/StreamingLogisticRegressionSuite.scala index fd65329..d7b291d 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/classification/StreamingLogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/classification/StreamingLogisticRegressionSuite.scala @@ -24,13 +24,22 @@ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.streaming.dstream.DStream -import org.apache.spark.streaming.TestSuiteBase +import org.apache.spark.streaming.{StreamingContext, TestSuiteBase} class StreamingLogisticRegressionSuite extends SparkFunSuite with TestSuiteBase { // use longer wait time to ensure job completion override def maxWaitTimeMillis: Int = 3 + var ssc: StreamingContext = _ + + override def afterFunction() { +super.afterFunction() +if (ssc != null) { + ssc.stop() +} + } + // Test if we can accurately learn B for Y = logistic(BX) on streaming data test("parameter accuracy") { @@ -50,7 +59,7 @@ class StreamingLogisticRegressionSuite extends
SparkFunSuite with TestSuiteBase } // apply model training to input stream -val ssc = setupStreams(input, (inputDStream: DStream[LabeledPoint]) => { +ssc = setupStreams(input, (inputDStream: DStream[LabeledPoint]) => { model.trainOn(inputDStream) inputDStream.count() }) @@ -84,7 +93,7 @@ class StreamingLogisticRegressionSuite extends SparkFunSuite with TestSuiteBase // apply model training to input stream, storing the intermediate results // (we add a count to ensure the result is a DStream) -val ssc = setupStreams(input, (inputDStream: DStream[LabeledPoint]) => { +ssc = setupStreams(input, (inputDStream: DStream[LabeledPoint]) => { model.trainOn(inputDStream) inputDStream.foreachRDD(x => history.append(math.abs(model.latestModel().weights(0) - B))) inputDStream.count() }) @@ -118,7 +127,7 @@ class StreamingLogisticRegressionSuite extends SparkFunSuite with TestSuiteBase } // apply model predictions to test stream -val ssc =
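The pattern these suites are converging on — hold the StreamingContext in a var, create it inside the test body, and stop it in the teardown hook — can be sketched on its own (a minimal sketch; the suite name is hypothetical, and `afterFunction` is the teardown hook `TestSuiteBase` already exposes):

```scala
import org.apache.spark.SparkFunSuite
import org.apache.spark.streaming.{StreamingContext, TestSuiteBase}

class SomeStreamingSuite extends SparkFunSuite with TestSuiteBase {
  // Created inside each test body, not as a suite-level val.
  var ssc: StreamingContext = _

  override def afterFunction() {
    super.afterFunction()
    // Stop the context even if the test body failed, so no stray
    // SparkContext is left for the shutdown hook to clean up.
    if (ssc != null) {
      ssc.stop()
    }
  }
}
```

Creating the context inside the test body matters because a suite-level val would instantiate (and potentially leak) a SparkContext even when the test never runs.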
spark git commit: [SPARK-9472] [STREAMING] consistent hadoop configuration, streaming only
Repository: spark Updated Branches: refs/heads/master 3c66ff727 -> 9307f5653 [SPARK-9472] [STREAMING] consistent hadoop configuration, streaming only Author: cody koeninger c...@koeninger.org Closes #7772 from koeninger/streaming-hadoop-config and squashes the following commits: 5267284 [cody koeninger] [SPARK-4229][Streaming] consistent hadoop configuration, streaming only Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9307f565 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9307f565 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9307f565 Branch: refs/heads/master Commit: 9307f5653d19a6a2fda355a675ca9ea97e35611b Parents: 3c66ff7 Author: cody koeninger c...@koeninger.org Authored: Thu Jul 30 17:44:20 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Thu Jul 30 17:44:20 2015 -0700 -- .../main/scala/org/apache/spark/streaming/Checkpoint.scala| 3 ++- .../scala/org/apache/spark/streaming/StreamingContext.scala | 7 --- .../org/apache/spark/streaming/api/java/JavaPairDStream.scala | 2 +- .../spark/streaming/api/java/JavaStreamingContext.scala | 3 ++- 4 files changed, 9 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9307f565/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala b/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala index 65d4e93..2780d5b 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala @@ -25,6 +25,7 @@ import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.conf.Configuration import org.apache.spark.{SparkException, SparkConf, Logging} +import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.io.CompressionCodec import org.apache.spark.util.{MetadataCleaner, Utils} import org.apache.spark.streaming.scheduler.JobGenerator @@ -100,7 +101,7 @@ object Checkpoint extends Logging { } val path = new Path(checkpointDir) -val fs = fsOption.getOrElse(path.getFileSystem(new Configuration())) +val fs = fsOption.getOrElse(path.getFileSystem(SparkHadoopUtil.get.conf)) if (fs.exists(path)) { val statuses = fs.listStatus(path) if (statuses != null) { http://git-wip-us.apache.org/repos/asf/spark/blob/9307f565/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala index 92438f1..177e710 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala @@ -34,6 +34,7 @@ import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat} import org.apache.spark._ import org.apache.spark.annotation.{DeveloperApi, Experimental} +import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.input.FixedLengthBinaryInputFormat import org.apache.spark.rdd.{RDD, RDDOperationScope} import org.apache.spark.serializer.SerializationDebugger @@ -110,7 +111,7 @@ class StreamingContext private[streaming] ( * Recreate a StreamingContext from a checkpoint file.
* @param path Path to the directory that was specified as the checkpoint directory */ - def this(path: String) = this(path, new Configuration) + def this(path: String) = this(path, SparkHadoopUtil.get.conf) /** * Recreate a StreamingContext from a checkpoint file using an existing SparkContext. @@ -803,7 +804,7 @@ object StreamingContext extends Logging { def getActiveOrCreate( checkpointPath: String, creatingFunc: () => StreamingContext, - hadoopConf: Configuration = new Configuration(), + hadoopConf: Configuration = SparkHadoopUtil.get.conf, createOnError: Boolean = false ): StreamingContext = { ACTIVATION_LOCK.synchronized { @@ -828,7 +829,7 @@ object StreamingContext extends Logging { def getOrCreate( checkpointPath: String, creatingFunc: () => StreamingContext, - hadoopConf: Configuration = new Configuration(), + hadoopConf: Configuration = SparkHadoopUtil.get.conf, createOnError: Boolean = false ): StreamingContext = { val checkpointOption = CheckpointReader.read( http://git-wip-us.apache.org/repos/asf/spark/blob/9307f565/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaPairDStream.scala
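The practical difference is that `SparkHadoopUtil.get.conf` is derived from the Spark configuration, so `spark.hadoop.*` settings reach the Hadoop `Configuration`, whereas a bare `new Configuration()` ignores them. A hedged illustration (the S3A key is just an example property, not something this commit touches):

```scala
import org.apache.hadoop.conf.Configuration
import org.apache.spark.deploy.SparkHadoopUtil

// Suppose the application was launched with a (hypothetical) setting:
//   spark-submit --conf spark.hadoop.fs.s3a.access.key=AKIA...
// A bare `new Configuration()` knows nothing about that setting, but
// SparkHadoopUtil.get.conf folds spark.hadoop.* entries from the Spark
// configuration into the Hadoop Configuration it returns:
val hadoopConf: Configuration = SparkHadoopUtil.get.conf
// hadoopConf.get("fs.s3a.access.key") now reflects the submitted value,
// so checkpoint reads and writes honor the user's Hadoop settings.
```

This is why the defaults in `getOrCreate`/`getActiveOrCreate` above switch from `new Configuration()` to the shared, Spark-aware configuration.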
spark git commit: [SPARK-9335] [TESTS] Enable Kinesis tests only when files in extras/kinesis-asl are changed
Repository: spark Updated Branches: refs/heads/master 1221849f9 -> 76f2e393a [SPARK-9335] [TESTS] Enable Kinesis tests only when files in extras/kinesis-asl are changed Author: zsxwing zsxw...@gmail.com Closes #7711 from zsxwing/SPARK-9335-test and squashes the following commits: c13ec2f [zsxwing] environs -> environ 69c2865 [zsxwing] Merge remote-tracking branch 'origin/master' into SPARK-9335-test ef84a08 [zsxwing] Revert "Modify the Kinesis project to trigger ENABLE_KINESIS_TESTS" f691028 [zsxwing] Modify the Kinesis project to trigger ENABLE_KINESIS_TESTS 7618205 [zsxwing] Enable Kinesis tests only when files in extras/kinesis-asl are changed Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/76f2e393 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/76f2e393 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/76f2e393 Branch: refs/heads/master Commit: 76f2e393a5fad0db8b56c4b8dad5ef686bf140a4 Parents: 1221849 Author: zsxwing zsxw...@gmail.com Authored: Thu Jul 30 00:46:36 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Thu Jul 30 00:46:36 2015 -0700 -- dev/run-tests.py| 16 dev/sparktestsupport/modules.py | 14 -- 2 files changed, 28 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/76f2e393/dev/run-tests.py -- diff --git a/dev/run-tests.py b/dev/run-tests.py index 1f0d218..29420da 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -85,6 +85,13 @@ def identify_changed_files_from_git_commits(patch_sha, target_branch=None, targe return [f for f in raw_output.split('\n') if f] +def setup_test_environ(environ): +print("[info] Setup the following environment variables for tests: ") +for (k, v) in environ.items(): +print("%s=%s" % (k, v)) +os.environ[k] = v + + def determine_modules_to_test(changed_modules): """ Given a set of modules that have changed, compute the transitive closure of those modules' @@ -455,6 +462,15 @@ def main(): print("[info] Found the following changed modules:", ", ".join(x.name for x in changed_modules)) +# setup environment variables +# note - the 'root' module doesn't collect environment variables for all modules. Because the +# environment variables should not be set if a module is not changed, even if running the 'root' +# module. So here we should use changed_modules rather than test_modules. +test_environ = {} +for m in changed_modules: +test_environ.update(m.environ) +setup_test_environ(test_environ) + test_modules = determine_modules_to_test(changed_modules) # license checks http://git-wip-us.apache.org/repos/asf/spark/blob/76f2e393/dev/sparktestsupport/modules.py -- diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index 3073d48..030d982 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -29,7 +29,7 @@ class Module(object): changed. -def __init__(self, name, dependencies, source_file_regexes, build_profile_flags=(), +def __init__(self, name, dependencies, source_file_regexes, build_profile_flags=(), environ={}, sbt_test_goals=(), python_test_goals=(), blacklisted_python_implementations=(), should_run_r_tests=False): @@ -43,6 +43,8 @@ class Module(object): filename strings. :param build_profile_flags: A set of profile flags that should be passed to Maven or SBT in order to build and test this module (e.g. '-PprofileName'). +:param environ: A dict of environment variables that should be set when files in this +module are changed.
:param sbt_test_goals: A set of SBT test goals for testing this module. :param python_test_goals: A set of Python test goals for testing this module. :param blacklisted_python_implementations: A set of Python implementations that are not @@ -55,6 +57,7 @@ class Module(object): self.source_file_prefixes = source_file_regexes self.sbt_test_goals = sbt_test_goals self.build_profile_flags = build_profile_flags +self.environ = environ self.python_test_goals = python_test_goals self.blacklisted_python_implementations = blacklisted_python_implementations self.should_run_r_tests = should_run_r_tests @@ -126,15 +129,22 @@ streaming = Module( ) +# Don't set the dependencies because changes in other modules should not trigger Kinesis tests. +# Kinesis tests depends on external Amazon kinesis service. We should
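The diff is truncated just as the Kinesis module is being declared. A plausible sketch of what a module declaration using the new `environ` parameter looks like (illustrative only — the module name, regexes, and goals are assumptions based on the comment above, not the exact source):

```python
# Sketch: a module whose tests are gated behind an environment variable that
# is set only when this module's own files change.
kinesis_asl = Module(
    name="kinesis-asl",
    dependencies=[],  # deliberately empty: changes elsewhere must not trigger these tests
    source_file_regexes=["extras/kinesis-asl/"],
    build_profile_flags=["-Pkinesis-asl"],
    environ={"ENABLE_KINESIS_TESTS": "1"},
    sbt_test_goals=["kinesis-asl/test"],
)
```

With this in place, `main()` collects `environ` from the changed modules only, so ENABLE_KINESIS_TESTS is exported exactly when a Kinesis file was touched.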
spark git commit: [SPARK-8977] [STREAMING] Defines the RateEstimator interface, and implements the RateController
Repository: spark Updated Branches: refs/heads/master 069a4c414 -> 819be46e5 [SPARK-8977] [STREAMING] Defines the RateEstimator interface, and implements the RateController Based on #7471. - [x] add a test that exercises the publish path from driver to receiver - [ ] remove Serializable from `RateController` and `RateEstimator` Author: Iulian Dragos jagua...@gmail.com Author: François Garillot franc...@garillot.net Closes #7600 from dragos/topic/streaming-bp/rate-controller and squashes the following commits: f168c94 [Iulian Dragos] Latest review round. 5125e60 [Iulian Dragos] Fix style. a2eb3b9 [Iulian Dragos] Merge remote-tracking branch 'upstream/master' into topic/streaming-bp/rate-controller 475e346 [Iulian Dragos] Latest round of reviews. e9fb45e [Iulian Dragos] - Add a test for checkpointing - fixed serialization for RateController.executionContext 715437a [Iulian Dragos] Review comments and added a `reset` call in ReceiverTrackerTest. e57c66b [Iulian Dragos] Added a couple of tests for the full scenario from driver to receivers, with several rate updates. b425d32 [Iulian Dragos] Removed DeveloperAPI, removed rateEstimator field, removed Noop rate estimator, changed logic for initialising rate estimator. 238cfc6 [Iulian Dragos] Merge remote-tracking branch 'upstream/master' into topic/streaming-bp/rate-controller 34a389d [Iulian Dragos] Various style changes and a first test for the rate controller. d32ca36 [François Garillot] [SPARK-8977][Streaming] Defines the RateEstimator interface, and implements the ReceiverRateController 8941cf9 [Iulian Dragos] Renames and other nitpicks. 162d9e5 [Iulian Dragos] Use Reflection for accessing truly private `executor` method and use the listener bus to know when receivers have registered (`onStart` is called before receivers have registered, leading to flaky behavior). 210f495 [Iulian Dragos] Revert "Added a few tests that measure the receiver's rate." 0c51959 [Iulian Dragos] Added a few tests that measure the receiver's rate. 261a051 [Iulian Dragos] - removed field to hold the current rate limit in rate limiter - made rate limit a Long and default to Long.MaxValue (consequence of the above) - removed custom `waitUntil` and replaced it by `eventually` cd1397d [Iulian Dragos] Add a test for the propagation of a new rate limit from driver to receivers. 6369b30 [Iulian Dragos] Merge pull request #15 from huitseeker/SPARK-8975 d15de42 [François Garillot] [SPARK-8975][Streaming] Adds Ratelimiter unit tests w.r.t.
spark.streaming.receiver.maxRate 4721c7d [François Garillot] [SPARK-8975][Streaming] Add a mechanism to send a new rate from the driver to the block generator Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/819be46e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/819be46e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/819be46e Branch: refs/heads/master Commit: 819be46e5a73f2d19230354ebba30c58538590f5 Parents: 069a4c4 Author: Iulian Dragos jagua...@gmail.com Authored: Wed Jul 29 13:47:37 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Wed Jul 29 13:47:37 2015 -0700 -- .../spark/streaming/dstream/InputDStream.scala | 7 +- .../dstream/ReceiverInputDStream.scala | 26 - .../streaming/scheduler/JobScheduler.scala | 6 ++ .../streaming/scheduler/RateController.scala| 90 .../scheduler/rate/RateEstimator.scala | 59 +++ .../spark/streaming/CheckpointSuite.scala | 28 + .../scheduler/RateControllerSuite.scala | 103 +++ .../ReceiverSchedulingPolicySuite.scala | 10 +- .../scheduler/ReceiverTrackerSuite.scala| 41 ++-- 9 files changed, 355 insertions(+), 15 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/819be46e/streaming/src/main/scala/org/apache/spark/streaming/dstream/InputDStream.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/InputDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/InputDStream.scala index d58c99a..a6c4cd2 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/InputDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/InputDStream.scala @@ -21,7 +21,9 @@ import scala.reflect.ClassTag import org.apache.spark.SparkContext import org.apache.spark.rdd.RDDOperationScope -import org.apache.spark.streaming.{Time, Duration, StreamingContext} +import org.apache.spark.streaming.{Duration, StreamingContext, Time} +import org.apache.spark.streaming.scheduler.RateController +import org.apache.spark.streaming.scheduler.rate.RateEstimator import
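The bodies of the two new files (`RateEstimator.scala`, `RateController.scala`) do not appear in this excerpt. As a rough sketch of the contract described above — an estimator is essentially a pure function from batch-completion statistics to a proposed ingestion rate, which the RateController then publishes to receivers (an approximation, not the exact Spark source):

```scala
// Rough sketch of the RateEstimator contract: given statistics from a
// completed batch, optionally propose a new rate in records per second.
private[streaming] trait RateEstimator extends Serializable {
  def compute(
      time: Long,             // end-of-batch timestamp (ms)
      elements: Long,         // number of records processed in the batch
      processingDelay: Long,  // how long the batch took to process (ms)
      schedulingDelay: Long   // how long the batch waited before starting (ms)
    ): Option[Double]
}
```

Keeping the estimator side-effect-free is what lets the controller own all the plumbing (listener callbacks, async publication to receivers) while estimation strategies stay independently testable.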
spark git commit: [STREAMING] [HOTFIX] Ignore ReceiverTrackerSuite flaky test
Repository: spark Updated Branches: refs/heads/master 59b92add7 -> c5ed36953 [STREAMING] [HOTFIX] Ignore ReceiverTrackerSuite flaky test Author: Tathagata Das tathagata.das1...@gmail.com Closes #7738 from tdas/ReceiverTrackerSuite-hotfix and squashes the following commits: 00f0ee1 [Tathagata Das] ignore flaky test Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c5ed3695 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c5ed3695 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c5ed3695 Branch: refs/heads/master Commit: c5ed36953f840018f603dfde94fcb4651e5246ac Parents: 59b92ad Author: Tathagata Das tathagata.das1...@gmail.com Authored: Tue Jul 28 16:41:56 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Tue Jul 28 16:41:56 2015 -0700 -- .../apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala| 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c5ed3695/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala -- diff --git a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala index e2159bd..b039233 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala @@ -31,7 +31,7 @@ class ReceiverTrackerSuite extends TestSuiteBase { val sparkConf = new SparkConf().setMaster("local[8]").setAppName("test") val ssc = new StreamingContext(sparkConf, Milliseconds(100)) - test("Receiver tracker - propagates rate limit") { + ignore("Receiver tracker - propagates rate limit") { object ReceiverStartedWaiter extends StreamingListener { @volatile var started = false
spark git commit: [SPARK-9335] [STREAMING] [TESTS] Make sure the test stream is deleted in KinesisBackedBlockRDDSuite
Repository: spark Updated Branches: refs/heads/master 9c5612f4e -> d93ab93d6 [SPARK-9335] [STREAMING] [TESTS] Make sure the test stream is deleted in KinesisBackedBlockRDDSuite KinesisBackedBlockRDDSuite should make sure to delete the stream. Author: zsxwing zsxw...@gmail.com Closes #7663 from zsxwing/fix-SPARK-9335 and squashes the following commits: f0e9154 [zsxwing] Revert "[HOTFIX] - Disable Kinesis tests due to rate limits" 71a4552 [zsxwing] Make sure the test stream is deleted Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d93ab93d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d93ab93d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d93ab93d Branch: refs/heads/master Commit: d93ab93d673c5007a1edb90a424b451c91c8a285 Parents: 9c5612f Author: zsxwing zsxw...@gmail.com Authored: Mon Jul 27 23:34:29 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Mon Jul 27 23:34:29 2015 -0700 -- .../spark/streaming/kinesis/KinesisBackedBlockRDDSuite.scala | 7 +-- .../apache/spark/streaming/kinesis/KinesisStreamSuite.scala | 4 ++-- 2 files changed, 7 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d93ab93d/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDDSuite.scala -- diff --git a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDDSuite.scala b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDDSuite.scala index b2e2a42..e81fb11 100644 --- a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDDSuite.scala +++ b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDDSuite.scala @@ -17,10 +17,10 @@ package org.apache.spark.streaming.kinesis -import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll} +import org.scalatest.BeforeAndAfterAll import org.apache.spark.storage.{BlockId, BlockManager, StorageLevel, StreamBlockId} -import org.apache.spark.{SparkConf, SparkContext, SparkException, SparkFunSuite} +import org.apache.spark.{SparkConf, SparkContext, SparkException} class KinesisBackedBlockRDDSuite extends KinesisFunSuite with BeforeAndAfterAll { @@ -65,6 +65,9 @@ class KinesisBackedBlockRDDSuite extends KinesisFunSuite with BeforeAndAfterAll } override def afterAll(): Unit = { +if (testUtils != null) { + testUtils.deleteStream() +} if (sc != null) { sc.stop() } http://git-wip-us.apache.org/repos/asf/spark/blob/d93ab93d/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisStreamSuite.scala -- diff --git a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisStreamSuite.scala b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisStreamSuite.scala index 4992b04..f9c952b 100644 --- a/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisStreamSuite.scala +++ b/extras/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisStreamSuite.scala @@ -59,7 +59,7 @@ class KinesisStreamSuite extends KinesisFunSuite } } - ignore("KinesisUtils API") { + test("KinesisUtils API") { ssc = new StreamingContext(sc, Seconds(1)) // Tests the API, does not actually test data receiving val kinesisStream1 = KinesisUtils.createStream(ssc, "mySparkStream", @@ -83,7 +83,7 @@ class KinesisStreamSuite extends KinesisFunSuite * you must have AWS credentials available through the default AWS provider chain, *
and you have to set the system environment variable RUN_KINESIS_TESTS=1 . */ - ignore("basic operation") + testIfEnabled("basic operation") { val kinesisTestUtils = new KinesisTestUtils() try { kinesisTestUtils.createStream()
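`testIfEnabled` (used in the diff above) wraps ScalaTest's `test`/`ignore` pair so that opt-in tests still show up in the report when skipped. A hedged sketch of how such a helper can be written (the trait name is hypothetical; the pattern follows the descriptions in this digest, not the exact Spark source):

```scala
import org.apache.spark.SparkFunSuite

// Sketch: run the test normally when the env var enables it, otherwise
// register it as ignored with the enabling instructions in its name.
trait KinesisTestHelper extends SparkFunSuite {
  def shouldRunTests: Boolean = sys.env.get("ENABLE_KINESIS_TESTS") == Some("1")

  def testIfEnabled(testName: String)(testBody: => Unit): Unit = {
    if (shouldRunTests) {
      test(testName)(testBody)
    } else {
      ignore(s"$testName [enable by setting env var ENABLE_KINESIS_TESTS=1]")(testBody)
    }
  }
}
```

Making the skip visible (rather than silently filtering the test out) is what lets CI logs show exactly why a Kinesis test did not run.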
spark git commit: [SPARK-8882] [STREAMING] Add a new Receiver scheduling mechanism
Repository: spark Updated Branches: refs/heads/master ce89ff477 -> daa1964b6 [SPARK-8882] [STREAMING] Add a new Receiver scheduling mechanism The design doc: https://docs.google.com/document/d/1ZsoRvHjpISPrDmSjsGzuSu8UjwgbtmoCTzmhgTurHJw/edit?usp=sharing Author: zsxwing zsxw...@gmail.com Closes #7276 from zsxwing/receiver-scheduling and squashes the following commits: 137b257 [zsxwing] Add preferredNumExecutors to rescheduleReceiver 61a6c3f [zsxwing] Set state to ReceiverState.INACTIVE in deregisterReceiver 5e1fa48 [zsxwing] Fix the code style 7451498 [zsxwing] Move DummyReceiver back to ReceiverTrackerSuite 715ef9c [zsxwing] Rename: scheduledLocations -> scheduledExecutors; locations -> executors 05daf9c [zsxwing] Use receiverTrackingInfo.toReceiverInfo 1d6d7c8 [zsxwing] Merge branch 'master' into receiver-scheduling 8f93c8d [zsxwing] Use hostPort as the receiver location rather than host; fix comments and unit tests 59f8887 [zsxwing] Schedule all receivers at the same time when launching them 075e0a3 [zsxwing] Add receiver RDD name; use '!isTrackerStarted' instead 276a4ac [zsxwing] Remove ReceiverLauncher and move codes to launchReceivers fab9a01 [zsxwing] Move methods back to the outer class 4e639c4 [zsxwing] Fix unintentional changes f60d021 [zsxwing] Reorganize ReceiverTracker to use an event loop for lock free 105037e [zsxwing] Merge branch 'master' into receiver-scheduling 5fee132 [zsxwing] Update the scheduling algorithm to avoid to keep restarting Receiver 9e242c8 [zsxwing] Remove the ScheduleReceiver message because we can refuse it when receiving RegisterReceiver a9acfbf [zsxwing] Merge branch 'squash-pr-6294' into receiver-scheduling 881edb9 [zsxwing] ReceiverScheduler -> ReceiverSchedulingPolicy e530bcc [zsxwing] [SPARK-5681][Streaming] Use a lock to eliminate the race condition when stopping receivers and registering receivers happen at the same time #6294 3b87e4a [zsxwing] Revert SparkContext.scala a86850c [zsxwing] Remove submitAsyncJob and revert JobWaiter f549595 [zsxwing] Add comments for the scheduling approach 9ecc08e [zsxwing] Fix comments and code style 28d1bee [zsxwing] Make 'host' protected; rescheduleReceiver -> getAllowedLocations 2c86a9e [zsxwing] Use tryFailure to support calling jobFailed multiple times ca6fe35 [zsxwing] Add a test for Receiver.restart 27acd45 [zsxwing] Add unit tests for LoadBalanceReceiverSchedulerImplSuite cc76142 [zsxwing] Add JobWaiter.toFuture to avoid blocking threads d9a3e72 [zsxwing] Add a new Receiver scheduling mechanism Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/daa1964b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/daa1964b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/daa1964b Branch: refs/heads/master Commit: daa1964b6098f79100def78451bda181b5c92198 Parents: ce89ff4 Author: zsxwing zsxw...@gmail.com Authored: Mon Jul 27 17:59:43 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Mon Jul 27 17:59:43 2015 -0700 -- .../streaming/receiver/ReceiverSupervisor.scala | 4 +- .../receiver/ReceiverSupervisorImpl.scala | 6 +- .../streaming/scheduler/ReceiverInfo.scala | 1 - .../scheduler/ReceiverSchedulingPolicy.scala| 171 +++ .../streaming/scheduler/ReceiverTracker.scala | 468 --- .../scheduler/ReceiverTrackingInfo.scala| 55 +++ .../ReceiverSchedulingPolicySuite.scala | 130 ++ .../scheduler/ReceiverTrackerSuite.scala| 66 +-- .../ui/StreamingJobProgressListenerSuite.scala | 6 +- 9 files changed, 674 insertions(+), 233
deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/daa1964b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisor.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisor.scala b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisor.scala index a7c220f..e98017a 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisor.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisor.scala @@ -24,9 +24,9 @@ import scala.collection.mutable.ArrayBuffer import scala.concurrent._ import scala.util.control.NonFatal -import org.apache.spark.{Logging, SparkConf} +import org.apache.spark.{SparkEnv, Logging, SparkConf} import org.apache.spark.storage.StreamBlockId -import org.apache.spark.util.ThreadUtils +import org.apache.spark.util.{Utils, ThreadUtils} /** * Abstract class that is responsible for supervising a Receiver in the worker.
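The body of `ReceiverSchedulingPolicy` is not included in this excerpt. As a rough illustration of the load-balancing idea from the design doc, a toy version might look like this (NOT the real policy, which also honors each receiver's `preferredLocation` and can propose several candidate executors per receiver):

```scala
// Toy sketch: always place the next receiver on the executor that currently
// hosts the fewest receivers, yielding an even spread across the cluster.
def scheduleReceivers(receiverIds: Seq[Int], executors: Seq[String]): Map[Int, String] = {
  require(executors.nonEmpty, "need at least one executor")
  val load = scala.collection.mutable.Map(executors.map(_ -> 0): _*)
  receiverIds.map { id =>
    val (leastLoaded, _) = load.minBy(_._2)  // executor with the fewest receivers so far
    load(leastLoaded) += 1
    id -> leastLoaded
  }.toMap
}
```

Scheduling all receivers in one pass (one of the bullets above) is what makes this balancing possible; scheduling them one at a time, each in isolation, can pile receivers onto whichever executors happen to answer first.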
spark git commit: [SPARK-8975] [STREAMING] Adds a mechanism to send a new rate from the driver to the block generator
Repository: spark Updated Branches: refs/heads/master fe26584a1 -> 798dff7b4 [SPARK-8975] [STREAMING] Adds a mechanism to send a new rate from the driver to the block generator First step for [SPARK-7398](https://issues.apache.org/jira/browse/SPARK-7398). tdas huitseeker Author: Iulian Dragos jagua...@gmail.com Author: François Garillot franc...@garillot.net Closes #7471 from dragos/topic/streaming-bp/dynamic-rate and squashes the following commits: 8941cf9 [Iulian Dragos] Renames and other nitpicks. 162d9e5 [Iulian Dragos] Use Reflection for accessing truly private `executor` method and use the listener bus to know when receivers have registered (`onStart` is called before receivers have registered, leading to flaky behavior). 210f495 [Iulian Dragos] Revert "Added a few tests that measure the receiver's rate." 0c51959 [Iulian Dragos] Added a few tests that measure the receiver's rate. 261a051 [Iulian Dragos] - removed field to hold the current rate limit in rate limiter - made rate limit a Long and default to Long.MaxValue (consequence of the above) - removed custom `waitUntil` and replaced it by `eventually` cd1397d [Iulian Dragos] Add a test for the propagation of a new rate limit from driver to receivers. 6369b30 [Iulian Dragos] Merge pull request #15 from huitseeker/SPARK-8975 d15de42 [François Garillot] [SPARK-8975][Streaming] Adds Ratelimiter unit tests w.r.t. spark.streaming.receiver.maxRate 4721c7d [François Garillot] [SPARK-8975][Streaming] Add a mechanism to send a new rate from the driver to the block generator Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/798dff7b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/798dff7b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/798dff7b Branch: refs/heads/master Commit: 798dff7b4baa952c609725b852bcb6a9c9e5a317 Parents: fe26584 Author: Iulian Dragos jagua...@gmail.com Authored: Wed Jul 22 15:54:08 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Wed Jul 22 15:54:08 2015 -0700 -- .../spark/streaming/receiver/RateLimiter.scala | 30 -- .../spark/streaming/receiver/Receiver.scala | 2 +- .../streaming/receiver/ReceiverMessage.scala| 3 +- .../streaming/receiver/ReceiverSupervisor.scala | 3 + .../receiver/ReceiverSupervisorImpl.scala | 6 ++ .../streaming/scheduler/ReceiverTracker.scala | 9 ++- .../streaming/receiver/RateLimiterSuite.scala | 46 +++ .../scheduler/ReceiverTrackerSuite.scala| 62 8 files changed, 153 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/798dff7b/streaming/src/main/scala/org/apache/spark/streaming/receiver/RateLimiter.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receiver/RateLimiter.scala b/streaming/src/main/scala/org/apache/spark/streaming/receiver/RateLimiter.scala index 8df542b..f663def 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/receiver/RateLimiter.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/receiver/RateLimiter.scala @@ -34,12 +34,32 @@ import org.apache.spark.{Logging, SparkConf} */ private[receiver] abstract class RateLimiter(conf: SparkConf) extends Logging { - private val desiredRate = conf.getInt("spark.streaming.receiver.maxRate", 0) - private lazy val rateLimiter = GuavaRateLimiter.create(desiredRate) + // treated as an upper limit + private val maxRateLimit = conf.getLong("spark.streaming.receiver.maxRate", Long.MaxValue) + private lazy val rateLimiter = GuavaRateLimiter.create(maxRateLimit.toDouble)
def waitToPush() { -if (desiredRate > 0) { - rateLimiter.acquire() -} +rateLimiter.acquire() } + + /** + * Return the current rate limit. If no limit has been set so far, it returns {{{Long.MaxValue}}}. + */ + def getCurrentLimit: Long = +rateLimiter.getRate.toLong + + /** + * Set the rate limit to `newRate`. The new rate will not exceed the maximum rate configured by + * {{{spark.streaming.receiver.maxRate}}}, even if `newRate` is higher than that. + * + * @param newRate A new rate in events per second. It has no effect if it's 0 or negative. + */ + private[receiver] def updateRate(newRate: Long): Unit = +if (newRate > 0) { + if (maxRateLimit > 0) { +rateLimiter.setRate(newRate.min(maxRateLimit)) + } else { +rateLimiter.setRate(newRate) + } +} } http://git-wip-us.apache.org/repos/asf/spark/blob/798dff7b/streaming/src/main/scala/org/apache/spark/streaming/receiver/Receiver.scala -- diff --git a/streaming/src/main/scala/org/apache/spark
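To make the clamping semantics of `updateRate` above concrete, here is a standalone restatement of just the clamping rule with a couple of worked cases (a sketch; the object name is illustrative and the rule is restated outside the RateLimiter class so it can run on its own):

```scala
object RateClampDemo extends App {
  // The rule from updateRate above: a positive newRate is applied,
  // but never above the configured maximum (maxRateLimit), if one is set.
  def clampedRate(newRate: Long, maxRateLimit: Long): Long =
    if (maxRateLimit > 0) newRate.min(maxRateLimit) else newRate

  assert(clampedRate(50L, 100L) == 50L)    // below the cap: taken as-is
  assert(clampedRate(500L, 100L) == 100L)  // above the cap: clamped to maxRate
  println("clamping rule holds")
}
```

So with `spark.streaming.receiver.maxRate` at 100, a driver-pushed rate of 500 events/sec is silently reduced to 100, while non-positive updates are ignored entirely.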
spark git commit: Disable flaky test: ReceiverSuite block generator throttling.
Repository: spark Updated Branches: refs/heads/branch-1.3 016332535 -> 596a4cb8c Disable flaky test: ReceiverSuite "block generator throttling". Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/596a4cb8 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/596a4cb8 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/596a4cb8 Branch: refs/heads/branch-1.3 Commit: 596a4cb8cca9ca44db93f4bc3d46768b62ad067c Parents: 0163325 Author: Reynold Xin r...@databricks.com Authored: Wed Apr 22 21:24:22 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Mon Jul 20 11:02:00 2015 -0700 -- .../src/test/scala/org/apache/spark/streaming/ReceiverSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/596a4cb8/streaming/src/test/scala/org/apache/spark/streaming/ReceiverSuite.scala -- diff --git a/streaming/src/test/scala/org/apache/spark/streaming/ReceiverSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/ReceiverSuite.scala index 5b37de1..fda0259 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/ReceiverSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/ReceiverSuite.scala @@ -155,7 +155,7 @@ class ReceiverSuite extends TestSuiteBase with Timeouts with Serializable { assert(recordedData.toSet === generatedData.toSet) } - test("block generator throttling") { + ignore("block generator throttling") { val blockGeneratorListener = new FakeBlockGeneratorListener val blockInterval = 100 val maxRate = 1001
spark git commit: [SPARK-9030] [STREAMING] Add Kinesis.createStream unit tests that actually send data
Repository: spark Updated Branches: refs/heads/master bd903ee89 -> b13ef7723 [SPARK-9030] [STREAMING] Add Kinesis.createStream unit tests that actually send data Current Kinesis unit tests do not test createStream by sending data. This PR adds such a unit test. Note that this unit test will not run by default. It will only run when the relevant environment variables are set. Author: Tathagata Das tathagata.das1...@gmail.com Closes #7413 from tdas/kinesis-tests and squashes the following commits: 0e16db5 [Tathagata Das] Added more comments regarding testOrIgnore 1ea5ce0 [Tathagata Das] Added more comments c7caef7 [Tathagata Das] Address comments a297b59 [Tathagata Das] Reverted unnecessary change in KafkaStreamSuite 90c9bde [Tathagata Das] Removed scalatest.FunSuite deb7f4f [Tathagata Das] Removed scalatest.FunSuite 18c2208 [Tathagata Das] Changed how SparkFunSuite is inherited dbb33a5 [Tathagata Das] Added license 88f6dab [Tathagata Das] Added scala docs c6be0d7 [Tathagata Das] minor changes 24a992b [Tathagata Das] Moved KinesisTestUtils to src instead of test for future python usage 465b55d [Tathagata Das] Made unit tests optional in a nice way 4d70703 [Tathagata Das] Added license 129d436 [Tathagata Das] Minor updates cc36510 [Tathagata Das] Added KinesisStreamSuite Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b13ef772 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b13ef772 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b13ef772 Branch: refs/heads/master Commit: b13ef7723f254c10c685b93eb8dc08a52527ec73 Parents: bd903ee Author: Tathagata Das tathagata.das1...@gmail.com Authored: Fri Jul 17 16:43:18 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Fri Jul 17 16:43:18 2015 -0700 -- .../streaming/kinesis/KinesisTestUtils.scala| 197 +++ .../streaming/kinesis/KinesisFunSuite.scala | 37 .../kinesis/KinesisReceiverSuite.scala | 17 -- .../streaming/kinesis/KinesisStreamSuite.scala | 120 +++ 4 files changed, 354 insertions(+), 17 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b13ef772/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisTestUtils.scala -- diff --git a/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisTestUtils.scala b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisTestUtils.scala new file mode 100644 index 000..f6bf552 --- /dev/null +++ b/extras/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisTestUtils.scala @@ -0,0 +1,197 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +package org.apache.spark.streaming.kinesis + +import java.nio.ByteBuffer +import java.util.concurrent.TimeUnit + +import scala.collection.mutable +import scala.collection.mutable.ArrayBuffer +import scala.util.{Failure, Random, Success, Try} + +import com.amazonaws.auth.{AWSCredentials, DefaultAWSCredentialsProviderChain} +import com.amazonaws.regions.RegionUtils +import com.amazonaws.services.dynamodbv2.AmazonDynamoDBClient +import com.amazonaws.services.dynamodbv2.document.DynamoDB +import com.amazonaws.services.kinesis.AmazonKinesisClient +import com.amazonaws.services.kinesis.model._ + +import org.apache.spark.Logging + +/** + * Shared utility methods for performing Kinesis tests that actually transfer data + */ +private class KinesisTestUtils( +val endpointUrl: String = "https://kinesis.us-west-2.amazonaws.com", +_regionName: String = "") extends Logging { + + val regionName = if (_regionName.length == 0) { +RegionUtils.getRegionByEndpoint(endpointUrl).getName() + } else { +RegionUtils.getRegion(_regionName).getName() + } + + val streamShardCount = 2 + + private val createStreamTimeoutSeconds = 300 + private val describeStreamPollTimeSeconds = 1 + + @volatile + private var streamCreated = false + private var _streamName: String
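A typical test flow against such a utility looks roughly like this (illustrative only: `createStream`/`pushData`/`deleteStream` follow the method names visible in the diffs of this digest, and the return shape of `pushData` is assumed from the `shardIdToSeqNumbers.toMap` seen earlier):

```scala
// Illustrative flow for an opt-in Kinesis test using KinesisTestUtils.
val testUtils = new KinesisTestUtils()
try {
  testUtils.createStream()                              // creates a real AWS Kinesis stream
  val shardIdToSeqNumbers = testUtils.pushData(1 to 10) // send records; note shard/seq numbers
  // ... run the streaming job under test and assert on what it received ...
} finally {
  testUtils.deleteStream()                              // always release the AWS resource
}
```

The try/finally is the important part: because these tests touch a billable external service, cleanup must happen even when an assertion fails — which is exactly the gap SPARK-9335 later closed for the suites that forgot it.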
spark git commit: [SPARK-5681] [STREAMING] Move 'stopReceivers' to the event loop to resolve the race condition
Repository: spark Updated Branches: refs/heads/master 074085d67 -> ad0954f6d [SPARK-5681] [STREAMING] Move 'stopReceivers' to the event loop to resolve the race condition This is an alternative way to fix `SPARK-5681`. It minimizes the changes. Closes #4467 Author: zsxwing zsxw...@gmail.com Author: Liang-Chi Hsieh vii...@gmail.com Closes #6294 from zsxwing/pr4467 and squashes the following commits: 709ac1f [zsxwing] Fix the comment e103e8a [zsxwing] Move ReceiverTracker.stop into ReceiverTracker.stop f637142 [zsxwing] Address minor code style comments a178d37 [zsxwing] Move 'stopReceivers' to the event loop to resolve the race condition 51fb07e [zsxwing] Fix the code style 3cb19a3 [zsxwing] Merge branch 'master' into pr4467 b4c29e7 [zsxwing] Stop receiver only if we start it c41ee94 [zsxwing] Make stopReceivers private 7c73c1f [zsxwing] Use trackerStateLock to protect trackerState a8120c0 [zsxwing] Merge branch 'master' into pr4467 7b1d9af [zsxwing] case Throwable => case NonFatal 15ed4a1 [zsxwing] Register before starting the receiver fff63f9 [zsxwing] Use a lock to eliminate the race condition when stopping receivers and registering receivers happen at the same time. e0ef72a [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into tracker_status_timeout 19b76d9 [Liang-Chi Hsieh] Remove timeout. 34c18dc [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into tracker_status_timeout c419677 [Liang-Chi Hsieh] Fix style. 9e1a760 [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into tracker_status_timeout 355f9ce [Liang-Chi Hsieh] Separate register and start events for receivers. 3d568e8 [Liang-Chi Hsieh] Let receivers get registered first before going started. ae0d9fd [Liang-Chi Hsieh] Merge branch 'master' into tracker_status_timeout 77983f3 [Liang-Chi Hsieh] Add tracker status and stop to receive messages when stopping tracker.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ad0954f6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ad0954f6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ad0954f6 Branch: refs/heads/master Commit: ad0954f6de29761e0e7e543212c5bfe1fdcbed9f Parents: 074085d Author: zsxwing zsxw...@gmail.com Authored: Fri Jul 17 14:00:31 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Fri Jul 17 14:00:31 2015 -0700 -- .../streaming/receiver/ReceiverSupervisor.scala | 42 -- .../receiver/ReceiverSupervisorImpl.scala | 2 +- .../streaming/scheduler/ReceiverTracker.scala | 139 +-- .../apache/spark/streaming/ReceiverSuite.scala | 2 + .../spark/streaming/StreamingContextSuite.scala | 15 ++ 5 files changed, 138 insertions(+), 62 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ad0954f6/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisor.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisor.scala b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisor.scala index eeb14ca..6467029 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisor.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisor.scala @@ -22,6 +22,7 @@ import java.util.concurrent.CountDownLatch import scala.collection.mutable.ArrayBuffer import scala.concurrent._ +import scala.util.control.NonFatal import org.apache.spark.{Logging, SparkConf} import org.apache.spark.storage.StreamBlockId @@ -36,7 +37,7 @@ private[streaming] abstract class ReceiverSupervisor( conf: SparkConf ) extends Logging { - /** Enumeration to identify current state of the StreamingContext */ + /** Enumeration to identify current state of the Receiver */ object ReceiverState extends Enumeration { type CheckpointState = Value val Initialized, Started, Stopped = Value } @@ -97,8 +98,8 @@ private[streaming] abstract class ReceiverSupervisor( /** Called when supervisor is stopped */ protected def onStop(message: String, error: Option[Throwable]) { } - /** Called when receiver is started */ - protected def onReceiverStart() { } + /** Called when receiver is started. Return true if the driver accepts us */ + protected def onReceiverStart(): Boolean /** Called when receiver is stopped */ protected def onReceiverStop(message: String, error: Option[Throwable]) { } @@ -121,13 +122,17 @@ private[streaming] abstract class ReceiverSupervisor( /** Start receiver */ def startReceiver(): Unit = synchronized { try { - logInfo("Starting receiver") - receiver.onStart() - logInfo("Called receiver onStart") -
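The truncated hunk above is reordering `startReceiver` so the receiver registers with the driver before `onStart` runs. A simplified sketch of the resulting control flow (an approximation of the diff, inside the supervisor class, not the exact source — `onReceiverStart`, `receiver`, `stop`, and `logInfo` are the members visible above):

```scala
import scala.util.control.NonFatal

// Sketch: register first; only start the user receiver if the driver accepts us.
def startReceiver(): Unit = synchronized {
  try {
    if (onReceiverStart()) {   // true iff the driver accepted the registration
      logInfo("Starting receiver")
      receiver.onStart()
      logInfo("Called receiver onStart")
    } else {
      // The driver refused us (e.g. the tracker is stopping): shut down quietly
      // rather than run a receiver the driver does not know about.
      stop("Registered unsuccessfully because Driver refused to start receiver", None)
    }
  } catch {
    case NonFatal(t) =>
      stop("Error starting receiver", Some(t))
  }
}
```

Registering before starting is what eliminates the race: the tracker can reject a registration while it is stopping, so a receiver can never slip into a started state after shutdown has begun.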
spark git commit: [SPARK-6304] [STREAMING] Fix checkpointing doesn't retain driver port issue.
Repository: spark Updated Branches: refs/heads/master fec10f0c6 -> 031d7d414 [SPARK-6304] [STREAMING] Fix checkpointing doesn't retain driver port issue. Author: jerryshao saisai.s...@intel.com Author: Saisai Shao saisai.s...@intel.com Closes #5060 from jerryshao/SPARK-6304 and squashes the following commits: 89b01f5 [jerryshao] Update the unit test to add more cases 275d252 [jerryshao] Address the comments 7cc146d [jerryshao] Address the comments 2624723 [jerryshao] Fix rebase conflict 45befaa [Saisai Shao] Update the unit test bbc1c9c [Saisai Shao] Fix checkpointing doesn't retain driver port issue Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/031d7d41 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/031d7d41 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/031d7d41 Branch: refs/heads/master Commit: 031d7d41430ec1f3c3353e33eab4821a9bcd58a5 Parents: fec10f0 Author: jerryshao saisai.s...@intel.com Authored: Thu Jul 16 16:55:46 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Thu Jul 16 16:55:46 2015 -0700 -- .../org/apache/spark/streaming/Checkpoint.scala | 2 + .../spark/streaming/CheckpointSuite.scala | 45 +++- 2 files changed, 46 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/031d7d41/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala b/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala index 5279331..65d4e93 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala @@ -48,6 +48,8 @@ class Checkpoint(@transient ssc: StreamingContext, val checkpointTime: Time) // Reload properties for the checkpoint application since user wants to set a reload property // or spark had changed its value and user wants to set it back. val propertiesToReload = List( + "spark.driver.host", + "spark.driver.port", "spark.master", "spark.yarn.keytab", "spark.yarn.principal") http://git-wip-us.apache.org/repos/asf/spark/blob/031d7d41/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala -- diff --git a/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala index 6a94928..d308ac0 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala @@ -191,8 +191,51 @@ class CheckpointSuite extends TestSuiteBase { } } + // This tests that if spark.driver.host and spark.driver.port are set by the user, they can be + // recovered with the correct values.
+ test("get correct spark.driver.[host|port] from checkpoint") { +val conf = Map("spark.driver.host" -> "localhost", "spark.driver.port" -> ) +conf.foreach(kv => System.setProperty(kv._1, kv._2)) +ssc = new StreamingContext(master, framework, batchDuration) +val originalConf = ssc.conf +assert(originalConf.get("spark.driver.host") === "localhost") +assert(originalConf.get("spark.driver.port") === ) + +val cp = new Checkpoint(ssc, Time(1000)) +ssc.stop() + +// Serialize/deserialize to simulate write to storage and reading it back +val newCp = Utils.deserialize[Checkpoint](Utils.serialize(cp)) + +val newCpConf = newCp.createSparkConf() +assert(newCpConf.contains("spark.driver.host")) +assert(newCpConf.contains("spark.driver.port")) +assert(newCpConf.get("spark.driver.host") === "localhost") +assert(newCpConf.get("spark.driver.port") === ) + +// Check if all the parameters have been restored +ssc = new StreamingContext(null, newCp, null) +val restoredConf = ssc.conf +assert(restoredConf.get("spark.driver.host") === "localhost") +assert(restoredConf.get("spark.driver.port") === ) +ssc.stop() + +// If spark.driver.host and spark.driver.port are not set in system properties, these two +// parameters should not be present in the newly recovered conf. +conf.foreach(kv => System.clearProperty(kv._1)) +val newCpConf1 = newCp.createSparkConf() +assert(!newCpConf1.contains("spark.driver.host")) +assert(!newCpConf1.contains("spark.driver.port")) + +// Spark itself will dispatch a random, unused port for spark.driver.port if it is not set +// explicitly. +ssc = new StreamingContext(null, newCp, null) +val restoredConf1 = ssc.conf +assert(restoredConf1.get("spark.driver.host") ===
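For context, the reload mechanism these assertions exercise works roughly like this (a sketch, not the exact `Checkpoint.createSparkConf` source; `newSparkConf` stands for the conf being rebuilt from the checkpointed key/value pairs): for each key in `propertiesToReload`, the value from the restarted driver's JVM wins over the checkpointed one.

```scala
// Sketch of the reload step implied by the diff above: checkpointed values
// for these keys are overridden by whatever the resurrected driver's JVM
// specifies, and dropped entirely when nothing is specified.
propertiesToReload.foreach { prop =>
  sys.props.get(prop) match {
    case Some(value) => newSparkConf.set(prop, value) // user re-set it: take the fresh value
    case None        => newSparkConf.remove(prop)     // stale driver host/port must not be reused
  }
}
```

This is why the test's second half clears the system properties and then asserts the keys are absent: a driver restarted on a different host must not try to bind the old host and port baked into the checkpoint.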