spark git commit: [SPARK-24127][SS] Continuous text socket source

gurwls223 Fri, 10 Aug 2018 00:54:00 -0700

Repository: spark
Updated Branches:
  refs/heads/master ab1029fb8 -> 9abe09bfc



[SPARK-24127][SS] Continuous text socket source

## What changes were proposed in this pull request?

Support for text socket stream in spark structured streaming "continuous" mode. 
This is roughly based on the idea of ContinuousMemoryStream where the executor 
queries the data from driver over an RPC endpoint.

This makes it possible to create Structured streaming continuous pipeline to 
ingest data via "nc" and run examples.

## How was this patch tested?

Unit test and ran spark examples in structured streaming continuous mode.

Please review http://spark.apache.org/contributing.html before opening a pull 
request.

Closes #21199 from arunmahadevan/SPARK-24127.

Authored-by: Arun Mahadevan <ar...@apache.org>
Signed-off-by: hyukjinkwon <gurwls...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9abe09bf
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9abe09bf
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9abe09bf

Branch: refs/heads/master
Commit: 9abe09bfc18580233acad676d1241684c7d8768d
Parents: ab1029f
Author: Arun Mahadevan <ar...@apache.org>
Authored: Fri Aug 10 15:53:31 2018 +0800
Committer: hyukjinkwon <gurwls...@apache.org>
Committed: Fri Aug 10 15:53:31 2018 +0800

----------------------------------------------------------------------
 .../streaming/ContinuousRecordEndpoint.scala    |  69 +++++
 .../continuous/ContinuousTextSocketSource.scala | 292 +++++++++++++++++++
 .../sources/ContinuousMemoryStream.scala        |  32 +-
 .../execution/streaming/sources/socket.scala    |  25 +-
 .../sources/TextSocketStreamSuite.scala         |  98 ++++++-
 5 files changed, 482 insertions(+), 34 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/9abe09bf/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ContinuousRecordEndpoint.scala
----------------------------------------------------------------------
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ContinuousRecordEndpoint.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ContinuousRecordEndpoint.scala
new file mode 100644
index 0000000..c9c2ebc
--- /dev/null
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ContinuousRecordEndpoint.scala
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.sql.execution.streaming
+
+import org.apache.spark.SparkEnv
+import org.apache.spark.rpc.{RpcCallContext, RpcEnv, ThreadSafeRpcEndpoint}
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.sources.v2.reader.streaming.PartitionOffset
+
+case class ContinuousRecordPartitionOffset(partitionId: Int, offset: Int) 
extends PartitionOffset
+case class GetRecord(offset: ContinuousRecordPartitionOffset)
+
+/**
+ * A RPC end point for continuous readers to poll for
+ * records from the driver.
+ *
+ * @param buckets the data buckets. Each bucket contains a sequence of items 
to be
+ *                returned for a partition. The number of buckets should be 
equal to
+ *                to the number of partitions.
+ * @param lock a lock object for locking the buckets for read
+ */
+class ContinuousRecordEndpoint(buckets: Seq[Seq[Any]], lock: Object)
+  extends ThreadSafeRpcEndpoint {
+
+  private var startOffsets: Seq[Int] = List.fill(buckets.size)(0)
+
+  /**
+   * Sets the start offset.
+   *
+   * @param offsets the base offset per partition to be used
+   *                while retrieving the data in {#receiveAndReply}.
+   */
+  def setStartOffsets(offsets: Seq[Int]): Unit = {
+    lock.synchronized {
+      startOffsets = offsets
+    }
+  }
+
+  override val rpcEnv: RpcEnv = SparkEnv.get.rpcEnv
+
+  /**
+   * Process messages from `RpcEndpointRef.ask`. If receiving a unmatched 
message,
+   * `SparkException` will be thrown and sent to `onError`.
+   */
+  override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, 
Unit] = {
+    case GetRecord(ContinuousRecordPartitionOffset(partitionId, offset)) =>
+      lock.synchronized {
+        val bufOffset = offset - startOffsets(partitionId)
+        val buf = buckets(partitionId)
+        val record = if (buf.size <= bufOffset) None else Some(buf(bufOffset))
+
+        context.reply(record.map(InternalRow(_)))
+      }
+  }
+}

http://git-wip-us.apache.org/repos/asf/spark/blob/9abe09bf/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousTextSocketSource.scala
----------------------------------------------------------------------
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousTextSocketSource.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousTextSocketSource.scala
new file mode 100644
index 0000000..1dbdfd5
--- /dev/null
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousTextSocketSource.scala
@@ -0,0 +1,292 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.streaming.continuous
+
+import java.io.{BufferedReader, InputStreamReader, IOException}
+import java.net.Socket
+import java.sql.Timestamp
+import java.util.{Calendar, List => JList}
+import javax.annotation.concurrent.GuardedBy
+
+import scala.collection.JavaConverters._
+import scala.collection.mutable.ListBuffer
+
+import org.json4s.{DefaultFormats, NoTypeHints}
+import org.json4s.jackson.Serialization
+
+import org.apache.spark.SparkEnv
+import org.apache.spark.internal.Logging
+import org.apache.spark.rpc.RpcEndpointRef
+import org.apache.spark.sql._
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.execution.streaming.{ContinuousRecordEndpoint, 
ContinuousRecordPartitionOffset, GetRecord}
+import org.apache.spark.sql.execution.streaming.sources.TextSocketReader
+import org.apache.spark.sql.sources.v2.DataSourceOptions
+import org.apache.spark.sql.sources.v2.reader.{InputPartition, 
InputPartitionReader, SupportsDeprecatedScanRow}
+import 
org.apache.spark.sql.sources.v2.reader.streaming.{ContinuousInputPartitionReader,
 ContinuousReader, Offset, PartitionOffset}
+import org.apache.spark.sql.types.{StringType, StructField, StructType, 
TimestampType}
+import org.apache.spark.util.RpcUtils
+
+
+/**
+ * A ContinuousReader that reads text lines through a TCP socket, designed 
only for tutorials and
+ * debugging. This ContinuousReader will *not* work in production applications 
due to multiple
+ * reasons, including no support for fault recovery.
+ *
+ * The driver maintains a socket connection to the host-port, keeps the 
received messages in
+ * buckets and serves the messages to the executors via a RPC endpoint.
+ */
+class TextSocketContinuousReader(options: DataSourceOptions) extends 
ContinuousReader with Logging {
+  implicit val defaultFormats: DefaultFormats = DefaultFormats
+
+  private val host: String = options.get("host").get()
+  private val port: Int = options.get("port").get().toInt
+
+  assert(SparkSession.getActiveSession.isDefined)
+  private val spark = SparkSession.getActiveSession.get
+  private val numPartitions = spark.sparkContext.defaultParallelism
+
+  @GuardedBy("this")
+  private var socket: Socket = _
+
+  @GuardedBy("this")
+  private var readThread: Thread = _
+
+  @GuardedBy("this")
+  private val buckets = Seq.fill(numPartitions)(new ListBuffer[(String, 
Timestamp)])
+
+  @GuardedBy("this")
+  private var currentOffset: Int = -1
+
+  private var startOffset: TextSocketOffset = _
+
+  private val recordEndpoint = new ContinuousRecordEndpoint(buckets, this)
+  @volatile private var endpointRef: RpcEndpointRef = _
+
+  initialize()
+
+  override def mergeOffsets(offsets: Array[PartitionOffset]): Offset = {
+    assert(offsets.length == numPartitions)
+    val offs = offsets
+      .map(_.asInstanceOf[ContinuousRecordPartitionOffset])
+      .sortBy(_.partitionId)
+      .map(_.offset)
+      .toList
+    TextSocketOffset(offs)
+  }
+
+  override def deserializeOffset(json: String): Offset = {
+    TextSocketOffset(Serialization.read[List[Int]](json))
+  }
+
+  override def setStartOffset(offset: java.util.Optional[Offset]): Unit = {
+    this.startOffset = offset
+      .orElse(TextSocketOffset(List.fill(numPartitions)(0)))
+      .asInstanceOf[TextSocketOffset]
+    recordEndpoint.setStartOffsets(startOffset.offsets)
+  }
+
+  override def getStartOffset: Offset = startOffset
+
+  override def readSchema(): StructType = {
+    if (includeTimestamp) {
+      TextSocketReader.SCHEMA_TIMESTAMP
+    } else {
+      TextSocketReader.SCHEMA_REGULAR
+    }
+  }
+
+  override def planInputPartitions(): JList[InputPartition[InternalRow]] = {
+
+    val endpointName = 
s"TextSocketContinuousReaderEndpoint-${java.util.UUID.randomUUID()}"
+    endpointRef = recordEndpoint.rpcEnv.setupEndpoint(endpointName, 
recordEndpoint)
+
+    val offsets = startOffset match {
+      case off: TextSocketOffset => off.offsets
+      case off =>
+        throw new IllegalArgumentException(
+          s"invalid offset type ${off.getClass} for 
TextSocketContinuousReader")
+    }
+
+    if (offsets.size != numPartitions) {
+      throw new IllegalArgumentException(
+        s"The previous run contained ${offsets.size} partitions, but" +
+          s" $numPartitions partitions are currently configured. The 
numPartitions option" +
+          " cannot be changed.")
+    }
+
+    startOffset.offsets.zipWithIndex.map {
+      case (offset, i) =>
+        TextSocketContinuousInputPartition(
+          endpointName, i, offset, includeTimestamp): 
InputPartition[InternalRow]
+    }.asJava
+
+  }
+
+  override def commit(end: Offset): Unit = synchronized {
+    val endOffset = end match {
+      case off: TextSocketOffset => off
+      case _ => throw new 
IllegalArgumentException(s"TextSocketContinuousReader.commit()" +
+        s"received an offset ($end) that did not originate with an instance of 
this class")
+    }
+
+    endOffset.offsets.zipWithIndex.foreach {
+      case (offset, partition) =>
+        val max = startOffset.offsets(partition) + buckets(partition).size
+        if (offset > max) {
+          throw new IllegalStateException("Invalid offset " + offset + " to 
commit" +
+          " for partition " + partition + ". Max valid offset: " + max)
+        }
+        val n = offset - startOffset.offsets(partition)
+        buckets(partition).trimStart(n)
+    }
+    startOffset = endOffset
+    recordEndpoint.setStartOffsets(startOffset.offsets)
+  }
+
+  /** Stop this source. */
+  override def stop(): Unit = synchronized {
+    if (socket != null) {
+      try {
+        // Unfortunately, BufferedReader.readLine() cannot be interrupted, so 
the only way to
+        // stop the readThread is to close the socket.
+        socket.close()
+      } catch {
+        case e: IOException =>
+      }
+      socket = null
+    }
+    if (endpointRef != null) recordEndpoint.rpcEnv.stop(endpointRef)
+  }
+
+  private def initialize(): Unit = synchronized {
+    socket = new Socket(host, port)
+    val reader = new BufferedReader(new 
InputStreamReader(socket.getInputStream))
+    // Thread continuously reads from a socket and inserts data into buckets
+    readThread = new Thread(s"TextSocketContinuousReader($host, $port)") {
+      setDaemon(true)
+
+      override def run(): Unit = {
+        try {
+          while (true) {
+            val line = reader.readLine()
+            if (line == null) {
+              // End of file reached
+              logWarning(s"Stream closed by $host:$port")
+              return
+            }
+            TextSocketContinuousReader.this.synchronized {
+              currentOffset += 1
+              val newData = (line,
+                Timestamp.valueOf(
+                  
TextSocketReader.DATE_FORMAT.format(Calendar.getInstance().getTime()))
+              )
+              buckets(currentOffset % numPartitions) += newData
+            }
+          }
+        } catch {
+          case e: IOException =>
+        }
+      }
+    }
+
+    readThread.start()
+  }
+
+  override def toString: String = s"TextSocketContinuousReader[host: $host, 
port: $port]"
+
+  private def includeTimestamp: Boolean = 
options.getBoolean("includeTimestamp", false)
+
+}
+
+/**
+ * Continuous text socket input partition.
+ */
+case class TextSocketContinuousInputPartition(
+    driverEndpointName: String,
+    partitionId: Int,
+    startOffset: Int,
+    includeTimestamp: Boolean)
+extends InputPartition[InternalRow] {
+
+  override def createPartitionReader(): InputPartitionReader[InternalRow] =
+    new TextSocketContinuousInputPartitionReader(driverEndpointName, 
partitionId, startOffset,
+      includeTimestamp)
+}
+
+/**
+ * Continuous text socket input partition reader.
+ *
+ * Polls the driver endpoint for new records.
+ */
+class TextSocketContinuousInputPartitionReader(
+    driverEndpointName: String,
+    partitionId: Int,
+    startOffset: Int,
+    includeTimestamp: Boolean)
+  extends ContinuousInputPartitionReader[InternalRow] {
+
+  private val endpoint = RpcUtils.makeDriverRef(
+    driverEndpointName,
+    SparkEnv.get.conf,
+    SparkEnv.get.rpcEnv)
+
+  private var currentOffset = startOffset
+  private var current: Option[InternalRow] = None
+
+  override def next(): Boolean = {
+    try {
+      current = getRecord
+      while (current.isEmpty) {
+        Thread.sleep(100)
+        current = getRecord
+      }
+      currentOffset += 1
+    } catch {
+      case _: InterruptedException =>
+        // Someone's trying to end the task; just let them.
+        return false
+    }
+    true
+  }
+
+  override def get(): InternalRow = {
+    current.get
+  }
+
+  override def close(): Unit = {}
+
+  override def getOffset: PartitionOffset =
+    ContinuousRecordPartitionOffset(partitionId, currentOffset)
+
+  private def getRecord: Option[InternalRow] =
+    endpoint.askSync[Option[InternalRow]](GetRecord(
+      ContinuousRecordPartitionOffset(partitionId, currentOffset))).map(rec =>
+      if (includeTimestamp) {
+        rec
+      } else {
+        InternalRow(rec.get(0, TextSocketReader.SCHEMA_TIMESTAMP)
+          .asInstanceOf[(String, Timestamp)]._1)
+      }
+    )
+}
+
+case class TextSocketOffset(offsets: List[Int]) extends Offset {
+  private implicit val formats = Serialization.formats(NoTypeHints)
+  override def json: String = Serialization.write(offsets)
+}

http://git-wip-us.apache.org/repos/asf/spark/blob/9abe09bf/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ContinuousMemoryStream.scala
----------------------------------------------------------------------
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ContinuousMemoryStream.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ContinuousMemoryStream.scala
index 711f094..4a32217 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ContinuousMemoryStream.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ContinuousMemoryStream.scala
@@ -33,7 +33,6 @@ import org.apache.spark.rpc.{RpcCallContext, RpcEndpointRef, 
RpcEnv, ThreadSafeR
 import org.apache.spark.sql.{Encoder, SQLContext}
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.execution.streaming._
-import 
org.apache.spark.sql.execution.streaming.sources.ContinuousMemoryStream.GetRecord
 import org.apache.spark.sql.sources.v2.{ContinuousReadSupport, 
DataSourceOptions}
 import org.apache.spark.sql.sources.v2.reader.InputPartition
 import 
org.apache.spark.sql.sources.v2.reader.streaming.{ContinuousInputPartitionReader,
 ContinuousReader, Offset, PartitionOffset}
@@ -63,7 +62,7 @@ class ContinuousMemoryStream[A : Encoder](id: Int, 
sqlContext: SQLContext, numPa
   @GuardedBy("this")
   private var startOffset: ContinuousMemoryStreamOffset = _
 
-  private val recordEndpoint = new RecordEndpoint()
+  private val recordEndpoint = new ContinuousRecordEndpoint(records, this)
   @volatile private var endpointRef: RpcEndpointRef = _
 
   def addData(data: TraversableOnce[A]): Offset = synchronized {
@@ -94,7 +93,7 @@ class ContinuousMemoryStream[A : Encoder](id: Int, 
sqlContext: SQLContext, numPa
   override def mergeOffsets(offsets: Array[PartitionOffset]): 
ContinuousMemoryStreamOffset = {
     ContinuousMemoryStreamOffset(
       offsets.map {
-        case ContinuousMemoryStreamPartitionOffset(part, num) => (part, num)
+        case ContinuousRecordPartitionOffset(part, num) => (part, num)
       }.toMap
     )
   }
@@ -127,27 +126,9 @@ class ContinuousMemoryStream[A : Encoder](id: Int, 
sqlContext: SQLContext, numPa
       options: DataSourceOptions): ContinuousReader = {
     this
   }
-
-  /**
-   * Endpoint for executors to poll for records.
-   */
-  private class RecordEndpoint extends ThreadSafeRpcEndpoint {
-    override val rpcEnv: RpcEnv = SparkEnv.get.rpcEnv
-
-    override def receiveAndReply(context: RpcCallContext): 
PartialFunction[Any, Unit] = {
-      case GetRecord(ContinuousMemoryStreamPartitionOffset(part, index)) =>
-        ContinuousMemoryStream.this.synchronized {
-          val buf = records(part)
-          val record = if (buf.size <= index) None else Some(buf(index))
-
-          context.reply(record.map(r => encoder.toRow(r).copy()))
-        }
-    }
-  }
 }
 
 object ContinuousMemoryStream {
-  case class GetRecord(offset: ContinuousMemoryStreamPartitionOffset)
   protected val memoryStreamId = new AtomicInteger(0)
 
   def apply[A : Encoder](implicit sqlContext: SQLContext): 
ContinuousMemoryStream[A] =
@@ -207,12 +188,12 @@ class ContinuousMemoryStreamInputPartitionReader(
 
   override def close(): Unit = {}
 
-  override def getOffset: ContinuousMemoryStreamPartitionOffset =
-    ContinuousMemoryStreamPartitionOffset(partition, currentOffset)
+  override def getOffset: ContinuousRecordPartitionOffset =
+    ContinuousRecordPartitionOffset(partition, currentOffset)
 
   private def getRecord: Option[InternalRow] =
     endpoint.askSync[Option[InternalRow]](
-      GetRecord(ContinuousMemoryStreamPartitionOffset(partition, 
currentOffset)))
+      GetRecord(ContinuousRecordPartitionOffset(partition, currentOffset)))
 }
 
 case class ContinuousMemoryStreamOffset(partitionNums: Map[Int, Int])
@@ -220,6 +201,3 @@ case class ContinuousMemoryStreamOffset(partitionNums: 
Map[Int, Int])
   private implicit val formats = Serialization.formats(NoTypeHints)
   override def json(): String = Serialization.write(partitionNums)
 }
-
-case class ContinuousMemoryStreamPartitionOffset(partition: Int, numProcessed: 
Int)
-  extends PartitionOffset

http://git-wip-us.apache.org/repos/asf/spark/blob/9abe09bf/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/socket.scala
----------------------------------------------------------------------
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/socket.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/socket.scala
index 9f53a18..874c479 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/socket.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/socket.scala
@@ -33,14 +33,16 @@ import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
 import org.apache.spark.sql.execution.streaming.LongOffset
+import 
org.apache.spark.sql.execution.streaming.continuous.TextSocketContinuousReader
 import org.apache.spark.sql.sources.DataSourceRegister
-import org.apache.spark.sql.sources.v2.{DataSourceOptions, DataSourceV2, 
MicroBatchReadSupport}
+import org.apache.spark.sql.sources.v2.{ContinuousReadSupport, 
DataSourceOptions, DataSourceV2, MicroBatchReadSupport}
 import org.apache.spark.sql.sources.v2.reader.{InputPartition, 
InputPartitionReader}
-import org.apache.spark.sql.sources.v2.reader.streaming.{MicroBatchReader, 
Offset}
+import org.apache.spark.sql.sources.v2.reader.streaming.{ContinuousReader, 
MicroBatchReader, Offset}
 import org.apache.spark.sql.types.{StringType, StructField, StructType, 
TimestampType}
 import org.apache.spark.unsafe.types.UTF8String
 
-object TextSocketMicroBatchReader {
+// Shared object for micro-batch and continuous reader
+object TextSocketReader {
   val SCHEMA_REGULAR = StructType(StructField("value", StringType) :: Nil)
   val SCHEMA_TIMESTAMP = StructType(StructField("value", StringType) ::
     StructField("timestamp", TimestampType) :: Nil)
@@ -137,9 +139,9 @@ class TextSocketMicroBatchReader(options: 
DataSourceOptions) extends MicroBatchR
 
   override def readSchema(): StructType = {
     if (options.getBoolean("includeTimestamp", false)) {
-      TextSocketMicroBatchReader.SCHEMA_TIMESTAMP
+      TextSocketReader.SCHEMA_TIMESTAMP
     } else {
-      TextSocketMicroBatchReader.SCHEMA_REGULAR
+      TextSocketReader.SCHEMA_REGULAR
     }
   }
 
@@ -226,7 +228,7 @@ class TextSocketMicroBatchReader(options: 
DataSourceOptions) extends MicroBatchR
 }
 
 class TextSocketSourceProvider extends DataSourceV2
-  with MicroBatchReadSupport with DataSourceRegister with Logging {
+  with MicroBatchReadSupport with ContinuousReadSupport with 
DataSourceRegister with Logging {
 
   private def checkParameters(params: DataSourceOptions): Unit = {
     logWarning("The socket source should not be used for production 
applications! " +
@@ -258,6 +260,17 @@ class TextSocketSourceProvider extends DataSourceV2
     new TextSocketMicroBatchReader(options)
   }
 
+  override def createContinuousReader(
+      schema: Optional[StructType],
+      checkpointLocation: String,
+      options: DataSourceOptions): ContinuousReader = {
+    checkParameters(options)
+    if (schema.isPresent) {
+      throw new AnalysisException("The socket source does not support a 
user-specified schema.")
+    }
+    new TextSocketContinuousReader(options)
+  }
+
   /** String that represents the format that this data source provider uses. */
   override def shortName(): String = "socket"
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/9abe09bf/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/sources/TextSocketStreamSuite.scala
----------------------------------------------------------------------
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/sources/TextSocketStreamSuite.scala
 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/sources/TextSocketStreamSuite.scala
index 52e8386..48e5cf7 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/sources/TextSocketStreamSuite.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/sources/TextSocketStreamSuite.scala
@@ -32,12 +32,13 @@ import org.apache.spark.internal.Logging
 import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.execution.datasources.DataSource
 import org.apache.spark.sql.execution.streaming._
+import org.apache.spark.sql.execution.streaming.continuous._
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.sources.v2.{DataSourceOptions, 
MicroBatchReadSupport}
 import org.apache.spark.sql.sources.v2.reader.streaming.{MicroBatchReader, 
Offset}
 import org.apache.spark.sql.streaming.{StreamingQueryException, StreamTest}
 import org.apache.spark.sql.test.SharedSQLContext
-import org.apache.spark.sql.types.{StringType, StructField, StructType, 
TimestampType}
+import org.apache.spark.sql.types._
 
 class TextSocketStreamSuite extends StreamTest with SharedSQLContext with 
BeforeAndAfterEach {
 
@@ -300,6 +301,101 @@ class TextSocketStreamSuite extends StreamTest with 
SharedSQLContext with Before
     }
   }
 
+  test("continuous data") {
+    serverThread = new ServerThread()
+    serverThread.start()
+
+    val reader = new TextSocketContinuousReader(
+      new DataSourceOptions(Map("numPartitions" -> "2", "host" -> "localhost",
+        "port" -> serverThread.port.toString).asJava))
+    reader.setStartOffset(Optional.empty())
+    val tasks = reader.planInputPartitions()
+    assert(tasks.size == 2)
+
+    val numRecords = 10
+    val data = scala.collection.mutable.ListBuffer[Int]()
+    val offsets = scala.collection.mutable.ListBuffer[Int]()
+    import org.scalatest.time.SpanSugar._
+    failAfter(5 seconds) {
+      // inject rows, read and check the data and offsets
+      for (i <- 0 until numRecords) {
+        serverThread.enqueue(i.toString)
+      }
+      tasks.asScala.foreach {
+        case t: TextSocketContinuousInputPartition =>
+          val r = 
t.createPartitionReader().asInstanceOf[TextSocketContinuousInputPartitionReader]
+          for (i <- 0 until numRecords / 2) {
+            r.next()
+            
offsets.append(r.getOffset().asInstanceOf[ContinuousRecordPartitionOffset].offset)
+            data.append(r.get().get(0, 
DataTypes.StringType).asInstanceOf[String].toInt)
+            // commit the offsets in the middle and validate if processing 
continues
+            if (i == 2) {
+              commitOffset(t.partitionId, i + 1)
+            }
+          }
+          assert(offsets.toSeq == Range.inclusive(1, 5))
+          assert(data.toSeq == Range(t.partitionId, 10, 2))
+          offsets.clear()
+          data.clear()
+        case _ => throw new IllegalStateException("Unexpected task type")
+      }
+      assert(reader.getStartOffset.asInstanceOf[TextSocketOffset].offsets == 
List(3, 3))
+      reader.commit(TextSocketOffset(List(5, 5)))
+      assert(reader.getStartOffset.asInstanceOf[TextSocketOffset].offsets == 
List(5, 5))
+    }
+
+    def commitOffset(partition: Int, offset: Int): Unit = {
+      val offsetsToCommit = 
reader.getStartOffset.asInstanceOf[TextSocketOffset]
+        .offsets.updated(partition, offset)
+      reader.commit(TextSocketOffset(offsetsToCommit))
+      assert(reader.getStartOffset.asInstanceOf[TextSocketOffset].offsets == 
offsetsToCommit)
+    }
+  }
+
+  test("continuous data - invalid commit") {
+    serverThread = new ServerThread()
+    serverThread.start()
+
+    val reader = new TextSocketContinuousReader(
+      new DataSourceOptions(Map("numPartitions" -> "2", "host" -> "localhost",
+        "port" -> serverThread.port.toString).asJava))
+    reader.setStartOffset(Optional.of(TextSocketOffset(List(5, 5))))
+    // ok to commit same offset
+    reader.setStartOffset(Optional.of(TextSocketOffset(List(5, 5))))
+    assertThrows[IllegalStateException] {
+      reader.commit(TextSocketOffset(List(6, 6)))
+    }
+  }
+
+  test("continuous data with timestamp") {
+    serverThread = new ServerThread()
+    serverThread.start()
+
+    val reader = new TextSocketContinuousReader(
+      new DataSourceOptions(Map("numPartitions" -> "2", "host" -> "localhost",
+        "includeTimestamp" -> "true",
+        "port" -> serverThread.port.toString).asJava))
+    reader.setStartOffset(Optional.empty())
+    val tasks = reader.planInputPartitions()
+    assert(tasks.size == 2)
+
+    val numRecords = 4
+    // inject rows, read and check the data and offsets
+    for (i <- 0 until numRecords) {
+      serverThread.enqueue(i.toString)
+    }
+    tasks.asScala.foreach {
+      case t: TextSocketContinuousInputPartition =>
+        val r = 
t.createPartitionReader().asInstanceOf[TextSocketContinuousInputPartitionReader]
+        for (i <- 0 until numRecords / 2) {
+          r.next()
+          assert(r.get().get(0, TextSocketReader.SCHEMA_TIMESTAMP)
+            .isInstanceOf[(String, Timestamp)])
+        }
+      case _ => throw new IllegalStateException("Unexpected task type")
+    }
+  }
+
   /**
    * This class tries to mimic the behavior of netcat, so that we can ensure
    * TextSocketStream supports netcat, which only accepts the first connection


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-24127][SS] Continuous text socket source

Reply via email to