GitHub user viirya commented on a diff in the pull request: https://github.com/apache/spark/pull/21546#discussion_r194949076 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala --- @@ -34,17 +34,36 @@ private[sql] object PythonSQLUtils { } /** - * Python Callable function to convert ArrowPayloads into a [[DataFrame]]. + * Python callable function to convert an RDD of serialized ArrowRecordBatches into + * a [[DataFrame]]. * - * @param payloadRDD A JavaRDD of ArrowPayloads. - * @param schemaString JSON Formatted Schema for ArrowPayloads. + * @param arrowBatchRDD A JavaRDD of serialized ArrowRecordBatches. + * @param schemaString JSON Formatted Spark schema for Arrow batches. * @param sqlContext The active [[SQLContext]]. * @return The converted [[DataFrame]]. */ - def arrowPayloadToDataFrame( - payloadRDD: JavaRDD[Array[Byte]], + def arrowStreamToDataFrame( + arrowBatchRDD: JavaRDD[Array[Byte]], schemaString: String, sqlContext: SQLContext): DataFrame = { - ArrowConverters.toDataFrame(payloadRDD, schemaString, sqlContext) + ArrowConverters.toDataFrame(arrowBatchRDD, schemaString, sqlContext) + } + + /** + * Python callable function to read a file in Arrow stream format and create a [[DataFrame]] + * using each serialized ArrowRecordBatch as a partition. + * + * @param sqlContext The active [[SQLContext]]. + * @param filename File to read the Arrow stream from. + * @param schemaString JSON Formatted Spark schema for Arrow batches. + * @return A new [[DataFrame]]. + */ + def arrowReadStreamFromFile( + sqlContext: SQLContext, + filename: String, + schemaString: String): DataFrame = { + JavaSparkContext.fromSparkContext(sqlContext.sparkContext) --- End diff -- What is this line for?
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org For additional commands, e-mail: reviews-help@spark.apache.org