Github user HyukjinKwon commented on a diff in the pull request: https://github.com/apache/spark/pull/21546#discussion_r204321600 --- Diff: python/pyspark/sql/dataframe.py --- @@ -2146,14 +2148,15 @@ def toPandas(self): def _collectAsArrow(self): """ - Returns all records as list of deserialized ArrowPayloads, pyarrow must be installed - and available. + Returns all records as a list of ArrowRecordBatches and batch order as a list of indices, + pyarrow must be installed and available on driver and worker Python environments. .. note:: Experimental. """ + ser = BatchOrderSerializer(ArrowStreamSerializer()) with SCCallSiteSync(self._sc) as css: sock_info = self._jdf.collectAsArrowToPython() - return list(_load_from_socket(sock_info, ArrowSerializer())) + return list(_load_from_socket(sock_info, ser)), ser.get_batch_order_and_reset() --- End diff -- Hmm... @BryanCutler, would you mind explaining why this batch order is required?
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org For additional commands, e-mail: reviews-help@spark.apache.org