Github user srowen commented on a diff in the pull request: https://github.com/apache/spark/pull/22404#discussion_r217155746 --- Diff: python/pyspark/context.py --- @@ -499,19 +506,32 @@ def f(split, iterator): def _serialize_to_jvm(self, data, parallelism, serializer): """ - Calling the Java parallelize() method with an ArrayList is too slow, - because it sends O(n) Py4J commands. As an alternative, serialized - objects are written to a file and loaded through textFile(). - """ - tempFile = NamedTemporaryFile(delete=False, dir=self._temp_dir) - try: - serializer.dump_stream(data, tempFile) - tempFile.close() - readRDDFromFile = self._jvm.PythonRDD.readRDDFromFile - return readRDDFromFile(self._jsc, tempFile.name, parallelism) - finally: - # readRDDFromFile eagerly reads the file so we can delete right after. - os.unlink(tempFile.name) + Using py4j to send a large dataset to the jvm is really slow, so we use either a file + or a socket if we have encryption enabled. + """ + if self._encryption_enabled: + # with encryption, we open a server in java and send the data directly + server = self._jvm.PythonParallelizeServer(self._jsc.sc(), parallelism) + (sock_file, _) = local_connect_and_auth(server.port(), server.secret()) + chunked_out = ChunkedStream(sock_file, 8192) + serializer.dump_stream(data, chunked_out) + chunked_out.close() + # this call will block until the server has read all the data and processed it (or + # throws an exception) + r = server.getResult() + return r --- End diff -- Nit, do you want to just `return server.getResult()` in cases like this?
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org