[ https://issues.apache.org/jira/browse/SPARK-32053?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Kayal reopened SPARK-32053: --------------------------- Hi, I have verified the issue in spark latest version 3.0.0 , the issue seems to be still there on windows. The problem is on windows when we try to pipline.write().overwrite().save(temp_dir) is failing with ~\AppData\Local\IBMWS\miniconda3\envs\desktop\lib\site-packages\pyspark\ml\util.py in save(self, path) 173 if not isinstance(path, basestring): 174 raise TypeError("path should be a basestring, got type %s" % type(path)) --> 175 self._jwrite.save(path) 176 177 def overwrite(self): ~\AppData\Local\IBMWS\miniconda3\envs\desktop\lib\site-packages\py4j\java_gateway.py in __call__(self, *args) 1303 answer = self.gateway_client.send_command(command) 1304 return_value = get_return_value( -> 1305 answer, self.gateway_client, self.target_id, self.name) 1306 1307 for temp_arg in temp_args: ~\AppData\Local\IBMWS\miniconda3\envs\desktop\lib\site-packages\pyspark\sql\utils.py in deco(*a, **kw) 129 def deco(*a, **kw): 130 try: --> 131 return f(*a, **kw) 132 except py4j.protocol.Py4JJavaError as e: 133 converted = convert_exception(e.java_exception) ~\AppData\Local\IBMWS\miniconda3\envs\desktop\lib\site-packages\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name) 326 raise Py4JJavaError( 327 "An error occurred while calling \{0}{1}\{2}.\n". --> 328 format(target_id, ".", name), value) 329 else: 330 raise Py4JError( Py4JJavaError: An error occurred while calling o662.save. : org.apache.spark.SparkException: Job aborted. at org.apache.spark.internal.io.SparkHadoopWriter$.write(SparkHadoopWriter.scala:100) at org.apache.spark.rdd.PairRDDFunctions.$anonfun$saveAsHadoopDataset$1(PairRDDFunctions.scala:1090) at org.apache.spark.rdd.PairRDDFunctions$$Lambda$2417.000000001D19A7B0.apply$mcV$sp(Unknown Source) at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23) at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151) at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112) at org.apache.spark.rdd.RDD.withScope(RDD.scala:388) at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopDataset(PairRDDFunctions.scala:1088) at org.apache.spark.rdd.PairRDDFunctions.$anonfun$saveAsHadoopFile$4(PairRDDFunctions.scala:1061) at org.apache.spark.rdd.PairRDDFunctions$$Lambda$2415.000000000FE34B70.apply$mcV$sp(Unknown Source) at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23) at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151) at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112) at org.apache.spark.rdd.RDD.withScope(RDD.scala:388) at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:1026) at org.apache.spark.rdd.PairRDDFunctions.$anonfun$saveAsHadoopFile$3(PairRDDFunctions.scala:1008) at org.apache.spark.rdd.PairRDDFunctions$$Lambda$2414.000000001CBB0D40.apply$mcV$sp(Unknown Source) at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23) at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151) at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112) at org.apache.spark.rdd.RDD.withScope(RDD.scala:388) at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:1007) at org.apache.spark.rdd.PairRDDFunctions.$anonfun$saveAsHadoopFile$2(PairRDDFunctions.scala:964) at org.apache.spark.rdd.PairRDDFunctions$$Lambda$2413.000000001D196EA0.apply$mcV$sp(Unknown Source) at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23) at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151) at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112) at org.apache.spark.rdd.RDD.withScope(RDD.scala:388) at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:962) at org.apache.spark.rdd.RDD.$anonfun$saveAsTextFile$2(RDD.scala:1552) at org.apache.spark.rdd.RDD$$Lambda$2411.0000000018FEB4E0.apply$mcV$sp(Unknown Source) at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23) at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151) at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112) at org.apache.spark.rdd.RDD.withScope(RDD.scala:388) at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1552) at org.apache.spark.rdd.RDD.$anonfun$saveAsTextFile$1(RDD.scala:1538) at org.apache.spark.rdd.RDD$$Lambda$2410.000000001CA30180.apply$mcV$sp(Unknown Source) at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23) at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151) at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112) at org.apache.spark.rdd.RDD.withScope(RDD.scala:388) at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1538) at org.apache.spark.ml.util.DefaultParamsWriter$.saveMetadata(ReadWrite.scala:413) at org.apache.spark.ml.Pipeline$SharedReadWrite$.$anonfun$saveImpl$1(Pipeline.scala:250) at org.apache.spark.ml.Pipeline$SharedReadWrite$.$anonfun$saveImpl$1$adapted(Pipeline.scala:247) at org.apache.spark.ml.Pipeline$SharedReadWrite$$$Lambda$2397.00000000190AB010.apply(Unknown Source) at org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191) at org.apache.spark.ml.util.Instrumentation$$$Lambda$1390.0000000018680E40.apply(Unknown Source) at scala.util.Try$.apply(Try.scala:213) at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:191) at org.apache.spark.ml.Pipeline$SharedReadWrite$.saveImpl(Pipeline.scala:247) at org.apache.spark.ml.Pipeline$PipelineWriter.saveImpl(Pipeline.scala:206) at org.apache.spark.ml.util.MLWriter.save(ReadWrite.scala:168) at org.apache.spark.ml.Pipeline$PipelineWriter.super$save(Pipeline.scala:204) at org.apache.spark.ml.Pipeline$PipelineWriter.$anonfun$save$2(Pipeline.scala:204) at org.apache.spark.ml.Pipeline$PipelineWriter$$Lambda$2391.0000000018FED1F0.apply$mcV$sp(Unknown Source) at org.apache.spark.ml.MLEvents.withSaveInstanceEvent(events.scala:176) at org.apache.spark.ml.MLEvents.withSaveInstanceEvent$(events.scala:171) at org.apache.spark.ml.util.Instrumentation.withSaveInstanceEvent(Instrumentation.scala:42) at org.apache.spark.ml.Pipeline$PipelineWriter.$anonfun$save$1(Pipeline.scala:204) at org.apache.spark.ml.Pipeline$PipelineWriter.$anonfun$save$1$adapted(Pipeline.scala:204) at org.apache.spark.ml.Pipeline$PipelineWriter$$Lambda$2390.0000000018CF4210.apply(Unknown Source) at org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191) at org.apache.spark.ml.util.Instrumentation$$$Lambda$1390.0000000018680E40.apply(Unknown Source) at scala.util.Try$.apply(Try.scala:213) at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:191) at org.apache.spark.ml.Pipeline$PipelineWriter.save(Pipeline.scala:204) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:90) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:55) at java.lang.reflect.Method.invoke(Method.java:508) at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357) at py4j.Gateway.invoke(Gateway.java:282) at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) at py4j.commands.CallCommand.execute(CallCommand.java:79) at py4j.GatewayConnection.run(GatewayConnection.java:238) at java.lang.Thread.run(Thread.java:812) Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 17.0 failed 1 times, most recent failure: Lost task 0.0 in stage 17.0 (TID 17, elaine-svl1.fyre.ibm.com, executor driver): java.io.IOException: (null) entry in command string: null chmod 0644 C:\Users\Administrator\AppData\Roaming\IBM Watson Studio\projects\tempfile\metadata\_temporary\0\_temporary\attempt_20200813072339_0057_m_000000_0\part-00000 at org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:773) at org.apache.hadoop.util.Shell.execCommand(Shell.java:869) at org.apache.hadoop.util.Shell.execCommand(Shell.java:852) at org.apache.hadoop.fs.RawLocalFileSystem.setPermission(RawLocalFileSystem.java:733) at org.apache.hadoop.fs.RawLocalFileSystem$LocalFSFileOutputStream.<init>(RawLocalFileSystem.java:225) at org.apache.hadoop.fs.RawLocalFileSystem$LocalFSFileOutputStream.<init>(RawLocalFileSystem.java:209) at org.apache.hadoop.fs.RawLocalFileSystem.createOutputStreamWithMode(RawLocalFileSystem.java:307) at org.apache.hadoop.fs.RawLocalFileSystem.create(RawLocalFileSystem.java:296) at org.apache.hadoop.fs.RawLocalFileSystem.create(RawLocalFileSystem.java:328) at org.apache.hadoop.fs.ChecksumFileSystem$ChecksumFSOutputSummer.<init>(ChecksumFileSystem.java:398) at org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:461) at org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:440) at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:911) at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:804) at org.apache.hadoop.mapred.TextOutputFormat.getRecordWriter(TextOutputFormat.java:123) at org.apache.spark.internal.io.HadoopMapRedWriteConfigUtil.initWriter(SparkHadoopWriter.scala:230) at org.apache.spark.internal.io.SparkHadoopWriter$.executeTask(SparkHadoopWriter.scala:120) at org.apache.spark.internal.io.SparkHadoopWriter$.$anonfun$write$1(SparkHadoopWriter.scala:83) at org.apache.spark.internal.io.SparkHadoopWriter$$$Lambda$2430.00000000190AD130.apply(Unknown Source) at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90) at org.apache.spark.scheduler.Task.run(Task.scala:127) at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:444) at org.apache.spark.executor.Executor$TaskRunner$$Lambda$1930.0000000018D8A070.apply(Unknown Source) at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:447) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1160) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635) at java.lang.Thread.run(Thread.java:812) Driver stacktrace: at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2023) at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:1972) at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:1971) at org.apache.spark.scheduler.DAGScheduler$$Lambda$2450.0000000016D72F80.apply(Unknown Source) at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62) at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55) at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49) at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1971) at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:950) at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:950) at org.apache.spark.scheduler.DAGScheduler$$Lambda$2448.00000000190AE890.apply(Unknown Source) at scala.Option.foreach(Option.scala:407) at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:950) at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2203) at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2152) at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2141) at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49) at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:752) at org.apache.spark.SparkContext.runJob(SparkContext.scala:2093) at org.apache.spark.SparkContext.runJob(SparkContext.scala:2114) at org.apache.spark.SparkContext.runJob(SparkContext.scala:2146) at org.apache.spark.internal.io.SparkHadoopWriter$.write(SparkHadoopWriter.scala:78) ... 78 more Caused by: java.io.IOException: (null) entry in command string: null chmod 0644 C:\Users\Administrator\AppData\Roaming\IBM Watson Studio\projects\tempfile\metadata\_temporary\0\_temporary\attempt_20200813072339_0057_m_000000_0\part-00000 at org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:773) at org.apache.hadoop.util.Shell.execCommand(Shell.java:869) at org.apache.hadoop.util.Shell.execCommand(Shell.java:852) at org.apache.hadoop.fs.RawLocalFileSystem.setPermission(RawLocalFileSystem.java:733) at org.apache.hadoop.fs.RawLocalFileSystem$LocalFSFileOutputStream.<init>(RawLocalFileSystem.java:225) at org.apache.hadoop.fs.RawLocalFileSystem$LocalFSFileOutputStream.<init>(RawLocalFileSystem.java:209) at org.apache.hadoop.fs.RawLocalFileSystem.createOutputStreamWithMode(RawLocalFileSystem.java:307) at org.apache.hadoop.fs.RawLocalFileSystem.create(RawLocalFileSystem.java:296) at org.apache.hadoop.fs.RawLocalFileSystem.create(RawLocalFileSystem.java:328) at org.apache.hadoop.fs.ChecksumFileSystem$ChecksumFSOutputSummer.<init>(ChecksumFileSystem.java:398) at org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:461) at org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:440) at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:911) at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:804) at org.apache.hadoop.mapred.TextOutputFormat.getRecordWriter(TextOutputFormat.java:123) at org.apache.spark.internal.io.HadoopMapRedWriteConfigUtil.initWriter(SparkHadoopWriter.scala:230) at org.apache.spark.internal.io.SparkHadoopWriter$.executeTask(SparkHadoopWriter.scala:120) at org.apache.spark.internal.io.SparkHadoopWriter$.$anonfun$write$1(SparkHadoopWriter.scala:83) at org.apache.spark.internal.io.SparkHadoopWriter$$$Lambda$2430.00000000190AD130.apply(Unknown Source) at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90) at org.apache.spark.scheduler.Task.run(Task.scala:127) at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:444) at org.apache.spark.executor.Executor$TaskRunner$$Lambda$1930.0000000018D8A070.apply(Unknown Source) at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:447) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1160) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635) ... 1 more dir !dir > pyspark save of serialized model is failing for windows. > -------------------------------------------------------- > > Key: SPARK-32053 > URL: https://issues.apache.org/jira/browse/SPARK-32053 > Project: Spark > Issue Type: Bug > Components: PySpark > Affects Versions: 2.3.0 > Reporter: Kayal > Priority: Major > Attachments: image-2020-06-22-18-19-32-236.png > > > {color:#172b4d}Hi, {color} > {color:#172b4d}We are using spark functionality to save the serialized model > to disk . On windows platform we are seeing save of the serialized model is > failing with the error: o288.save() failed. {color} > > > > !image-2020-06-22-18-19-32-236.png! > > > -- This message was sent by Atlassian Jira (v8.3.4#803005) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org