Hello, I am using the Python API to run a grid search, training models with LogisticRegressionWithSGD. I am running on r3.xl machines in EC2, on top of YARN in cluster mode.
The training RDD is persisted in memory and on disk. Some of the models train successfully, but at some point during the grid search I get the error below. It looks like the Python broadcast is trying to read a temp file that is no longer there. I scanned the logs for further errors but could not find anything. Any ideas what could be causing this, and what should I be looking for? A simplified sketch of the grid-search loop follows the traceback. Many thanks.

Cat

model = LogisticRegressionWithSGD.train(the_training, iterations=i, regParam=c, miniBatchFraction=0.8)
  File "/home/hadoop/spark/python/pyspark/mllib/classification.py", line 164, in train
    return _regression_train_wrapper(train, LogisticRegressionModel, data, initialWeights)
  File "/home/hadoop/spark/python/pyspark/mllib/regression.py", line 140, in _regression_train_wrapper
    weights, intercept = train_func(data, _convert_to_vector(initial_weights))
  File "/home/hadoop/spark/python/pyspark/mllib/classification.py", line 162, in train
    bool(intercept))
  File "/home/hadoop/spark/python/pyspark/mllib/common.py", line 120, in callMLlibFunc
    return callJavaFunc(sc, api, *args)
  File "/home/hadoop/spark/python/pyspark/mllib/common.py", line 113, in callJavaFunc
    return _java2py(sc, func(*args))
  File "/home/hadoop/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py", line 538, in __call__
    self.target_id, self.name)
  File "/home/hadoop/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py", line 300, in get_return_value
    format(target_id, '.', name), value)
Py4JJavaError: An error occurred while calling o271.trainLogisticRegressionModelWithSGD.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task serialization failed: java.io.FileNotFoundException: /mnt/spark/spark-b07b34f8-66c3-43ae-a3ed-0c291724409b/pyspark-4196e8e5-8024-4ec5-a7bb-a60b216e6e74/tmpbCjiSR (No such file or directory)
        java.io.FileInputStream.open(Native Method)
        java.io.FileInputStream.<init>(FileInputStream.java:146)
        org.apache.spark.api.python.PythonBroadcast$$anonfun$writeObject$1.apply$mcJ$sp(PythonRDD.scala:848)
        org.apache.spark.api.python.PythonBroadcast$$anonfun$writeObject$1.apply(PythonRDD.scala:847)
        org.apache.spark.api.python.PythonBroadcast$$anonfun$writeObject$1.apply(PythonRDD.scala:847)
        org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1153)
        org.apache.spark.api.python.PythonBroadcast.writeObject(PythonRDD.scala:847)
        sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
        sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
        sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
        java.lang.reflect.Method.invoke(Method.java:606)
        java.io.ObjectStreamClass.invokeWriteObject(ObjectStreamClass.java:988)
        java.io.ObjectOutputStream.writeSerialData(ObjectOutputStream.java:1495)
        java.io.ObjectOutputStream.writeOrdinaryObject(ObjectOutputStream.java:1431)
        java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1177)
        java.io.ObjectOutputStream.writeObject(ObjectOutputStream.java:347)
        org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:44)
        org.apache.spark.serializer.SerializationStream.writeAll(Serializer.scala:110)
        org.apache.spark.storage.BlockManager.dataSerializeStream(BlockManager.scala:1176)
        org.apache.spark.storage.DiskStore.putIterator(DiskStore.scala:79)
        org.apache.spark.storage.DiskStore.putArray(DiskStore.scala:64)
        org.apache.spark.storage.BlockManager.dropFromMemory(BlockManager.scala:1028)
        org.apache.spark.storage.MemoryStore$$anonfun$ensureFreeSpace$4.apply(MemoryStore.scala:419)
        org.apache.spark.storage.MemoryStore$$anonfun$ensureFreeSpace$4.apply(MemoryStore.scala:408)
        scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
        scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
        org.apache.spark.storage.MemoryStore.ensureFreeSpace(MemoryStore.scala:408)
        org.apache.spark.storage.MemoryStore.unrollSafely(MemoryStore.scala:263)
        org.apache.spark.storage.MemoryStore.putIterator(MemoryStore.scala:136)
        org.apache.spark.storage.MemoryStore.putIterator(MemoryStore.scala:114)
        org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:786)
        org.apache.spark.storage.BlockManager.putIterator(BlockManager.scala:637)
        org.apache.spark.storage.BlockManager.putSingle(BlockManager.scala:991)
        org.apache.spark.broadcast.TorrentBroadcast.writeBlocks(TorrentBroadcast.scala:98)
        org.apache.spark.broadcast.TorrentBroadcast.<init>(TorrentBroadcast.scala:84)
        org.apache.spark.broadcast.TorrentBroadcastFactory.newBroadcast(TorrentBroadcastFactory.scala:34)
        org.apache.spark.broadcast.TorrentBroadcastFactory.newBroadcast(TorrentBroadcastFactory.scala:29)
        org.apache.spark.broadcast.BroadcastManager.newBroadcast(BroadcastManager.scala:62)
        org.apache.spark.SparkContext.broadcast(SparkContext.scala:1051)
        org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$submitMissingTasks(DAGScheduler.scala:839)
        org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$submitStage(DAGScheduler.scala:778)
        org.apache.spark.scheduler.DAGScheduler.handleJobSubmitted(DAGScheduler.scala:762)
        org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1362)
        org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1354)
        org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
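For context, the grid search is roughly of the following shape. This is a simplified sketch, not my exact code: the data path, the parse_point function, and the parameter grids are illustrative placeholders.

from pyspark import SparkContext, StorageLevel
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint

sc = SparkContext(appName="lr-grid-search")

def parse_point(line):
    # Hypothetical parser; assumes CSV rows of "label,f1,f2,...".
    values = [float(x) for x in line.split(",")]
    return LabeledPoint(values[0], values[1:])

# Path is a placeholder; the training RDD is persisted in memory
# and on disk, as described above.
the_training = sc.textFile("hdfs:///path/to/training.csv").map(parse_point)
the_training.persist(StorageLevel.MEMORY_AND_DISK)

results = {}
for i in [10, 50, 100]:           # iterations grid (illustrative values)
    for c in [0.01, 0.1, 1.0]:    # regParam grid (illustrative values)
        model = LogisticRegressionWithSGD.train(
            the_training, iterations=i, regParam=c, miniBatchFraction=0.8)
        results[(i, c)] = model

The failure happens partway through these nested loops, after several (i, c) combinations have already trained successfully.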