Hi, I am doing some basic preprocessing in PySpark (local mode), as follows:
```python
files = [...]  # input files

def read(filename, sc):
    # process file
    return rdd

if __name__ == "__main__":
    conf = SparkConf()
    conf.setMaster('local')
    sc = SparkContext(conf=conf)
    sc.setCheckpointDir(root + "temp/")
    data = sc.parallelize([])
    for i, f in enumerate(files):
        data = data.union(read(f, sc))
        if i == 20:
            data.checkpoint()
            data.count()
        if i == 500:
            break
    # print data.count()
    # rdd_1 = read(files[0], sc)
    data.saveAsTextFile(root + "output/")
```

But I see this error:

```
keyed._jrdd.map(self.ctx._jvm.BytesToString()).saveAsTextFile(path)
  File "/Users/ping/Desktop/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py", line 538, in __call__
  File "/Users/ping/Desktop/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py", line 300, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o9564.saveAsTextFile.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task serialization failed: java.lang.StackOverflowError
    java.io.Bits.putInt(Bits.java:93)
    java.io.ObjectOutputStream$BlockDataOutputStream.writeInt(ObjectOutputStream.java:1927)
```