Hi,

I am doing some basic preprocessing in PySpark (local mode), as follows:

from pyspark import SparkConf, SparkContext

files = [...]  # list of input file paths
root = "..."   # base directory (placeholder)

def read(filename, sc):
    # process one file into an RDD
    return rdd

if __name__ == "__main__":
    conf = SparkConf()
    conf.setMaster('local')
    sc = SparkContext(conf=conf)
    sc.setCheckpointDir(root + "temp/")

    data = sc.parallelize([])

    for i, f in enumerate(files):
        data = data.union(read(f, sc))
        if i == 20:
            data.checkpoint()
            data.count()
        if i == 500:
            break
    # print data.count()
    # rdd_1 = read(files[0], sc)
    data.saveAsTextFile(root + "output/")


But I see this error:
  keyed._jrdd.map(self.ctx._jvm.BytesToString()).saveAsTextFile(path)
  File "/Users/ping/Desktop/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py", line 538, in __call__
  File "/Users/ping/Desktop/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py", line 300, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o9564.saveAsTextFile.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task serialization failed: java.lang.StackOverflowError
    java.io.Bits.putInt(Bits.java:93)
    java.io.ObjectOutputStream$BlockDataOutputStream.writeInt(ObjectOutputStream.java:1927)
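
From what I have read, a StackOverflowError during task serialization usually means the RDD lineage has grown very long, which is why I tried the checkpoint above. Below is a sketch of the same loop with the checkpoint applied every 20 unions instead of only once (this assumes the long union chain is the cause; read, files, and root are the same placeholders as in my code):

    for i, f in enumerate(files):
        data = data.union(read(f, sc))
        # Checkpoint periodically so the lineage is truncated,
        # instead of growing by one union() per file.
        if i > 0 and i % 20 == 0:
            data.checkpoint()
            data.count()  # action to force the checkpoint to materialize
        if i == 500:
            break

Is periodic checkpointing the right way to keep the lineage bounded here, or would it be better to collect the per-file RDDs into a list and call sc.union(rdds) once at the end?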
