Pradeep Misra created SPARK-18965:
-------------------------------------

             Summary: wholeTextFiles() is not able to read large files
                 Key: SPARK-18965
                 URL: https://issues.apache.org/jira/browse/SPARK-18965
             Project: Spark
          Issue Type: Bug
          Components: Spark Core
    Affects Versions: 1.6.2
         Environment: All Platforms
            Reporter: Pradeep Misra
While reading a gz-compressed file of size 134738099 bytes with wholeTextFiles(), Spark throws an OutOfMemoryError on the executor. A minimal reproduction sketch is included after the stack traces below.

ERROR Executor: Exception in task 0.0 in stage 0.0 (TID 0)
java.lang.OutOfMemoryError
        at java.io.ByteArrayOutputStream.hugeCapacity(ByteArrayOutputStream.java:123)
        at java.io.ByteArrayOutputStream.grow(ByteArrayOutputStream.java:117)
        at java.io.ByteArrayOutputStream.ensureCapacity(ByteArrayOutputStream.java:93)
        at java.io.ByteArrayOutputStream.write(ByteArrayOutputStream.java:153)
        at org.spark-project.guava.io.ByteStreams.copy(ByteStreams.java:211)
        at org.spark-project.guava.io.ByteStreams.toByteArray(ByteStreams.java:252)
        at org.apache.spark.input.WholeTextFileRecordReader.nextKeyValue(WholeTextFileRecordReader.scala:81)
        at org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader.nextKeyValue(CombineFileRecordReader.java:65)
        at org.apache.spark.rdd.NewHadoopRDD$$anon$1.hasNext(NewHadoopRDD.scala:168)
        at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:39)
        at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327)
        at org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1631)
        at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1164)
        at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1164)
        at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1882)
        at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1882)
        at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
        at org.apache.spark.scheduler.Task.run(Task.scala:89)
        at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:227)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
        at java.lang.Thread.run(Thread.java:745)

16/11/30 14:25:36 ERROR SparkUncaughtExceptionHandler: Uncaught exception in thread Thread[Executor task launch worker-0,5,main]
java.lang.OutOfMemoryError
        at java.io.ByteArrayOutputStream.hugeCapacity(ByteArrayOutputStream.java:123)
        at java.io.ByteArrayOutputStream.grow(ByteArrayOutputStream.java:117)
        at java.io.ByteArrayOutputStream.ensureCapacity(ByteArrayOutputStream.java:93)
        at java.io.ByteArrayOutputStream.write(ByteArrayOutputStream.java:153)
        at org.spark-project.guava.io.ByteStreams.copy(ByteStreams.java:211)
        at org.spark-project.guava.io.ByteStreams.toByteArray(ByteStreams.java:252)
        at org.apache.spark.input.WholeTextFileRecordReader.nextKeyValue(WholeTextFileRecordReader.scala:81)
        at org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader.nextKeyValue(CombineFileRecordReader.java:65)
        at org.apache.spark.rdd.NewHadoopRDD$$anon$1.hasNext(NewHadoopRDD.scala:168)
        at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:39)
        at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327)
        at org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1631)
        at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1164)
        at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1164)
        at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1882)
        at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1882)
        at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
        at org.apache.spark.scheduler.Task.run(Task.scala:89)
        at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:227)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
        at java.lang.Thread.run(Thread.java:745)

16/11/30 14:25:36 INFO SparkContext: Invoking stop() from shutdown hook
16/11/30 14:25:36 WARN TaskSetManager: Lost task 0.0 in stage 0.0 (TID 0, localhost): java.lang.OutOfMemoryError
        at java.io.ByteArrayOutputStream.hugeCapacity(ByteArrayOutputStream.java:123)
        at java.io.ByteArrayOutputStream.grow(ByteArrayOutputStream.java:117)
        at java.io.ByteArrayOutputStream.ensureCapacity(ByteArrayOutputStream.java:93)
        at java.io.ByteArrayOutputStream.write(ByteArrayOutputStream.java:153)
        at org.spark-project.guava.io.ByteStreams.copy(ByteStreams.java:211)
        at org.spark-project.guava.io.ByteStreams.toByteArray(ByteStreams.java:252)
        at org.apache.spark.input.WholeTextFileRecordReader.nextKeyValue(WholeTextFileRecordReader.scala:81)
        at org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader.nextKeyValue(CombineFileRecordReader.java:65)
        at org.apache.spark.rdd.NewHadoopRDD$$anon$1.hasNext(NewHadoopRDD.scala:168)
        at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:39)
        at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327)
        at org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1631)
        at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1164)
        at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1164)
        at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1882)
        at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1882)
        at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
        at org.apache.spark.scheduler.Task.run(Task.scala:89)
        at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:227)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
        at java.lang.Thread.run(Thread.java:745)
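Minimal reproduction sketch (the file path and object name below are placeholders, not from the original job; any single gz-compressed text file of roughly the size above, read with wholeTextFiles() and followed by count(), exercises the same code path shown in the trace):

{code:scala}
import org.apache.spark.{SparkConf, SparkContext}

object WholeTextFilesOOMRepro {
  def main(args: Array[String]): Unit = {
    // Single local worker, matching "Executor task launch worker-0" in the log above.
    val conf = new SparkConf()
      .setAppName("wholeTextFiles-OOM-repro")
      .setMaster("local[1]")
    val sc = new SparkContext(conf)

    // Placeholder path: any single gz-compressed text file of ~134 MB.
    // wholeTextFiles() produces one (path, content) record per file, so the entire
    // decompressed content must be buffered by the record reader for that record.
    val files = sc.wholeTextFiles("file:///data/large-file.txt.gz")

    // count() forces WholeTextFileRecordReader.nextKeyValue to read the whole file;
    // the OutOfMemoryError in the trace is thrown from ByteStreams.toByteArray there.
    println(files.count())

    sc.stop()
  }
}
{code}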