[ https://issues.apache.org/jira/browse/SPARK-18965?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16196872#comment-16196872 ]
sam commented on SPARK-18965:
-----------------------------

[~pradeep_misra] [~srowen] Yes, it's a new feature. What we need is this: https://issues.apache.org/jira/browse/SPARK-22225

> wholeTextFiles() is not able to read large files
> ------------------------------------------------
>
>                 Key: SPARK-18965
>                 URL: https://issues.apache.org/jira/browse/SPARK-18965
>             Project: Spark
>          Issue Type: Bug
>          Components: Spark Core
>    Affects Versions: 1.6.2
>         Environment: All Platforms
>            Reporter: Pradeep Misra
>              Labels: ReadFile
>   Original Estimate: 1,344h
>  Remaining Estimate: 1,344h
>
> While reading a file of 134738099 bytes (gz compressed) with wholeTextFiles(), Spark throws an OutOfMemoryError:
>
> ERROR Executor: Exception in task 0.0 in stage 0.0 (TID 0)
> java.lang.OutOfMemoryError
>         at java.io.ByteArrayOutputStream.hugeCapacity(ByteArrayOutputStream.java:123)
>         at java.io.ByteArrayOutputStream.grow(ByteArrayOutputStream.java:117)
>         at java.io.ByteArrayOutputStream.ensureCapacity(ByteArrayOutputStream.java:93)
>         at java.io.ByteArrayOutputStream.write(ByteArrayOutputStream.java:153)
>         at org.spark-project.guava.io.ByteStreams.copy(ByteStreams.java:211)
>         at org.spark-project.guava.io.ByteStreams.toByteArray(ByteStreams.java:252)
>         at org.apache.spark.input.WholeTextFileRecordReader.nextKeyValue(WholeTextFileRecordReader.scala:81)
>         at org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader.nextKeyValue(CombineFileRecordReader.java:65)
>         at org.apache.spark.rdd.NewHadoopRDD$$anon$1.hasNext(NewHadoopRDD.scala:168)
>         at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:39)
>         at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327)
>         at org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1631)
>         at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1164)
>         at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1164)
>         at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1882)
>         at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1882)
>         at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
>         at org.apache.spark.scheduler.Task.run(Task.scala:89)
>         at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:227)
>         at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
>         at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
>         at java.lang.Thread.run(Thread.java:745)
>
> 16/11/30 14:25:36 ERROR SparkUncaughtExceptionHandler: Uncaught exception in thread Thread[Executor task launch worker-0,5,main]
> java.lang.OutOfMemoryError
>         at java.io.ByteArrayOutputStream.hugeCapacity(ByteArrayOutputStream.java:123)
>         at java.io.ByteArrayOutputStream.grow(ByteArrayOutputStream.java:117)
>         at java.io.ByteArrayOutputStream.ensureCapacity(ByteArrayOutputStream.java:93)
>         at java.io.ByteArrayOutputStream.write(ByteArrayOutputStream.java:153)
>         at org.spark-project.guava.io.ByteStreams.copy(ByteStreams.java:211)
>         at org.spark-project.guava.io.ByteStreams.toByteArray(ByteStreams.java:252)
>         at org.apache.spark.input.WholeTextFileRecordReader.nextKeyValue(WholeTextFileRecordReader.scala:81)
>         at org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader.nextKeyValue(CombineFileRecordReader.java:65)
>         at org.apache.spark.rdd.NewHadoopRDD$$anon$1.hasNext(NewHadoopRDD.scala:168)
>         at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:39)
>         at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327)
>         at org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1631)
>         at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1164)
>         at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1164)
>         at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1882)
>         at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1882)
>         at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
>         at org.apache.spark.scheduler.Task.run(Task.scala:89)
>         at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:227)
>         at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
>         at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
>         at java.lang.Thread.run(Thread.java:745)
>
> 16/11/30 14:25:36 INFO SparkContext: Invoking stop() from shutdown hook
> 16/11/30 14:25:36 WARN TaskSetManager: Lost task 0.0 in stage 0.0 (TID 0, localhost): java.lang.OutOfMemoryError
>         at java.io.ByteArrayOutputStream.hugeCapacity(ByteArrayOutputStream.java:123)
>         at java.io.ByteArrayOutputStream.grow(ByteArrayOutputStream.java:117)
>         at java.io.ByteArrayOutputStream.ensureCapacity(ByteArrayOutputStream.java:93)
>         at java.io.ByteArrayOutputStream.write(ByteArrayOutputStream.java:153)
>         at org.spark-project.guava.io.ByteStreams.copy(ByteStreams.java:211)
>         at org.spark-project.guava.io.ByteStreams.toByteArray(ByteStreams.java:252)
>         at org.apache.spark.input.WholeTextFileRecordReader.nextKeyValue(WholeTextFileRecordReader.scala:81)
>         at org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader.nextKeyValue(CombineFileRecordReader.java:65)
>         at org.apache.spark.rdd.NewHadoopRDD$$anon$1.hasNext(NewHadoopRDD.scala:168)
>         at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:39)
>         at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327)
>         at org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1631)
>         at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1164)
>         at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1164)
>         at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1882)
>         at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1882)
>         at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
>         at org.apache.spark.scheduler.Task.run(Task.scala:89)
>         at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:227)
>         at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
>         at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
>         at java.lang.Thread.run(Thread.java:745)
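Until the feature proposed in SPARK-22225 is available, one possible workaround is to read such files with sc.binaryFiles() and decompress and iterate them lazily, rather than letting wholeTextFiles() buffer each file into a single in-memory String (the ByteArrayOutputStream.hugeCapacity frame above suggests the decompressed content exceeds the maximum size of a single JVM array). The sketch below is illustrative only and is not the fix tracked in SPARK-22225: the object name and input path are hypothetical, and it assumes the input is plain gzip-compressed text.

{code:scala}
// Sketch of a possible workaround (not the SPARK-22225 feature itself):
// stream a large gz-compressed text file instead of materializing it as one String.
import java.util.zip.GZIPInputStream

import scala.io.Source

import org.apache.spark.{SparkConf, SparkContext}

object WholeTextFileWorkaround {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("wholeTextFiles-workaround"))

    // Hypothetical input path; substitute the real file that triggers the OOM.
    val path = "hdfs:///data/large-file.gz"

    // sc.wholeTextFiles(path) buffers the whole decompressed file into a single
    // String per file, which is where the ByteArrayOutputStream OOM comes from.
    // binaryFiles() instead exposes each file as a PortableDataStream that the
    // task can read incrementally.
    val lines = sc.binaryFiles(path).flatMap { case (file, stream) =>
      val in = new GZIPInputStream(stream.open())
      // The returned iterator is consumed inside this task, so the stream
      // remains open while the lines are read.
      Source.fromInputStream(in, "UTF-8").getLines()
    }

    println(lines.count())
    sc.stop()
  }
}
{code}

Note that gzip is not splittable, so the whole file is still handled by a single task; the point is only that records are produced line by line, so no single record has to fit in one array (assuming individual lines are of reasonable size).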