- Try doing less in each transformation (see the first sketch below)
- Try using different data structures within the transformations (see the second sketch below)
- Try not caching anything to free up more memory (both sketches below avoid cache()/persist())
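
Roughly, here is what the first and third suggestions could look like in PySpark 1.5. This is only a minimal sketch; the paths, table and column names (events, user_id, amount, event_date) are invented, not taken from your job. The idea is to split one long chain of transformations into smaller stages, write each intermediate result to Parquet and read it back for the next stage instead of calling cache()/persist(), so the executor heap stays free for the shuffle and for the Parquet writer:

    from pyspark import SparkContext
    from pyspark.sql import HiveContext

    sc = SparkContext(appName="staged-pipeline")
    sqlContext = HiveContext(sc)

    # Hypothetical input path; replace with your own data.
    raw = sqlContext.read.parquet("/data/raw/events")

    # Stage 1: keep only the rows and columns later stages need,
    # then persist to disk (Parquet) rather than to memory.
    stage1 = raw.select("user_id", "amount", "event_date") \
                .filter(raw["amount"] > 0)
    stage1.write.mode("overwrite").parquet("/data/tmp/stage1")

    # Stage 2: start again from disk instead of a cached DataFrame,
    # aggregate, and write the final result.
    stage1_df = sqlContext.read.parquet("/data/tmp/stage1")
    result = stage1_df.groupBy("user_id").sum("amount")
    result.write.mode("overwrite").parquet("/data/out/result")

For the second suggestion, "different data structures" can be as simple as streaming over each partition with a generator and emitting small tuples, instead of building Python lists or dicts for a whole partition inside a map function. Another sketch (parse_line, the CSV layout and the path are invented for illustration; it assumes the same SparkContext sc as above):

    def parse_line(line):
        # One small tuple per record is cheaper to hold and to pickle
        # than a dict with string keys.
        parts = line.split(",")
        return (parts[0], float(parts[1]))

    def parse_partition(lines):
        # Generator: yields one record at a time instead of
        # materializing a list for the whole partition.
        for line in lines:
            yield parse_line(line)

    rdd = sc.textFile("/data/raw/lines.csv")
    parsed = rdd.mapPartitions(parse_partition)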


On Wed, May 25, 2016 at 1:32 AM, pseudo oduesp <pseudo20...@gmail.com>
wrote:

> Hi guys,
> - I get this error with PySpark 1.5.0 under Cloudera CDH 5.5 (YARN).
>
> - I use YARN to deploy the job on the cluster.
> - I use HiveContext and Parquet files to save my data.
> - The container limit is 16 GB.
> - The executor memory I tested before is 12 GB.
> - I tried increasing the number of partitions (200 by default); I multiplied it by 2 and by 3 without success.
>
> - I also tried changing the number of SQL shuffle partitions.
>
> - I notice in the Spark UI that when the shuffle write is triggered there is no problem, but when the shuffle read is triggered I lose executors and get these errors.
>
> I am really blocked by this error and cannot tell where it comes from:
>
> ERROR util.SparkUncaughtExceptionHandler: Uncaught exception in thread Thread[Executor task launch worker-5,5,main]
> java.lang.OutOfMemoryError: Java heap space
>         at parquet.column.values.dictionary.IntList.initSlab(IntList.java:90)
>         at parquet.column.values.dictionary.IntList.<init>(IntList.java:86)
>         at parquet.column.values.dictionary.DictionaryValuesWriter.<init>(DictionaryValuesWriter.java:93)
>         at parquet.column.values.dictionary.DictionaryValuesWriter$PlainBinaryDictionaryValuesWriter.<init>(DictionaryValuesWriter.java:229)
>         at parquet.column.ParquetProperties.dictionaryWriter(ParquetProperties.java:131)
>         at parquet.column.ParquetProperties.dictWriterWithFallBack(ParquetProperties.java:178)
>         at parquet.column.ParquetProperties.getValuesWriter(ParquetProperties.java:203)
>         at parquet.column.impl.ColumnWriterV1.<init>(ColumnWriterV1.java:84)
>         at parquet.column.impl.ColumnWriteStoreV1.newMemColumn(ColumnWriteStoreV1.java:68)
>         at parquet.column.impl.ColumnWriteStoreV1.getColumnWriter(ColumnWriteStoreV1.java:56)
>         at parquet.io.MessageColumnIO$MessageColumnIORecordConsumer.<init>(MessageColumnIO.java:207)
>         at parquet.io.MessageColumnIO.getRecordWriter(MessageColumnIO.java:405)
>         at parquet.hadoop.InternalParquetRecordWriter.initStore(InternalParquetRecordWriter.java:107)
>         at parquet.hadoop.InternalParquetRecordWriter.<init>(InternalParquetRecordWriter.java:97)
>         at parquet.hadoop.ParquetRecordWriter.<init>(ParquetRecordWriter.java:100)
>         at parquet.hadoop.ParquetOutputFormat.getRecordWriter(ParquetOutputFormat.java:326)
>         at parquet.hadoop.ParquetOutputFormat.getRecordWriter(ParquetOutputFormat.java:282)
>         at org.apache.spark.sql.execution.datasources.parquet.ParquetOutputWriter.<init>(ParquetRelation.scala:94)
>         at org.apache.spark.sql.execution.datasources.parquet.ParquetRelation$$anon$3.newInstance(ParquetRelation.scala:272)
>         at org.apache.spark.sql.execution.datasources.DefaultWriterContainer.writeRows(WriterContainer.scala:233)
>         at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelation$$anonfun$run$1$$anonfun$apply$mcV$sp$3.apply(InsertIntoHadoopFsRelation.scala:150)
>         at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelation$$anonfun$run$1$$anonfun$apply$mcV$sp$3.apply(InsertIntoHadoopFsRelation.scala:150)
>         at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
>         at org.apache.spark.scheduler.Task.run(Task.scala:88)
>         at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
>         at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
>         at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
>         at java.lang.Thread.run(Thread.java:745)
> 16/05/25 09:54:42 ERROR util.SparkUncaughtExceptionHandler: Uncaught exception in thread Thread[Executor task launch worker-6,5,main]
> java.lang.OutOfMemoryError: Java heap space
>         at parquet.column.values.dictionary.IntList.initSlab(IntList.java:90)
>         at parquet.column.values.dictionary.IntList.<init>(IntList.java:86)
>         at parquet.column.values.dictionary.DictionaryValuesWriter.<init>(DictionaryValuesWriter.java:93)
>         at parquet.column.values.dictionary.DictionaryValuesWriter$PlainBinaryDictionaryValuesWriter.<init>(DictionaryValuesWriter.java:229)
>         at parquet.column.ParquetProperties.dictionaryWriter(ParquetProperties.java:131)
>         at parquet.column.ParquetProperties.dictWriterWithFallBack(ParquetProperties.java:178)
>         at parquet.column.ParquetProperties.getValuesWriter(ParquetProperties.java:203)
>         at parquet.column.impl.ColumnWriterV1.<init>(ColumnWriterV1.java:84)
>         at parquet.column.impl.ColumnWriteStoreV1.newMemColumn(ColumnWriteStoreV1.java:68)
>         at parquet.column.impl.ColumnWriteStoreV1.getColumnWriter(ColumnWriteStoreV1.java:56)
>         at parquet.io.MessageColumnIO$MessageColumnIORecordConsumer.<init>(MessageColumnIO.java:207)
>         at parquet.io.MessageColumnIO.getRecordWriter(MessageColumnIO.java:405)
>         at parquet.hadoop.InternalParquetRecordWriter.initStore(InternalParquetRecordWriter.java:107)
>         at parquet.hadoop.InternalParquetRecordWriter.<init>(InternalParquetRecordWriter.java:97)
>         at parquet.hadoop.ParquetRecordWriter.<init>(ParquetRecordWriter.java:100)
>         at parquet.hadoop.ParquetOutputFormat.getRecordWriter(ParquetOutputFormat.java:326)
>         at parquet.hadoop.ParquetOutputFormat.getRecordWriter(ParquetOutputFormat.java:282)
>         at org.apache.spark.sql.execution.datasources.parquet.ParquetOutputWriter.<init>(ParquetRelation.scala:94)
>         at org.apache.spark.sql.execution.datasources.parquet.ParquetRelation$$anon$3.newInstance(ParquetRelation.scala:272)
>         at org.apache.spark.sql.execution.datasources.DefaultWriterContainer.writeRows(WriterContainer.scala:233)
>         at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelation$$anonfun$run$1$$anonfun$apply$mcV$sp$3.apply(InsertIntoHadoopFsRelation.scala:150)
>         at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelation$$anonfun$run$1$$anonfun$apply$mcV$sp$3.apply(InsertIntoHadoopFsRelation.scala:150)
>         at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
>         at org.apache.spark.scheduler.Task.run(Task.scala:88)
>         at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
>         at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
>         at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
>         at java.lang.Thread.run(Thread.java:745)
>
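
For reference, the settings described in the message (12 GB of executor memory inside the 16 GB container limit, and a larger number of SQL shuffle partitions) can also be set from the PySpark code itself. This is only a sketch; the values are simply the ones quoted above (400 = 200 x 2), and whether they are right for this job is not something I can tell from here. Passing the same settings on the spark-submit command line works just as well.

    from pyspark import SparkConf, SparkContext
    from pyspark.sql import HiveContext

    # Executor memory must be set before the SparkContext is created;
    # 12g is the value from the message, within the 16 GB container limit.
    conf = SparkConf().set("spark.executor.memory", "12g")
    sc = SparkContext(conf=conf)
    sqlContext = HiveContext(sc)

    # 400 = 200 (the default) x 2, one of the values already tried.
    sqlContext.setConf("spark.sql.shuffle.partitions", "400")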
