Hi all — I am getting the following errors with PySpark 1.5.0 under Cloudera CDH 5.5 (YARN).
- I use YARN to deploy the job on the cluster. - I use a Hive context and Parquet files to save my data. The container limit is 16 GB; the executor memory I tested before was 12 GB. - I tried increasing the number of partitions (200 by default), multiplying it by 2 and by 3, without success. - I also tried changing the number of SQL shuffle partitions. - I noticed in the Spark UI that when shuffle write is triggered there is no problem, but when shuffle read is triggered I lose executors and get errors. I am really blocked by this error and cannot tell where it comes from: ERROR util.SparkUncaughtExceptionHandler: Uncaught exception in thread Thread[Executor task launch worker-5,5,main] java.lang.OutOfMemoryError: Java heap space at parquet.column.values.dictionary.IntList.initSlab(IntList.java:90) at parquet.column.values.dictionary.IntList.<init>(IntList.java:86) at parquet.column.values.dictionary.DictionaryValuesWriter.<init>(DictionaryValuesWriter.java:93) at parquet.column.values.dictionary.DictionaryValuesWriter$PlainBinaryDictionaryValuesWriter.<init>(DictionaryValuesWriter.java:229) at parquet.column.ParquetProperties.dictionaryWriter(ParquetProperties.java:131) at parquet.column.ParquetProperties.dictWriterWithFallBack(ParquetProperties.java:178) at parquet.column.ParquetProperties.getValuesWriter(ParquetProperties.java:203) at parquet.column.impl.ColumnWriterV1.<init>(ColumnWriterV1.java:84) at parquet.column.impl.ColumnWriteStoreV1.newMemColumn(ColumnWriteStoreV1.java:68) at parquet.column.impl.ColumnWriteStoreV1.getColumnWriter(ColumnWriteStoreV1.java:56) at parquet.io.MessageColumnIO$MessageColumnIORecordConsumer.<init>(MessageColumnIO.java:207) at parquet.io.MessageColumnIO.getRecordWriter(MessageColumnIO.java:405) at parquet.hadoop.InternalParquetRecordWriter.initStore(InternalParquetRecordWriter.java:107) at parquet.hadoop.InternalParquetRecordWriter.<init>(InternalParquetRecordWriter.java:97) at parquet.hadoop.ParquetRecordWriter.<init>(ParquetRecordWriter.java:100) at 
parquet.hadoop.ParquetOutputFormat.getRecordWriter(ParquetOutputFormat.java:326) at parquet.hadoop.ParquetOutputFormat.getRecordWriter(ParquetOutputFormat.java:282) at org.apache.spark.sql.execution.datasources.parquet.ParquetOutputWriter.<init>(ParquetRelation.scala:94) at org.apache.spark.sql.execution.datasources.parquet.ParquetRelation$$anon$3.newInstance(ParquetRelation.scala:272) at org.apache.spark.sql.execution.datasources.DefaultWriterContainer.writeRows(WriterContainer.scala:233) at parquet.io.MessageColumnIO.getRecordWriter(MessageColumnIO.java:405) at parquet.hadoop.InternalParquetRecordWriter.initStore(InternalParquetRecordWriter.java:107) at parquet.hadoop.InternalParquetRecordWriter.<init>(InternalParquetRecordWriter.java:97) at parquet.hadoop.ParquetRecordWriter.<init>(ParquetRecordWriter.java:100) at parquet.hadoop.ParquetOutputFormat.getRecordWriter(ParquetOutputFormat.java:326) at parquet.hadoop.ParquetOutputFormat.getRecordWriter(ParquetOutputFormat.java:282) at org.apache.spark.sql.execution.datasources.parquet.ParquetOutputWriter.<init>(ParquetRelation.scala:94) at org.apache.spark.sql.execution.datasources.parquet.ParquetRelation$$anon$3.newInstance(ParquetRelation.scala:272) at org.apache.spark.sql.execution.datasources.DefaultWriterContainer.writeRows(WriterContainer.scala:233) at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelation$$anonfun$run$1$$anonfun$apply$mcV$sp$3.apply(InsertIntoHadoopFsRelation.scala:150) at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelation$$anonfun$run$1$$anonfun$apply$mcV$sp$3.apply(InsertIntoHadoopFsRelation.scala:150) at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66) at org.apache.spark.scheduler.Task.run(Task.scala:88) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) 
at java.lang.Thread.run(Thread.java:745) 16/05/25 09:54:42 ERROR util.SparkUncaughtExceptionHandler: Uncaught exception in thread Thread[Executor task launch worker-6,5,main] java.lang.OutOfMemoryError: Java heap space at parquet.column.values.dictionary.IntList.initSlab(IntList.java:90) at parquet.column.values.dictionary.IntList.<init>(IntList.java:86) at parquet.column.values.dictionary.DictionaryValuesWriter.<init>(DictionaryValuesWriter.java:93) at parquet.column.values.dictionary.DictionaryValuesWriter$PlainBinaryDictionaryValuesWriter.<init>(DictionaryValuesWriter.java:229) at parquet.column.ParquetProperties.dictionaryWriter(ParquetProperties.java:131) at parquet.column.ParquetProperties.dictWriterWithFallBack(ParquetProperties.java:178) at parquet.column.ParquetProperties.getValuesWriter(ParquetProperties.java:203) at parquet.column.impl.ColumnWriterV1.<init>(ColumnWriterV1.java:84) at parquet.column.impl.ColumnWriteStoreV1.newMemColumn(ColumnWriteStoreV1.java:68) at parquet.column.impl.ColumnWriteStoreV1.getColumnWriter(ColumnWriteStoreV1.java:56) at parquet.io.MessageColumnIO$MessageColumnIORecordConsumer.<init>(MessageColumnIO.java:207) at parquet.io.MessageColumnIO.getRecordWriter(MessageColumnIO.java:405) at parquet.hadoop.InternalParquetRecordWriter.initStore(InternalParquetRecordWriter.java:107) at parquet.hadoop.InternalParquetRecordWriter.<init>(InternalParquetRecordWriter.java:97) at parquet.hadoop.ParquetRecordWriter.<init>(ParquetRecordWriter.java:100) at parquet.hadoop.ParquetOutputFormat.getRecordWriter(ParquetOutputFormat.java:326) at parquet.hadoop.ParquetOutputFormat.getRecordWriter(ParquetOutputFormat.java:282) at org.apache.spark.sql.execution.datasources.parquet.ParquetOutputWriter.<init>(ParquetRelation.scala:94) at org.apache.spark.sql.execution.datasources.parquet.ParquetRelation$$anon$3.newInstance(ParquetRelation.scala:272) at org.apache.spark.sql.execution.datasources.DefaultWriterContainer.writeRows(WriterContainer.scala:233) at 
org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelation$$anonfun$run$1$$anonfun$apply$mcV$sp$3.apply(InsertIntoHadoopFsRelation.scala:150) at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelation$$anonfun$run$1$$anonfun$apply$mcV$sp$3.apply(InsertIntoHadoopFsRelation.scala:150) at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66) at org.apache.spark.scheduler.Task.run(Task.scala:88) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) at java.lang.Thread.run(Thread.java:745)