[ https://issues.apache.org/jira/browse/SPARK-27076?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16793868#comment-16793868 ]
Steve Loughran commented on SPARK-27076:
----------------------------------------

I've seen this before in one specific circumstance: there aren't enough HTTP connections in the pool, and requests time out waiting for one. I'd recommend setting "fs.s3a.connection.maximum" to at least 2x the number of worker threads you run. If you are still seeing this after that, something may be keeping input streams open and so leaking HTTP connections.
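A minimal sketch of how that setting can be applied from a Spark job, assuming the job builds its own SparkSession: "fs.s3a.connection.maximum" is the real S3A option, while the app name and the value 200 are illustrative only (size the pool to at least 2x the total number of task threads that will hit S3 concurrently). Spark copies "spark.hadoop."-prefixed options into the Hadoop Configuration that the S3A client reads.

{code:scala}
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("s3a-pool-tuning-sketch") // hypothetical app name
  // Real S3A option; 200 is an illustrative value, not a recommendation.
  .config("spark.hadoop.fs.s3a.connection.maximum", "200")
  .getOrCreate()
{code}

The same option can equally go on the command line (spark-submit --conf spark.hadoop.fs.s3a.connection.maximum=200) or into core-site.xml as plain fs.s3a.connection.maximum; all three routes land in the same Hadoop Configuration.

On the leak case: an input stream opened against an s3a:// path can hold one of the pooled HTTP connections until it is closed, so a read path that never closes its streams will eventually exhaust any pool size. A hedged illustration of the pattern to check for (the bucket and key are hypothetical):

{code:scala}
import org.apache.hadoop.fs.Path

val path = new Path("s3a://example-bucket/example-object") // hypothetical path
val fs = path.getFileSystem(spark.sparkContext.hadoopConfiguration)
val in = fs.open(path)
try {
  val firstByte = in.read() // stand-in for the real read logic
} finally {
  in.close() // returns the underlying HTTP connection to the pool
}
{code}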
> Getting the timeout error while writing parquet/csv files to s3
> ---------------------------------------------------------------
>
>                 Key: SPARK-27076
>                 URL: https://issues.apache.org/jira/browse/SPARK-27076
>             Project: Spark
>          Issue Type: Bug
>          Components: Spark Core
>    Affects Versions: 2.2.0
>            Reporter: srinivas rao gajjala
>            Priority: Major
>
> Hi,
> I'm trying to write parquet/csv files to s3 using Amazon EMR clusters with the label emr-5.9.0, and below is the error I'm facing.
> {code:java}
> org.apache.spark.SparkException: Job aborted.
>   at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply$mcV$sp(FileFormatWriter.scala:215)
>   at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:173)
>   at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:173)
>   at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65)
>   at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:173)
>   at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:145)
>   at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:58)
>   at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:56)
>   at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:74)
>   at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:117)
>   at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:117)
>   at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:138)
>   at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
>   at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:135)
>   at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:116)
>   at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:92)
>   at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:92)
>   at org.apache.spark.sql.execution.datasources.DataSource.writeInFileFormat(DataSource.scala:438)
>   at org.apache.spark.sql.execution.datasources.DataSource.write(DataSource.scala:474)
>   at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:48)
>   at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:58)
>   at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:56)
>   at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:74)
>   at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:117)
>   at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:117)
>   at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:138)
>   at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
>   at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:135)
>   at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:116)
>   at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:92)
>   at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:92)
>   at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:610)
>   at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:233)
>   at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:217)
>   at org.apache.spark.sql.DataFrameWriter.csv(DataFrameWriter.scala:598)
>   at DataFrameFromTo.dataFrameToFile(DataFrameFromTo.scala:120)
>   at Migration.migrate(Migration.scala:211)
>   at DataMigrationFramework$$anonfun$main$3$$anonfun$apply$mcV$sp$1.apply$mcVI$sp(DataMigrationFramework.scala:353)
>   at scala.collection.immutable.Range.foreach$mVc$sp(Range.scala:160)
>   at DataMigrationFramework$$anonfun$main$3.apply$mcV$sp(DataMigrationFramework.scala:351)
>   at scala.util.control.Breaks.breakable(Breaks.scala:38)
>   at DataMigrationFramework$.main(DataMigrationFramework.scala:350)
>   at DataMigrationFramework.main(DataMigrationFramework.scala)
>   at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
>   at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
>   at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
>   at java.lang.reflect.Method.invoke(Method.java:498)
>   at org.apache.spark.deploy.yarn.ApplicationMaster$$anon$2.run(ApplicationMaster.scala:635)
> Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 1120 in stage 5.0 failed 16 times, most recent failure: Lost task 1120.15 in stage 5.0 (TID 8886, ip-10-120-60-82.ec2.internal, executor 4): com.amazonaws.SdkClientException: Unable to execute HTTP request: Timeout waiting for connection from pool
>   at com.amazonaws.http.AmazonHttpClient$RequestExecutor.handleRetryableException(AmazonHttpClient.java:1068)
>   at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeHelper(AmazonHttpClient.java:1034)
>   at com.amazonaws.http.AmazonHttpClient$RequestExecutor.doExecute(AmazonHttpClient.java:741)
>   at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeWithTimer(AmazonHttpClient.java:715)
>   at com.amazonaws.http.AmazonHttpClient$RequestExecutor.execute(AmazonHttpClient.java:697)
>   at com.amazonaws.http.AmazonHttpClient$RequestExecutor.access$500(AmazonHttpClient.java:665)
>   at com.amazonaws.http.AmazonHttpClient$RequestExecutionBuilderImpl.execute(AmazonHttpClient.java:647)
>   at com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:511)
>   at com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:4227)
>   at com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:4174)
>   at com.amazonaws.services.s3.AmazonS3Client.getObjectMetadata(AmazonS3Client.java:1253)
>   at com.amazonaws.services.s3.AmazonS3Client.getObjectMetadata(AmazonS3Client.java:1228)
>   at org.apache.hadoop.fs.s3a.S3AFileSystem.getFileStatus(S3AFileSystem.java:903)
>   at org.apache.hadoop.fs.s3a.S3AFileSystem.getFileStatus(S3AFileSystem.java:77)
>   at org.apache.parquet.hadoop.util.HadoopInputFile.fromPath(HadoopInputFile.java:39)
>   at org.apache.parquet.hadoop.ParquetFileReader.readFooter(ParquetFileReader.java:401)
>   at org.apache.spark.sql.execution.datasources.parquet.SpecificParquetRecordReaderBase.initialize(SpecificParquetRecordReaderBase.java:106)
>   at org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.initialize(VectorizedParquetRecordReader.java:109)
>   at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReaderWithPartitionValues$1.apply(ParquetFileFormat.scala:363)
>   at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReaderWithPartitionValues$1.apply(ParquetFileFormat.scala:337)
>   at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:124)
>   at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:174)
>   at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:105)
>   at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown Source)
>   at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.agg_doAggregateWithKeys$(Unknown Source)
>   at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
>   at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
>   at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:395)
>   at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
>   at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
>   at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
>   at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
>   at org.apache.spark.scheduler.Task.run(Task.scala:108)
>   at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335)
>   at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
>   at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
>   at java.lang.Thread.run(Thread.java:748)
> Caused by: org.apache.http.conn.ConnectionPoolTimeoutException: Timeout waiting for connection from pool
>   at org.apache.http.impl.conn.PoolingHttpClientConnectionManager.leaseConnection(PoolingHttpClientConnectionManager.java:292)
>   at org.apache.http.impl.conn.PoolingHttpClientConnectionManager$1.get(PoolingHttpClientConnectionManager.java:269)
>   at sun.reflect.GeneratedMethodAccessor47.invoke(Unknown Source)
>   at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
>   at java.lang.reflect.Method.invoke(Method.java:498)
>   at com.amazonaws.http.conn.ClientConnectionRequestFactory$Handler.invoke(ClientConnectionRequestFactory.java:70)
>   at com.amazonaws.http.conn.$Proxy11.get(Unknown Source)
>   at org.apache.http.impl.execchain.MainClientExec.execute(MainClientExec.java:191)
>   at org.apache.http.impl.execchain.ProtocolExec.execute(ProtocolExec.java:185)
>   at org.apache.http.impl.client.InternalHttpClient.doExecute(InternalHttpClient.java:185)
>   at org.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:83)
>   at org.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:56)
>   at com.amazonaws.http.apache.client.impl.SdkHttpClient.execute(SdkHttpClient.java:72)
>   at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeOneRequest(AmazonHttpClient.java:1189)
>   at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeHelper(AmazonHttpClient.java:1029)
>   ... 35 more
> Driver stacktrace:
>   at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1690)
>   at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1678)
>   at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1677)
>   at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
>   at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
>   at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1677)
>   at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:855)
>   at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:855)
>   at scala.Option.foreach(Option.scala:257)
>   at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:855)
>   at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1905)
>   at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1860)
>   at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1849)
>   at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
>   at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:671)
>   at org.apache.spark.SparkContext.runJob(SparkContext.scala:2022)
>   at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply$mcV$sp(FileFormatWriter.scala:188)
>   ... 47 more
> Caused by: com.amazonaws.SdkClientException: Unable to execute HTTP request: Timeout waiting for connection from pool
>   at com.amazonaws.http.AmazonHttpClient$RequestExecutor.handleRetryableException(AmazonHttpClient.java:1068)
>   at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeHelper(AmazonHttpClient.java:1034)
>   at com.amazonaws.http.AmazonHttpClient$RequestExecutor.doExecute(AmazonHttpClient.java:741)
>   at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeWithTimer(AmazonHttpClient.java:715)
>   at com.amazonaws.http.AmazonHttpClient$RequestExecutor.execute(AmazonHttpClient.java:697)
>   at com.amazonaws.http.AmazonHttpClient$RequestExecutor.access$500(AmazonHttpClient.java:665)
>   at com.amazonaws.http.AmazonHttpClient$RequestExecutionBuilderImpl.execute(AmazonHttpClient.java:647)
>   at com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:511)
>   at com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:4227)
>   at com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:4174)
>   at com.amazonaws.services.s3.AmazonS3Client.getObjectMetadata(AmazonS3Client.java:1253)
>   at com.amazonaws.services.s3.AmazonS3Client.getObjectMetadata(AmazonS3Client.java:1228)
>   at org.apache.hadoop.fs.s3a.S3AFileSystem.getFileStatus(S3AFileSystem.java:903)
>   at org.apache.hadoop.fs.s3a.S3AFileSystem.getFileStatus(S3AFileSystem.java:77)
>   at org.apache.parquet.hadoop.util.HadoopInputFile.fromPath(HadoopInputFile.java:39)
>   at org.apache.parquet.hadoop.ParquetFileReader.readFooter(ParquetFileReader.java:401)
>   at org.apache.spark.sql.execution.datasources.parquet.SpecificParquetRecordReaderBase.initialize(SpecificParquetRecordReaderBase.java:106)
>   at org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.initialize(VectorizedParquetRecordReader.java:109)
>   at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReaderWithPartitionValues$1.apply(ParquetFileFormat.scala:363)
>   at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReaderWithPartitionValues$1.apply(ParquetFileFormat.scala:337)
>   at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:124)
>   at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:174)
>   at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:105)
>   at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.scan_nextBatch$(Unknown Source)
>   at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.agg_doAggregateWithKeys$(Unknown Source)
>   at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
>   at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
>   at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:395)
>   at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
>   at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
>   at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
>   at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
>   at org.apache.spark.scheduler.Task.run(Task.scala:108)
>   at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335)
>   at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
>   at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
>   at java.lang.Thread.run(Thread.java:748)
> Caused by: org.apache.http.conn.ConnectionPoolTimeoutException: Timeout waiting for connection from pool
>   at org.apache.http.impl.conn.PoolingHttpClientConnectionManager.leaseConnection(PoolingHttpClientConnectionManager.java:292)
>   at org.apache.http.impl.conn.PoolingHttpClientConnectionManager$1.get(PoolingHttpClientConnectionManager.java:269)
>   at sun.reflect.GeneratedMethodAccessor47.invoke(Unknown Source)
>   at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
>   at java.lang.reflect.Method.invoke(Method.java:498)
>   at com.amazonaws.http.conn.ClientConnectionRequestFactory$Handler.invoke(ClientConnectionRequestFactory.java:70)
>   at com.amazonaws.http.conn.$Proxy11.get(Unknown Source)
>   at org.apache.http.impl.execchain.MainClientExec.execute(MainClientExec.java:191)
>   at org.apache.http.impl.execchain.ProtocolExec.execute(ProtocolExec.java:185)
>   at org.apache.http.impl.client.InternalHttpClient.doExecute(InternalHttpClient.java:185)
>   at org.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:83)
>   at org.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:56)
>   at com.amazonaws.http.apache.client.impl.SdkHttpClient.execute(SdkHttpClient.java:72)
>   at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeOneRequest(AmazonHttpClient.java:1189)
>   at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeHelper(AmazonHttpClient.java:1029)
>   ... 35 more
> {code}
> I saw a few issues of this sort for avro formats; please let me know if I'm missing something. Thanks

--
This message was sent by Atlassian JIRA
(v7.6.3#76005)