xccui opened a new issue, #8554: URL: https://github.com/apache/hudi/issues/8554
We hit some S3 HTTP connection pool issues when running a Flink writer job, which caused the connection pool used by `StreamWriteOperatorCoordinator` to be shut down. However, the connection pool is not reset after failure recovery. I think we should reset the connection pool, as well as some other resources, during a failover to avoid being trapped in an unhealthy restart loop. Our job kept restarting and throwing the following exception: ``` 2023-04-24 03:42:43 [pool-25-thread-1] ERROR org.apache.hudi.sink.StreamWriteOperatorCoordinator [] - Executor executes action [initialize instant ] error java.lang.IllegalStateException: Connection pool shut down at com.amazonaws.thirdparty.apache.http.util.Asserts.check(Asserts.java:34) ~[blob_p-abdf98cc6fdb80521c5886e97d0250884f55321b-5fd12d7a052c31efa7e4c3e5be67b915:?] at com.amazonaws.thirdparty.apache.http.impl.conn.PoolingHttpClientConnectionManager.requestConnection(PoolingHttpClientConnectionManager.java:269) ~[blob_p-abdf98cc6fdb80521c5886e97d0250884f55321b-5fd12d7a052c31efa7e4c3e5be67b915:?] at jdk.internal.reflect.GeneratedMethodAccessor35.invoke(Unknown Source) ~[?:?] at jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source) ~[?:?] at java.lang.reflect.Method.invoke(Unknown Source) ~[?:?] at com.amazonaws.http.conn.ClientConnectionManagerFactory$Handler.invoke(ClientConnectionManagerFactory.java:76) ~[blob_p-abdf98cc6fdb80521c5886e97d0250884f55321b-5fd12d7a052c31efa7e4c3e5be67b915:?] at com.amazonaws.http.conn.$Proxy52.requestConnection(Unknown Source) ~[?:?] at com.amazonaws.thirdparty.apache.http.impl.execchain.MainClientExec.execute(MainClientExec.java:176) ~[blob_p-abdf98cc6fdb80521c5886e97d0250884f55321b-5fd12d7a052c31efa7e4c3e5be67b915:?] at com.amazonaws.thirdparty.apache.http.impl.execchain.ProtocolExec.execute(ProtocolExec.java:186) ~[blob_p-abdf98cc6fdb80521c5886e97d0250884f55321b-5fd12d7a052c31efa7e4c3e5be67b915:?] 
at com.amazonaws.thirdparty.apache.http.impl.client.InternalHttpClient.doExecute(InternalHttpClient.java:185) ~[blob_p-abdf98cc6fdb80521c5886e97d0250884f55321b-5fd12d7a052c31efa7e4c3e5be67b915:?] at com.amazonaws.thirdparty.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:83) ~[blob_p-abdf98cc6fdb80521c5886e97d0250884f55321b-5fd12d7a052c31efa7e4c3e5be67b915:?] at com.amazonaws.thirdparty.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:56) ~[blob_p-abdf98cc6fdb80521c5886e97d0250884f55321b-5fd12d7a052c31efa7e4c3e5be67b915:?] at com.amazonaws.http.apache.client.impl.SdkHttpClient.execute(SdkHttpClient.java:72) ~[blob_p-abdf98cc6fdb80521c5886e97d0250884f55321b-5fd12d7a052c31efa7e4c3e5be67b915:?] at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeOneRequest(AmazonHttpClient.java:1346) ~[blob_p-abdf98cc6fdb80521c5886e97d0250884f55321b-5fd12d7a052c31efa7e4c3e5be67b915:?] at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeHelper(AmazonHttpClient.java:1157) ~[blob_p-abdf98cc6fdb80521c5886e97d0250884f55321b-5fd12d7a052c31efa7e4c3e5be67b915:?] at com.amazonaws.http.AmazonHttpClient$RequestExecutor.doExecute(AmazonHttpClient.java:814) ~[blob_p-abdf98cc6fdb80521c5886e97d0250884f55321b-5fd12d7a052c31efa7e4c3e5be67b915:?] at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeWithTimer(AmazonHttpClient.java:781) ~[blob_p-abdf98cc6fdb80521c5886e97d0250884f55321b-5fd12d7a052c31efa7e4c3e5be67b915:?] at com.amazonaws.http.AmazonHttpClient$RequestExecutor.execute(AmazonHttpClient.java:755) ~[blob_p-abdf98cc6fdb80521c5886e97d0250884f55321b-5fd12d7a052c31efa7e4c3e5be67b915:?] at com.amazonaws.http.AmazonHttpClient$RequestExecutor.access$500(AmazonHttpClient.java:715) ~[blob_p-abdf98cc6fdb80521c5886e97d0250884f55321b-5fd12d7a052c31efa7e4c3e5be67b915:?] 
at com.amazonaws.http.AmazonHttpClient$RequestExecutionBuilderImpl.execute(AmazonHttpClient.java:697) ~[blob_p-abdf98cc6fdb80521c5886e97d0250884f55321b-5fd12d7a052c31efa7e4c3e5be67b915:?] at com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:561) ~[blob_p-abdf98cc6fdb80521c5886e97d0250884f55321b-5fd12d7a052c31efa7e4c3e5be67b915:?] at com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:541) ~[blob_p-abdf98cc6fdb80521c5886e97d0250884f55321b-5fd12d7a052c31efa7e4c3e5be67b915:?] at com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:5456) ~[blob_p-abdf98cc6fdb80521c5886e97d0250884f55321b-5fd12d7a052c31efa7e4c3e5be67b915:?] at com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:5403) ~[blob_p-abdf98cc6fdb80521c5886e97d0250884f55321b-5fd12d7a052c31efa7e4c3e5be67b915:?] at com.amazonaws.services.s3.AmazonS3Client.getObjectMetadata(AmazonS3Client.java:1372) ~[blob_p-abdf98cc6fdb80521c5886e97d0250884f55321b-5fd12d7a052c31efa7e4c3e5be67b915:?] at org.apache.hadoop.fs.s3a.S3AFileSystem.lambda$getObjectMetadata$10(S3AFileSystem.java:2545) ~[blob_p-abdf98cc6fdb80521c5886e97d0250884f55321b-5fd12d7a052c31efa7e4c3e5be67b915:?] at org.apache.hadoop.fs.s3a.Invoker.retryUntranslated(Invoker.java:414) ~[blob_p-abdf98cc6fdb80521c5886e97d0250884f55321b-5fd12d7a052c31efa7e4c3e5be67b915:?] at org.apache.hadoop.fs.s3a.Invoker.retryUntranslated(Invoker.java:377) ~[blob_p-abdf98cc6fdb80521c5886e97d0250884f55321b-5fd12d7a052c31efa7e4c3e5be67b915:?] at org.apache.hadoop.fs.s3a.S3AFileSystem.getObjectMetadata(S3AFileSystem.java:2533) ~[blob_p-abdf98cc6fdb80521c5886e97d0250884f55321b-5fd12d7a052c31efa7e4c3e5be67b915:?] at org.apache.hadoop.fs.s3a.S3AFileSystem.getObjectMetadata(S3AFileSystem.java:2513) ~[blob_p-abdf98cc6fdb80521c5886e97d0250884f55321b-5fd12d7a052c31efa7e4c3e5be67b915:?] 
at org.apache.hadoop.fs.s3a.S3AFileSystem.s3GetFileStatus(S3AFileSystem.java:3776) ~[blob_p-abdf98cc6fdb80521c5886e97d0250884f55321b-5fd12d7a052c31efa7e4c3e5be67b915:?] at org.apache.hadoop.fs.s3a.S3AFileSystem.innerGetFileStatus(S3AFileSystem.java:3688) ~[blob_p-abdf98cc6fdb80521c5886e97d0250884f55321b-5fd12d7a052c31efa7e4c3e5be67b915:?] at org.apache.hadoop.fs.s3a.S3AFileSystem.lambda$getFileStatus$24(S3AFileSystem.java:3556) ~[blob_p-abdf98cc6fdb80521c5886e97d0250884f55321b-5fd12d7a052c31efa7e4c3e5be67b915:?] at org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.lambda$trackDurationOfOperation$5(IOStatisticsBinding.java:499) ~[blob_p-abdf98cc6fdb80521c5886e97d0250884f55321b-5fd12d7a052c31efa7e4c3e5be67b915:?] at org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.trackDuration(IOStatisticsBinding.java:444) ~[blob_p-abdf98cc6fdb80521c5886e97d0250884f55321b-5fd12d7a052c31efa7e4c3e5be67b915:?] at org.apache.hadoop.fs.s3a.S3AFileSystem.trackDurationAndSpan(S3AFileSystem.java:2337) ~[blob_p-abdf98cc6fdb80521c5886e97d0250884f55321b-5fd12d7a052c31efa7e4c3e5be67b915:?] at org.apache.hadoop.fs.s3a.S3AFileSystem.trackDurationAndSpan(S3AFileSystem.java:2356) ~[blob_p-abdf98cc6fdb80521c5886e97d0250884f55321b-5fd12d7a052c31efa7e4c3e5be67b915:?] at org.apache.hadoop.fs.s3a.S3AFileSystem.getFileStatus(S3AFileSystem.java:3554) ~[blob_p-abdf98cc6fdb80521c5886e97d0250884f55321b-5fd12d7a052c31efa7e4c3e5be67b915:?] at org.apache.hudi.common.fs.HoodieWrapperFileSystem.lambda$getFileStatus$17(HoodieWrapperFileSystem.java:410) ~[blob_p-abdf98cc6fdb80521c5886e97d0250884f55321b-5fd12d7a052c31efa7e4c3e5be67b915:?] at org.apache.hudi.common.fs.HoodieWrapperFileSystem.executeFuncWithTimeMetrics(HoodieWrapperFileSystem.java:114) ~[blob_p-abdf98cc6fdb80521c5886e97d0250884f55321b-5fd12d7a052c31efa7e4c3e5be67b915:?] 
at org.apache.hudi.common.fs.HoodieWrapperFileSystem.getFileStatus(HoodieWrapperFileSystem.java:404) ~[blob_p-abdf98cc6fdb80521c5886e97d0250884f55321b-5fd12d7a052c31efa7e4c3e5be67b915:?] at org.apache.hudi.exception.TableNotFoundException.checkTableValidity(TableNotFoundException.java:51) ~[blob_p-abdf98cc6fdb80521c5886e97d0250884f55321b-5fd12d7a052c31efa7e4c3e5be67b915:?] at org.apache.hudi.common.table.HoodieTableMetaClient.<init>(HoodieTableMetaClient.java:137) ~[blob_p-abdf98cc6fdb80521c5886e97d0250884f55321b-5fd12d7a052c31efa7e4c3e5be67b915:?] at org.apache.hudi.common.table.HoodieTableMetaClient.newMetaClient(HoodieTableMetaClient.java:689) ~[blob_p-abdf98cc6fdb80521c5886e97d0250884f55321b-5fd12d7a052c31efa7e4c3e5be67b915:?] at org.apache.hudi.common.table.HoodieTableMetaClient.access$000(HoodieTableMetaClient.java:81) ~[blob_p-abdf98cc6fdb80521c5886e97d0250884f55321b-5fd12d7a052c31efa7e4c3e5be67b915:?] at org.apache.hudi.common.table.HoodieTableMetaClient$Builder.build(HoodieTableMetaClient.java:770) ~[blob_p-abdf98cc6fdb80521c5886e97d0250884f55321b-5fd12d7a052c31efa7e4c3e5be67b915:?] at org.apache.hudi.table.HoodieFlinkTable.create(HoodieFlinkTable.java:62) ~[blob_p-abdf98cc6fdb80521c5886e97d0250884f55321b-5fd12d7a052c31efa7e4c3e5be67b915:?] at org.apache.hudi.client.HoodieFlinkTableServiceClient.createTable(HoodieFlinkTableServiceClient.java:172) ~[blob_p-abdf98cc6fdb80521c5886e97d0250884f55321b-5fd12d7a052c31efa7e4c3e5be67b915:?] at org.apache.hudi.client.BaseHoodieTableServiceClient.rollbackFailedWrites(BaseHoodieTableServiceClient.java:706) ~[blob_p-abdf98cc6fdb80521c5886e97d0250884f55321b-5fd12d7a052c31efa7e4c3e5be67b915:?] at org.apache.hudi.client.BaseHoodieWriteClient.lambda$startCommit$afea71c0$1(BaseHoodieWriteClient.java:810) ~[blob_p-abdf98cc6fdb80521c5886e97d0250884f55321b-5fd12d7a052c31efa7e4c3e5be67b915:?] 
at org.apache.hudi.common.util.CleanerUtils.rollbackFailedWrites(CleanerUtils.java:156) ~[blob_p-abdf98cc6fdb80521c5886e97d0250884f55321b-5fd12d7a052c31efa7e4c3e5be67b915:?] at org.apache.hudi.client.BaseHoodieWriteClient.startCommit(BaseHoodieWriteClient.java:809) ~[blob_p-abdf98cc6fdb80521c5886e97d0250884f55321b-5fd12d7a052c31efa7e4c3e5be67b915:?] at org.apache.hudi.sink.StreamWriteOperatorCoordinator.startInstant(StreamWriteOperatorCoordinator.java:399) ~[blob_p-abdf98cc6fdb80521c5886e97d0250884f55321b-5fd12d7a052c31efa7e4c3e5be67b915:?] at org.apache.hudi.sink.StreamWriteOperatorCoordinator.lambda$initInstant$6(StreamWriteOperatorCoordinator.java:426) ~[blob_p-abdf98cc6fdb80521c5886e97d0250884f55321b-5fd12d7a052c31efa7e4c3e5be67b915:?] at org.apache.hudi.sink.utils.NonThrownExecutor.lambda$wrapAction$0(NonThrownExecutor.java:130) ~[blob_p-abdf98cc6fdb80521c5886e97d0250884f55321b-5fd12d7a052c31efa7e4c3e5be67b915:?] at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source) [?:?] at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source) [?:?] at java.lang.Thread.run(Unknown Source) [?:?] ``` As a workaround, we forcibly killed the JobManager, after which the job recovered successfully. **Environment Description** * Hudi version : bdb50ddccc9631317dfb06a06abc38cbd3714ce8 * Flink version : 1.16.1 * Storage (HDFS/S3/GCS..) : S3 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscribe@hudi.apache.org For queries about this service, please contact Infrastructure at: users@infra.apache.org