[ https://issues.apache.org/jira/browse/HUDI-4066?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Jian Feng reassigned HUDI-4066:
-------------------------------

    Assignee: Jian Feng

> HiveMetastoreBasedLockProvider can not release lock when writer fails
> ---------------------------------------------------------------------
>
>                 Key: HUDI-4066
>                 URL: https://issues.apache.org/jira/browse/HUDI-4066
>             Project: Apache Hudi
>          Issue Type: Bug
>          Components: multi-writer
>    Affects Versions: 0.10.1
>            Reporter: Jian Feng
>            Assignee: Jian Feng
>            Priority: Critical
>             Fix For: 1.0.0
>
>
> We use HiveMetastoreBasedLockProvider in our production environment: one
> writer ingests data with Flink while another deletes old partitions with
> Spark. When the Spark job fails, the lock is sometimes not released, after
> which all writers fail.
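> For context, the sketch below shows roughly how both writers are configured
> for multi-writer concurrency. This is a minimal illustration: the lock
> provider class, database, table, and metastore URI are taken from the error
> log below, while the remaining keys and values are assumptions about a
> typical setup, not copied from our jobs.
> {code:java}
> import java.util.HashMap;
> import java.util.Map;
>
> // Illustrative Hudi writer options enabling optimistic concurrency control
> // with the Hive-metastore-based lock provider; passed to the Spark
> // datasource writer via .options(hudiLockOptions). Only the provider class
> // and the lock database/table/URI come from this report.
> Map<String, String> hudiLockOptions = new HashMap<>();
> hudiLockOptions.put("hoodie.write.concurrency.mode", "optimistic_concurrency_control");
> hudiLockOptions.put("hoodie.cleaner.policy.failed.writes", "LAZY");
> hudiLockOptions.put("hoodie.write.lock.provider", "org.apache.hudi.hive.HiveMetastoreBasedLockProvider");
> hudiLockOptions.put("hoodie.write.lock.hivemetastore.database", "dev_video");
> hudiLockOptions.put("hoodie.write.lock.hivemetastore.table", "dwd_traffic_log");
> hudiLockOptions.put("hoodie.write.lock.hivemetastore.uris", "thrift://10.128.152.245:9083");
> {code}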
> {code:java}
> // error log
> 22/04/01 08:12:18 INFO TransactionManager: Transaction starting without a transaction owner
> 22/04/01 08:12:18 INFO LockManager: LockProvider org.apache.hudi.hive.HiveMetastoreBasedLockProvider
> 22/04/01 08:12:19 INFO metastore: Trying to connect to metastore with URI thrift://10.128.152.245:9083
> 22/04/01 08:12:19 INFO metastore: Opened a connection to metastore, current connections: 1
> 22/04/01 08:12:19 INFO metastore: Connected to metastore.
> 22/04/01 08:12:20 INFO HiveMetastoreBasedLockProvider: ACQUIRING lock at database dev_video and table dwd_traffic_log
> 22/04/01 08:12:25 INFO TransactionManager: Transaction ending without a transaction owner
> 22/04/01 08:12:25 INFO HiveMetastoreBasedLockProvider: RELEASING lock at database dev_video and table dwd_traffic_log
> 22/04/01 08:12:25 INFO TransactionManager: Transaction ended without a transaction owner
> Exception in thread "main" org.apache.hudi.exception.HoodieLockException: Unable to acquire lock, lock object
>     at org.apache.hudi.client.transaction.lock.LockManager.lock(LockManager.java:71)
>     at org.apache.hudi.client.transaction.TransactionManager.beginTransaction(TransactionManager.java:51)
>     at org.apache.hudi.client.SparkRDDWriteClient.getTableAndInitCtx(SparkRDDWriteClient.java:430)
>     at org.apache.hudi.client.SparkRDDWriteClient.deletePartitions(SparkRDDWriteClient.java:261)
>     at org.apache.hudi.DataSourceUtils.doDeletePartitionsOperation(DataSourceUtils.java:234)
>     at org.apache.hudi.HoodieSparkSqlWriter$.write(HoodieSparkSqlWriter.scala:217)
>     at org.apache.hudi.DefaultSource.createRelation(DefaultSource.scala:164)
>     at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:46)
>     at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:70)
>     at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:68)
>     at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:90)
>     at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:180)
>     at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)
>     at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
>     at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
>     at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:176)
>     at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:132)
>     at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:131)
>     at org.apache.spark.sql.DataFrameWriter.$anonfun$runCommand$1(DataFrameWriter.scala:991)
>     at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
>     at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
>     at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
>     at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
>     at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
>     at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:991)
>     at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:438)
>     at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:415)
>     at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:293)
>     at com.shopee.ci.hudi.tasks.ExpiredPartitionDelete$.$anonfun$main$2(ExpiredPartitionDelete.scala:82)
>     at com.shopee.ci.hudi.tasks.ExpiredPartitionDelete$.$anonfun$main$2$adapted(ExpiredPartitionDelete.scala:65)
>     at scala.collection.Iterator.foreach(Iterator.scala:941)
>     at scala.collection.Iterator.foreach$(Iterator.scala:941)
>     at scala.collection.AbstractIterator.foreach(Iterator.scala:1429)
>     at scala.collection.IterableLike.foreach(IterableLike.scala:74)
>     at scala.collection.IterableLike.foreach$(IterableLike.scala:73)
>     at scala.collection.AbstractIterable.foreach(Iterable.scala:56)
>     at com.shopee.ci.hudi.tasks.ExpiredPartitionDelete$.$anonfun$main$1(ExpiredPartitionDelete.scala:65)
>     at com.shopee.ci.hudi.tasks.ExpiredPartitionDelete$.$anonfun$main$1$adapted(ExpiredPartitionDelete.scala:61)
>     at scala.collection.Iterator.foreach(Iterator.scala:941)
>     at scala.collection.Iterator.foreach$(Iterator.scala:941)
>     at scala.collection.AbstractIterator.foreach(Iterator.scala:1429)
>     at scala.collection.IterableLike.foreach(IterableLike.scala:74)
>     at scala.collection.IterableLike.foreach$(IterableLike.scala:73)
>     at scala.collection.AbstractIterable.foreach(Iterable.scala:56)
>     at com.shopee.ci.hudi.tasks.ExpiredPartitionDelete$.main(ExpiredPartitionDelete.scala:61)
>     at com.shopee.ci.hudi.tasks.ExpiredPartitionDelete.main(ExpiredPartitionDelete.scala)
>     at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
>     at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
>     at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
>     at java.lang.reflect.Method.invoke(Method.java:498)
>     at org.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)
>     at org.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:951)
>     at org.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)
>     at org.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)
>     at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)
>     at org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1039)
>     at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1048)
>     at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
> Caused by: org.apache.hudi.exception.HoodieLockException: FAILED_TO_ACQUIRE lock at database dev_video and table dwd_traffic_log
>     at org.apache.hudi.hive.HiveMetastoreBasedLockProvider.tryLock(HiveMetastoreBasedLockProvider.java:114)
>     at org.apache.hudi.client.transaction.lock.LockManager.lock(LockManager.java:62)
>     ... 57 more
> Caused by: java.util.concurrent.ExecutionException: org.apache.thrift.TApplicationException: Internal error processing lock
>     at java.util.concurrent.FutureTask.report(FutureTask.java:122)
>     at java.util.concurrent.FutureTask.get(FutureTask.java:206)
>     at org.apache.hudi.hive.HiveMetastoreBasedLockProvider.acquireLockInternal(HiveMetastoreBasedLockProvider.java:185)
>     at org.apache.hudi.hive.HiveMetastoreBasedLockProvider.acquireLock(HiveMetastoreBasedLockProvider.java:139)
>     at org.apache.hudi.hive.HiveMetastoreBasedLockProvider.tryLock(HiveMetastoreBasedLockProvider.java:112)
>     ... 58 more
> Caused by: org.apache.thrift.TApplicationException: Internal error processing lock
>     at org.apache.thrift.TServiceClient.receiveBase(TServiceClient.java:79)
>     at org.apache.hadoop.hive.metastore.api.ThriftHiveMetastore$Client.recv_lock(ThriftHiveMetastore.java:4743)
>     at org.apache.hadoop.hive.metastore.api.ThriftHiveMetastore$Client.lock(ThriftHiveMetastore.java:4730)
>     at org.apache.hadoop.hive.metastore.HiveMetaStoreClient.lock(HiveMetaStoreClient.java:2174)
>     at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
>     at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
>     at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
>     at java.lang.reflect.Method.invoke(Method.java:498)
>     at org.apache.hadoop.hive.metastore.RetryingMetaStoreClient.invoke(RetryingMetaStoreClient.java:173)
>     at com.sun.proxy.$Proxy45.lock(Unknown Source)
>     at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
>     at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
>     at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
>     at java.lang.reflect.Method.invoke(Method.java:498)
>     at org.apache.hadoop.hive.metastore.HiveMetaStoreClient$SynchronizedHandler.invoke(HiveMetaStoreClient.java:2348)
>     at com.sun.proxy.$Proxy45.lock(Unknown Source)
>     at org.apache.hudi.hive.HiveMetastoreBasedLockProvider.lambda$acquireLockInternal$0(HiveMetastoreBasedLockProvider.java:184)
>     at java.util.concurrent.FutureTask.run(FutureTask.java:266)
>     at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
>     at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
> {code}
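> Until the provider releases locks on writer failure, the orphaned metastore
> lock has to be cleared by hand before any writer can proceed. Below is a
> minimal sketch of such a cleanup, assuming direct access to the same
> metastore; this is an operational workaround, not part of Hudi, and the
> class name ReleaseOrphanedLock is hypothetical:
> {code:java}
> import org.apache.hadoop.hive.conf.HiveConf;
> import org.apache.hadoop.hive.metastore.HiveMetaStoreClient;
> import org.apache.hadoop.hive.metastore.api.ShowLocksRequest;
> import org.apache.hadoop.hive.metastore.api.ShowLocksResponseElement;
>
> // Hypothetical one-off cleanup: list metastore locks and release any still
> // held on the affected table after the Spark writer died.
> public class ReleaseOrphanedLock {
>   public static void main(String[] args) throws Exception {
>     HiveConf conf = new HiveConf();
>     conf.setVar(HiveConf.ConfVars.METASTOREURIS, "thrift://10.128.152.245:9083");
>     HiveMetaStoreClient client = new HiveMetaStoreClient(conf);
>     try {
>       for (ShowLocksResponseElement lock : client.showLocks(new ShowLocksRequest()).getLocks()) {
>         if ("dev_video".equals(lock.getDbname()) && "dwd_traffic_log".equals(lock.getTablename())) {
>           client.unlock(lock.getLockid()); // release the orphaned lock
>         }
>       }
>     } finally {
>       client.close();
>     }
>   }
> }
> {code}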



--
This message was sent by Atlassian Jira
(v8.20.10#820010)
