Updated Branches:
  refs/heads/vmsync fc0713fd5 -> 42b483295

A temporary fix to address a MySQL deadlock issue


Project: http://git-wip-us.apache.org/repos/asf/cloudstack/repo
Commit: http://git-wip-us.apache.org/repos/asf/cloudstack/commit/42b48329
Tree: http://git-wip-us.apache.org/repos/asf/cloudstack/tree/42b48329
Diff: http://git-wip-us.apache.org/repos/asf/cloudstack/diff/42b48329

Branch: refs/heads/vmsync
Commit: 42b483295600188968ce603d61e22009e234c423
Parents: fc0713f
Author: Kelven Yang <[email protected]>
Authored: Wed Jun 26 15:15:51 2013 -0700
Committer: Kelven Yang <[email protected]>
Committed: Wed Jun 26 15:15:51 2013 -0700

----------------------------------------------------------------------
 .../cloudstack/framework/jobs/AsyncJob.java     |  2 +
 .../jobs/impl/AsyncJobManagerImpl.java          | 80 ++++++++++++++++++--
 2 files changed, 74 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/cloudstack/blob/42b48329/framework/jobs/src/org/apache/cloudstack/framework/jobs/AsyncJob.java
----------------------------------------------------------------------
diff --git 
a/framework/jobs/src/org/apache/cloudstack/framework/jobs/AsyncJob.java 
b/framework/jobs/src/org/apache/cloudstack/framework/jobs/AsyncJob.java
index 995eaaf..2ed75a9 100644
--- a/framework/jobs/src/org/apache/cloudstack/framework/jobs/AsyncJob.java
+++ b/framework/jobs/src/org/apache/cloudstack/framework/jobs/AsyncJob.java
@@ -39,6 +39,8 @@ public interface AsyncJob extends JobInfo {
         // need to distinguish them to such level. Therefore, only one wakeup 
signal
         // is defined
         public static final int SIGNAL_MASK_WAKEUP = 1;
+        
+        public static final String SYNC_LOCK_NAME = "SyncLock";
     }
        
     @Override

http://git-wip-us.apache.org/repos/asf/cloudstack/blob/42b48329/framework/jobs/src/org/apache/cloudstack/framework/jobs/impl/AsyncJobManagerImpl.java
----------------------------------------------------------------------
diff --git 
a/framework/jobs/src/org/apache/cloudstack/framework/jobs/impl/AsyncJobManagerImpl.java
 
b/framework/jobs/src/org/apache/cloudstack/framework/jobs/impl/AsyncJobManagerImpl.java
index 7b199ff..a59aea3 100644
--- 
a/framework/jobs/src/org/apache/cloudstack/framework/jobs/impl/AsyncJobManagerImpl.java
+++ 
b/framework/jobs/src/org/apache/cloudstack/framework/jobs/impl/AsyncJobManagerImpl.java
@@ -79,7 +79,9 @@ public class AsyncJobManagerImpl extends ManagerBase 
implements AsyncJobManager,
             "60", "Time (in minutes) for async-jobs to be forcely cancelled if 
it has been in process for long", true, null);
 
     private static final Logger s_logger = 
Logger.getLogger(AsyncJobManagerImpl.class);
+
     private static final int ACQUIRE_GLOBAL_LOCK_TIMEOUT_FOR_COOPERATION = 3;  
// 3 seconds
+    private static final int ACQUIRE_GLOBAL_LOCK_TIMEOUT_FOR_SYNC              
= 60;   // 60 seconds
 
     private static final int MAX_ONETIME_SCHEDULE_SIZE = 50;
     private static final int HEARTBEAT_INTERVAL = 2000;
@@ -338,7 +340,63 @@ public class AsyncJobManagerImpl extends ManagerBase 
implements AsyncJobManager,
     
     @Override @DB
     public void completeJoin(long joinJobId, JobInfo.Status joinStatus, String 
joinResult) {
-       _joinMapDao.completeJoin(joinJobId, joinStatus, joinResult, getMsid());
+       //
+       // TODO
+       // This is a temporary workaround for a strange MySQL deadlock:
+       // completeJoin() causes a deadlock on the async_job table.
+
+/*     
+       ------------------------
+       LATEST DETECTED DEADLOCK
+       ------------------------
+       130625 20:03:10
+       *** (1) TRANSACTION:
+       TRANSACTION 0 98087127, ACTIVE 0 sec, process no 1489, OS thread id 
139837829175040 fetching rows, thread declared inside InnoDB 494
+       mysql tables in use 2, locked 1
+       LOCK WAIT 3 lock struct(s), heap size 368, 2 row lock(s), undo log 
entries 1
+       MySQL thread id 28408, query id 368571321 localhost 127.0.0.1 cloud 
preparing
+       UPDATE async_job SET job_pending_signals=1 WHERE id IN (SELECT job_id 
FROM async_job_join_map WHERE join_job_id = 9)
+       *** (1) WAITING FOR THIS LOCK TO BE GRANTED:
+       RECORD LOCKS space id 0 page no 1275 n bits 80 index `PRIMARY` of table 
`cloud`.`async_job` trx id 0 98087127 lock_mode X locks rec but not gap waiting
+       Record lock, heap no 9 PHYSICAL RECORD: n_fields 26; compact format; 
info bits 0
+       0: len 8; hex 0000000000000008; asc         ;; 1: len 6; hex 
000005d8b0d8; asc       ;; 2: len 7; hex 00000009270110; asc     '  ;; 3: len 
8; hex 0000000000000002; asc         ;; 4: len 8; hex 0000000000000002; asc     
    ;; 5: SQL NULL; 6: SQL NULL; 7: len 30; hex 
6f72672e6170616368652e636c6f7564737461636b2e6170692e636f6d6d; asc 
org.apache.cloudstack.api.comm;...(truncated); 8: len 30; hex 
7b226964223a2232222c22706879736963616c6e6574776f726b6964223a; asc 
{"id":"2","physicalnetworkid":;...(truncated); 9: len 4; hex 80000000; asc     
;; 10: len 4; hex 80000001; asc     ;; 11: len 4; hex 80000000; asc     ;; 12: 
len 4; hex 80000000; asc     ;; 13: len 30; hex 
6f72672e6170616368652e636c6f7564737461636b2e6170692e72657370; asc 
org.apache.cloudstack.api.resp;...(truncated); 14: len 8; hex 80001a6f7bb0d0a8; 
asc    o{   ;; 15: len 8; hex 80001a6f7bb0d0a8; asc    o{   ;; 16: len 8; hex 
8000124f06cfd5b6; asc    O    ;; 17: len 8; hex 8000124f06cfd5b6; asc    O    
;; 18: SQL NULL; 19: SQL NULL; 20: len 30; hex 
66376466396532362d323139622d346338652d393231332d393766653636; asc 
f7df9e26-219b-4c8e-9213-97fe66;...(truncated); 21: len 30; hex 
36623238306364362d663436652d343563322d383833642d333863616439; asc 
6b280cd6-f46e-45c2-883d-38cad9;...(truncated); 22: SQL NULL; 23: len 21; hex 
4170694173796e634a6f6244697370617463686572; asc ApiAsyncJobDispatcher;; 24: SQL 
NULL; 25: len 4; hex 80000000; asc     ;;
+
+       *** (2) TRANSACTION:
+       TRANSACTION 0 98087128, ACTIVE 0 sec, process no 1489, OS thread id 
139837671909120 fetching rows, thread declared inside InnoDB 492
+       mysql tables in use 2, locked 1
+       3 lock struct(s), heap size 368, 2 row lock(s), undo log entries 1
+       MySQL thread id 28406, query id 368571323 localhost 127.0.0.1 cloud 
preparing
+       UPDATE async_job SET job_pending_signals=1 WHERE id IN (SELECT job_id 
FROM async_job_join_map WHERE join_job_id = 8)
+       *** (2) HOLDS THE LOCK(S):
+       RECORD LOCKS space id 0 page no 1275 n bits 80 index `PRIMARY` of table 
`cloud`.`async_job` trx id 0 98087128 lock_mode X locks rec but not gap
+       Record lock, heap no 9 PHYSICAL RECORD: n_fields 26; compact format; 
info bits 0
+       0: len 8; hex 0000000000000008; asc         ;; 1: len 6; hex 
000005d8b0d8; asc       ;; 2: len 7; hex 00000009270110; asc     '  ;; 3: len 
8; hex 0000000000000002; asc         ;; 4: len 8; hex 0000000000000002; asc     
    ;; 5: SQL NULL; 6: SQL NULL; 7: len 30; hex 
6f72672e6170616368652e636c6f7564737461636b2e6170692e636f6d6d; asc 
org.apache.cloudstack.api.comm;...(truncated); 8: len 30; hex 
7b226964223a2232222c22706879736963616c6e6574776f726b6964223a; asc 
{"id":"2","physicalnetworkid":;...(truncated); 9: len 4; hex 80000000; asc     
;; 10: len 4; hex 80000001; asc     ;; 11: len 4; hex 80000000; asc     ;; 12: 
len 4; hex 80000000; asc     ;; 13: len 30; hex 
6f72672e6170616368652e636c6f7564737461636b2e6170692e72657370; asc 
org.apache.cloudstack.api.resp;...(truncated); 14: len 8; hex 80001a6f7bb0d0a8; 
asc    o{   ;; 15: len 8; hex 80001a6f7bb0d0a8; asc    o{   ;; 16: len 8; hex 
8000124f06cfd5b6; asc    O    ;; 17: len 8; hex 8000124f06cfd5b6; asc    O    
;; 18: SQL NULL; 19: SQL NULL; 20: len 30; hex 
66376466396532362d323139622d346338652d393231332d393766653636; asc 
f7df9e26-219b-4c8e-9213-97fe66;...(truncated); 21: len 30; hex 
36623238306364362d663436652d343563322d383833642d333863616439; asc 
6b280cd6-f46e-45c2-883d-38cad9;...(truncated); 22: SQL NULL; 23: len 21; hex 
4170694173796e634a6f6244697370617463686572; asc ApiAsyncJobDispatcher;; 24: SQL 
NULL; 25: len 4; hex 80000000; asc     ;;
+
+       *** (2) WAITING FOR THIS LOCK TO BE GRANTED:
+       RECORD LOCKS space id 0 page no 1275 n bits 80 index `PRIMARY` of table 
`cloud`.`async_job` trx id 0 98087128 lock_mode X locks rec but not gap waiting
+       Record lock, heap no 10 PHYSICAL RECORD: n_fields 26; compact format; 
info bits 0
+       0: len 8; hex 0000000000000009; asc         ;; 1: len 6; hex 
000005d8b0d7; asc       ;; 2: len 7; hex 00000009280110; asc     (  ;; 3: len 
8; hex 0000000000000002; asc         ;; 4: len 8; hex 0000000000000002; asc     
    ;; 5: SQL NULL; 6: SQL NULL; 7: len 30; hex 
6f72672e6170616368652e636c6f7564737461636b2e6170692e636f6d6d; asc 
org.apache.cloudstack.api.comm;...(truncated); 8: len 30; hex 
7b226964223a2233222c22706879736963616c6e6574776f726b6964223a; asc 
{"id":"3","physicalnetworkid":;...(truncated); 9: len 4; hex 80000000; asc     
;; 10: len 4; hex 80000001; asc     ;; 11: len 4; hex 80000000; asc     ;; 12: 
len 4; hex 80000000; asc     ;; 13: len 30; hex 
6f72672e6170616368652e636c6f7564737461636b2e6170692e72657370; asc 
org.apache.cloudstack.api.resp;...(truncated); 14: len 8; hex 80001a6f7bb0d0a8; 
asc    o{   ;; 15: len 8; hex 80001a6f7bb0d0a8; asc    o{   ;; 16: len 8; hex 
8000124f06cfd5b6; asc    O    ;; 17: len 8; hex 8000124f06cfd5b6; asc    O    
;; 18: SQL NULL; 19: SQL NULL; 20: len 30; hex 
62313065306432342d336233352d343663622d386361622d623933623562; asc 
b10e0d24-3b35-46cb-8cab-b93b5b;...(truncated); 21: len 30; hex 
39353664383563632d383336622d346663612d623738622d646238343739; asc 
956d85cc-836b-4fca-b78b-db8479;...(truncated); 22: SQL NULL; 23: len 21; hex 
4170694173796e634a6f6244697370617463686572; asc ApiAsyncJobDispatcher;; 24: SQL 
NULL; 25: len 4; hex 80000000; asc     ;;
+
+       *** WE ROLL BACK TRANSACTION (2)
+*/     
+       
+       //
+       // TODO
+       // ACQUIRE_GLOBAL_LOCK_TIMEOUT_FOR_SYNC is a hard-coded timeout value;
+       // it should actually be kept in sync with the MySQL settings
+       //
+       // TODO
+       // how to handle failures from locking?
+       
+       if(_jobDao.lockInLockTable(AsyncJob.Contants.SYNC_LOCK_NAME, 
ACQUIRE_GLOBAL_LOCK_TIMEOUT_FOR_SYNC)) {
+               try {
+                       _joinMapDao.completeJoin(joinJobId, joinStatus, 
joinResult, getMsid());
+               } finally {
+                       
_jobDao.unlockFromLockTable(AsyncJob.Contants.SYNC_LOCK_NAME);
+               }
+       } else {
+               s_logger.error("If this happens, it means too bad");
+       }
     }
     
     @Override
@@ -642,13 +700,19 @@ public class AsyncJobManagerImpl extends ManagerBase 
implements AsyncJobManager,
                         }
                     }
               
-                    List<Long> standaloneWakeupJobs = _joinMapDao.wakeupScan();
-                    for(Long jobId : standaloneWakeupJobs) {
-                       // TODO, we assume that all jobs in this category is 
API job only
-                       AsyncJobVO job = _jobDao.findById(jobId);
-                        if (job != null && (job.getPendingSignals() & 
AsyncJob.Contants.SIGNAL_MASK_WAKEUP) != 0)
-                           scheduleExecution(job, false);
-                    }
+                       
if(_jobDao.lockInLockTable(AsyncJob.Contants.SYNC_LOCK_NAME, 
ACQUIRE_GLOBAL_LOCK_TIMEOUT_FOR_SYNC)) {
+                               try {
+                                   List<Long> standaloneWakeupJobs = 
_joinMapDao.wakeupScan();
+                                   for(Long jobId : standaloneWakeupJobs) {
+                                       // TODO, we assume that all jobs in 
this category is API job only
+                                       AsyncJobVO job = 
_jobDao.findById(jobId);
+                                       if (job != null && 
(job.getPendingSignals() & AsyncJob.Contants.SIGNAL_MASK_WAKEUP) != 0)
+                                           scheduleExecution(job, false);
+                                   }
+                               } finally {
+                                       
_jobDao.unlockFromLockTable(AsyncJob.Contants.SYNC_LOCK_NAME);
+                               }
+                       }
                 } catch(Throwable e) {
                     s_logger.error("Unexpected exception when trying to 
execute queue item, ", e);
                 } finally {

Reply via email to