Hello, attached is a small helper script that parses the output of /proc/locks. The file format is described here: https://www.centos.org/docs/5/html/5.2/Deployment_Guide/s2-proc-locks.html
It resolves the program name from the pid number and also resolves the filename from the inode number. That way we can see which program is currently holding an exclusive lock on a file and which program is waiting for the lock to come free. This would probably also show an ABBA deadlock, though I haven't encountered one in the wild yet. You may need to adapt the paths in the "config" section of the program to your local setup. The layout of /proc/locks might change depending on the kernel version; I've tested it with 3.14.x. The script works with Python 3.x and 2.7. When you call it with "--show-non-waiting", it will also show you which mailboxes currently have a READ lock held on them. The program does a bit of filtering to keep the noise level down, so be sure to check out the other options described in "--help". Hopefully this helps others if they should run into a deadlock situation. Cheers, Thomas
#!/usr/bin/python3
# Parse /proc/locks and list waiting programs.
# Useful for identifying the cause of a deadlock.
# Should be executed as root for /proc access.
#
# If you identified the first hanging process,
# run "gstack pid" to get a stack trace.
#
# Licensed under the same terms as cyrus-imapd 2.4 / 2.5 / 2.6
# (c) 2015 Intra2net AG - Thomas Jarosch

import os
import re
import argparse

# config: adapt these paths to your local setup
cyrus_bindir = '/usr/cyrus/bin/'
socket_dir = '/datastore/imap-db/socket/'

# Format examples (after runs of whitespace were collapsed):
# 46: FLOCK ADVISORY WRITE 5542 00:25:829847 0 EOF
# 46: -> FLOCK ADVISORY WRITE 5544 00:25:829847 0 EOF
# 50: POSIX MANDATORY READ 4820 fd:04:3815033 1073741826 1073742335
#
# Major and minor are matched with "+" instead of a fixed width of two:
# the numbers are printed in hex and can need more than two digits.
LOCK_LINE_RE = re.compile(
    r'(?P<lock_id>\d+): (?P<waiting>-> )?[A-Z]+ [A-Z]+ '
    r'(?P<mode>READ|WRITE) '
    r'(?P<pid>\d+) '
    r'(?P<dev_major>[a-f0-9]+):'
    r'(?P<dev_minor>[a-f0-9]+):'
    r'(?P<inode>\d+)(?:\s|$)')


def parse_lock_line(line):
    """Parse a single line of /proc/locks.

    Returns a dict with the keys 'lock_id' (int), 'waiting' (bool),
    'mode' ('READ' or 'WRITE'), 'pid' (str), 'dev_major' (int),
    'dev_minor' (int) and 'inode' (int), or None if the line does
    not match the expected layout.
    """
    # Collapse every run of whitespace to one space (this also drops
    # the trailing newline) so the regex only has to deal with single
    # separators, no matter how wide the kernel padded the columns.
    normalized = ' '.join(line.split())

    fields = LOCK_LINE_RE.match(normalized)
    if fields is None:
        return None

    return {'lock_id': int(fields.group('lock_id')),
            'waiting': fields.group('waiting') is not None,
            'mode': fields.group('mode'),
            'pid': fields.group('pid'),
            'dev_major': int(fields.group('dev_major'), 16),
            'dev_minor': int(fields.group('dev_minor'), 16),
            'inode': int(fields.group('inode'))}


def lookup_process(pid, devnode, inode):
    """Resolve program name and locked filename via /proc/<pid>.

    pid is the pid as a string, devnode a device number comparable
    to os.stat().st_dev and inode the inode number of the locked file.

    Returns a tuple (prog_name, locked_filename), or None if the
    process does not exist (anymore). Either element is 'UNKNOWN'
    if it could not be resolved.
    """
    proc_path = os.path.join('/proc', pid)

    try:
        prog_name = os.readlink(os.path.join(proc_path, 'exe'))
    except OSError:
        if not os.path.isdir(proc_path):
            return None
        # process exists, but the exe link is not readable
        prog_name = 'UNKNOWN'

    # The process may exit while we look at it: treat an unreadable
    # fd directory like an empty one.
    fd_path = os.path.join(proc_path, 'fd')
    try:
        fd_names = os.listdir(fd_path)
    except OSError:
        fd_names = []

    # look up filename
    locked_filename = 'UNKNOWN'
    for fd_name in fd_names:
        fd_fullpath = os.path.join(fd_path, fd_name)
        try:
            stat_res = os.stat(fd_fullpath)
            if stat_res.st_ino == inode and stat_res.st_dev == devnode:
                locked_filename = os.readlink(fd_fullpath)
                break
        except OSError:
            # fd was closed in the meantime, try the next one
            continue

    return prog_name, locked_filename


def build_lock_table(lines, lookup=lookup_process):
    """Turn the lines of /proc/locks into a list of held locks.

    lines is an iterable of text lines. lookup is called as
    lookup(pid, devnode, inode) and has to return a tuple
    (prog_name, locked_filename) or None for a vanished process.

    Returns a list of dicts with the keys 'prog', 'pid', 'mode',
    'file' and 'waiters'. 'waiters' is a list of dicts of the
    same layout for the programs blocked on that lock.
    """
    locks = []
    # Waiters carry the same leading number as the lock they block on.
    # Use that instead of "the previous entry", which is wrong as soon
    # as the line of the holder was skipped.
    holders = {}

    for line in lines:
        rec = parse_lock_line(line)
        if rec is None:
            print('WARN: Ignoring unmatched line output: {0}'.format(
                  line.rstrip('\n')))
            continue

        pid = rec['pid']
        if rec['waiting'] and rec['lock_id'] not in holders:
            print('INFO: Holder of lock {0} is not available, skipping '
                  'waiting pid {1}'.format(rec['lock_id'], pid))
            continue

        # build the device number the same way os.stat() reports it
        devnode = os.makedev(rec['dev_major'], rec['dev_minor'])

        resolved = lookup(pid, devnode, rec['inode'])
        if resolved is None:
            print('INFO: Program with pid {0} vanished'.format(pid))
            continue
        prog_name, locked_filename = resolved

        # store
        new_lock = {'prog': prog_name,
                    'pid': pid,
                    'mode': rec['mode'],
                    'file': locked_filename,
                    'waiters': []}

        if rec['waiting']:
            holders[rec['lock_id']]['waiters'].append(new_lock)
        else:
            holders[rec['lock_id']] = new_lock
            locks.append(new_lock)

    return locks


def report_locks(locks, args):
    """Print the locks and their waiters, filtered by the options.

    locks is the list returned by build_lock_table(). args needs the
    boolean attributes all_programs, show_sockets, show_pidfiles and
    show_non_waiting.

    Returns True if at least one lock was printed.
    """
    shown_something = False

    for lock in locks:
        prog = lock['prog']
        filename = lock['file']

        # Skip known locks that are always waiting
        if filename.startswith(socket_dir) and not args.show_sockets:
            continue
        if filename.endswith('.pid') and not args.show_pidfiles:
            continue
        if not prog.startswith(cyrus_bindir) and not args.all_programs:
            continue

        waiters = lock['waiters']
        if waiters or args.show_non_waiting:
            shown_something = True
            print('{0} (pid {1}) holding {2} lock for {3}'.format(
                  prog, lock['pid'], lock['mode'], filename))

        for waiter in waiters:
            print('{0} (pid {1}) ++WAITING++ for {2} lock on {3}'.format(
                  waiter['prog'], waiter['pid'],
                  waiter['mode'], waiter['file']))

        if waiters:
            print('')

    if locks and not shown_something:
        print('Hint: No locks shown. Try --all-programs or --help '
              'for more modes')

    return shown_something


def parse_args(argv=None):
    """Parse the command line (sys.argv if argv is None)."""
    parser = argparse.ArgumentParser(
        description='Parse /proc/locks with special '
                    'regard for cyrus imapd')
    parser.add_argument('--all-programs', action='store_true',
                        default=False,
                        help='Show all locks, not just cyrus')
    parser.add_argument('--show-sockets', action='store_true',
                        default=False,
                        help='Show locks in imap-db/socket/')
    parser.add_argument('--show-pidfiles', action='store_true',
                        default=False,
                        help='Show locks in /var/run')
    parser.add_argument('--show-non-waiting', action='store_true',
                        default=False,
                        help="Show locks that don't have waiters")
    return parser.parse_args(argv)


def main(argv=None):
    """Read /proc/locks and print the locks and possible waiters."""
    args = parse_args(argv)

    # parse locks and waiters
    with open('/proc/locks') as f:
        locks = build_lock_table(f)

    # Output locks and possible waiters
    report_locks(locks, args)


if __name__ == '__main__':
    main()