Hello,

attached is a small helper script that parses
the output of /proc/locks. The file format is described here:
https://www.centos.org/docs/5/html/5.2/Deployment_Guide/s2-proc-locks.html

It resolves the program name from the pid number and also
resolves the filename from the inode number.

That way we can see which program is currently holding an exclusive
lock on a file and which program is waiting for the lock to come free.
This would probably also show an ABBA deadlock,
though I haven't encountered one in the wild yet.

You may need to adapt the paths in the "config" section of the program
to your local setup. 

The layout of /proc/locks might change depending on the kernel version,
I've tested it with 3.14.x. The script works with python 3.x and 2.7.

When you call it with "--show-non-waiting", it will show you
which mailboxes currently hold a READ lock. The program does a bit
of filtering to keep the noise ratio down, so be sure to check
out the other options described in "--help".

Hopefully this helps others if they should run into a deadlock situation.

Cheers,
Thomas
#!/usr/bin/python3
# Parse /proc/locks and list waiting programs.
# Useful for identifying the cause of a deadlock.
# Should be executed as root for /proc access.
#
# If you identified the first hanging process,
# run "gstack pid" to get a stack trace.
#
# Licensed under the same terms as cyrus-imapd 2.4 / 2.5 / 2.6
# (c) 2015 Intra2net AG - Thomas Jarosch
import os
import re
import argparse

parser = argparse.ArgumentParser(description='Parse /proc/locks with special '
                                             'regard for cyrus imapd')
parser.add_argument('--all-programs', action='store_true',
                    default=False, help='Show all locks, not just cyrus')
parser.add_argument('--show-sockets', action='store_true',
                    default=False, help='Show locks in imap-db/socket/')
parser.add_argument('--show-pidfiles', action='store_true',
                    default=False, help='Show locks in /var/run')
parser.add_argument('--show-non-waiting', action='store_true',
                    default=False, help="Show locks that don't have waiters")

# config: adapt these paths to the local setup
cyrus_bindir = '/usr/cyrus/bin/'
socket_dir = '/datastore/imap-db/socket/'

# Matches one line of /proc/locks after runs of whitespace have been
# collapsed to single spaces. Format examples (before collapsing):
# 46: FLOCK  ADVISORY  WRITE 5542 00:25:829847 0 EOF
# 46: -> FLOCK  ADVISORY  WRITE 5544 00:25:829847 0 EOF
# 50: POSIX MANDATORY READ  4820 fd:04:3815033 1073741826 1073742335
LOCK_LINE_RE = re.compile(r'(?P<id>\d+): (?P<waiting>-> )?[A-Z]+ [A-Z]+ '
                          r'(?P<mode>READ|WRITE) '
                          r'(?P<pid>\d+) '
                          r'(?P<dev_major>[a-f0-9]+):'
                          r'(?P<dev_minor>[a-f0-9]+):'
                          r'(?P<inode>\d+) ')


def parse_lock_line(line):
    """Parse a single /proc/locks line.

    Returns a dict with the keys 'id' (lock number as string), 'waiting'
    (True for "->" lines), 'mode' ('READ' or 'WRITE'), 'pid' (string),
    'device' (device number comparable to os.stat().st_dev) and 'inode'
    (int), or None if the line does not have the expected layout.
    """
    # Collapse every run of whitespace, not just double spaces
    fields = LOCK_LINE_RE.match(' '.join(line.split()))
    if fields is None:
        return None

    # The device is printed as hex major:minor; combine both parts
    # the same way os.stat() reports them
    device = os.makedev(int(fields.group('dev_major'), 16),
                        int(fields.group('dev_minor'), 16))

    return {'id': fields.group('id'),
            'waiting': fields.group('waiting') is not None,
            'mode': fields.group('mode'),
            'pid': fields.group('pid'),
            'device': device,
            'inode': int(fields.group('inode'))}


def lookup_lock_owner(pid, device, inode, proc_root='/proc'):
    """Resolve program name and locked filename for a lock entry.

    Looks at <proc_root>/<pid>/exe for the program and scans
    <proc_root>/<pid>/fd for an open file matching device and inode.

    Returns a (prog_name, locked_filename) tuple; either element is
    'UNKNOWN' if it could not be determined. Returns None if the
    process directory does not exist.
    """
    proc_path = os.path.join(proc_root, pid)
    if not os.path.isdir(proc_path):
        return None

    try:
        prog_name = os.readlink(os.path.join(proc_path, 'exe'))
    except OSError:
        # The process may have exited in the meantime or
        # the link is not readable for us
        prog_name = 'UNKNOWN'

    fd_path = os.path.join(proc_path, 'fd')
    try:
        fd_names = os.listdir(fd_path)
    except OSError:
        fd_names = []

    locked_filename = 'UNKNOWN'
    for fd_name in fd_names:
        fd_fullpath = os.path.join(fd_path, fd_name)
        try:
            stat_res = os.stat(fd_fullpath)
            if stat_res.st_ino == inode and stat_res.st_dev == device:
                locked_filename = os.readlink(fd_fullpath)
                break
        except OSError:
            # File descriptors come and go while we scan them
            continue

    return prog_name, locked_filename


def read_locks(locks_path='/proc/locks', proc_root='/proc'):
    """Read all locks from locks_path and attach waiters to their holder.

    Returns a list of dicts with the keys 'prog', 'pid', 'mode', 'file'
    and 'waiters'. 'waiters' is a list of dicts of the same shape, one
    for every "->" line that carries the same lock number as the holder.
    Lines that cannot be parsed or whose process is gone are reported
    on stdout and skipped.
    """
    locks = []
    holder_id = None
    holder = None

    with open(locks_path) as f:
        for line in f:
            entry = parse_lock_line(line)
            if entry is None:
                print('WARN: Ignoring unmatched line output: {0}'.format(
                    line.rstrip('\n')))
                continue

            if not entry['waiting']:
                # A new holder line starts: forget the previous holder
                # even if this one can't be resolved below
                holder_id = entry['id']
                holder = None

            owner = lookup_lock_owner(entry['pid'], entry['device'],
                                      entry['inode'], proc_root)
            if owner is None:
                print('INFO: Program with pid {0} vanished'.format(
                    entry['pid']))
                continue

            new_lock = {'prog': owner[0],
                        'pid': entry['pid'],
                        'mode': entry['mode'],
                        'file': owner[1],
                        'waiters': []
                        }

            if not entry['waiting']:
                holder = new_lock
                locks.append(new_lock)
            elif holder is not None and entry['id'] == holder_id:
                holder['waiters'].append(new_lock)
            else:
                # The holder line was skipped. Attaching the waiter
                # to the previously stored lock would be misleading.
                print('WARN: Ignoring waiter without known lock holder: '
                      '{0}'.format(line.rstrip('\n')))

    return locks


def report_locks(locks, args):
    """Print locks and their waiters, filtered by the command line options.

    'locks' is the list returned by read_locks(), 'args' the parsed
    command line. Returns True if at least one lock was printed.
    """
    shown_something = False
    for lock in locks:
        prog = lock['prog']
        filename = lock['file']
        # Skip known locks that are always waiting
        if filename.startswith(socket_dir) and not args.show_sockets:
            continue
        if filename.endswith('.pid') and not args.show_pidfiles:
            continue
        if not prog.startswith(cyrus_bindir) and not args.all_programs:
            continue

        waiters = lock['waiters']
        if not waiters and not args.show_non_waiting:
            continue

        shown_something = True

        print('{0} (pid {1}) holding {2} lock for {3}'.format(
            prog, lock['pid'], lock['mode'], lock['file']))

        for waiter in waiters:
            print('{0} (pid {1}) ++WAITING++ for {2} lock on {3}'.format(
                waiter['prog'], waiter['pid'], waiter['mode'], waiter['file']))

        if waiters:
            print('')

    if locks and not shown_something:
        print('Hint: No locks shown. '
              'Try --all-programs or --help for more modes')

    return shown_something


def main(argv=None):
    """Parse the command line, read /proc/locks and print the report."""
    args = parser.parse_args(argv)
    report_locks(read_locks(), args)


if __name__ == '__main__':
    main()

Reply via email to