We occasionally have issues with LVM freezing at which point no data can be
written to volumes and no LVM-commands (like lvs,vgs) can retrieve any data and
just get stuck. This happens on multiple machines that share the same config.
The issue starts while making changes to volumes — removing snapshots, removing
volumes, resizing volumes, etc.
For detailed information on one such server, check the Appendices
Our situation is the following.
We use an mdadm RAID configuration consisting of 4 or more SSDs/disks, where we
use part of each disk for a RAID1, RAID10, or RAID5 array. We create volumes on
2 nodes and use DRBD to keep these 2 volumes in sync, and we run a virtual
machine (using KVM) on this volume.
These freezes happen on different machines, using Ubuntu 16.04 and 18.04 with
different kernels (4.4.0, 4.15.0, and 5.0.0).
We have done extensive tests, but we are not able to reliably reproduce this
issue.
The issues seem to happen more often when volumes that are changed
(resized/removed) have been active for a longer time.
## Appendices:
# Appendix 1 - LVM Version
LVM version: 2.02.176(2) (2017-11-03)
Library version: 1.02.145 (2017-11-03)
Driver version: 4.39.0
Configuration: ./configure --build=x86_64-linux-gnu --prefix=/usr
--includedir=${prefix}/include --mandir=${prefix}/share/man
--infodir=${prefix}/share/info --sysconfdir=/etc --localstatedir=/var
--disable-silent-rules --libdir=${prefix}/lib/x86_64-linux-gnu
--libexecdir=${prefix}/lib/x86_64-linux-gnu --runstatedir=/run
--disable-maintainer-mode --disable-dependency-tracking --exec-prefix=
--bindir=/bin --libdir=/lib/x86_64-linux-gnu --sbindir=/sbin
--with-usrlibdir=/usr/lib/x86_64-linux-gnu --with-optimisation=-O2
--with-cache=internal --with-clvmd=corosync --with-cluster=internal
--with-device-uid=0 --with-device-gid=6 --with-device-mode=0660
--with-default-pid-dir=/run --with-default-run-dir=/run/lvm
--with-default-locking-dir=/run/lock/lvm --with-thin=internal
--with-thin-check=/usr/sbin/thin_check --with-thin-dump=/usr/sbin/thin_dump
--with-thin-repair=/usr/sbin/thin_repair --enable-applib --enable-blkid_wiping
--enable-cmdlib --enable-cmirrord --enable-dmeventd --enable-dbus-service
--enable-lvmetad --enable-lvmlockd-dlm --enable-lvmlockd-sanlock
--enable-lvmpolld --enable-notify-dbus --enable-pkgconfig --enable-readline
--enable-udev_rules --enable-udev_sync
# Appendix 2 - MDADM-config
/dev/md3:
Version : 1.2
Creation Time : Tue Aug 28 11:49:14 2018
Raid Level : raid10
Array Size : 2790712320 (2661.43 GiB 2857.69 GB)
Used Dev Size : 930237440 (887.14 GiB 952.56 GB)
Raid Devices : 6
Total Devices : 6
Persistence : Superblock is persistent
Intent Bitmap : Internal
Update Time : Wed Apr 8 16:23:34 2020
State : active
Active Devices : 6
Working Devices : 6
Failed Devices : 0
Spare Devices : 0
Layout : near=2
Chunk Size : 512K
Consistency Policy : bitmap
Name : node1:3
UUID : 62594d9d:de7eb2e6:bc3c1523:ff7327f7
Events : 3973
Number Major Minor RaidDevice State
0 8 4 0 active sync set-A /dev/sda4
1 8 20 1 active sync set-B /dev/sdb4
2 8 36 2 active sync set-A /dev/sdc4
3 8 52 3 active sync set-B /dev/sdd4
4 8 68 4 active sync set-A /dev/sde4
5 8 84 5 active sync set-B /dev/sdf4
# Appendix 3 - LVM Config
config {
checks=1
abort_on_errors=0
profile_dir="/etc/lvm/profile"
}
local {
}
dmeventd {
mirror_library="libdevmapper-event-lvm2mirror.so"
snapshot_library="libdevmapper-event-lvm2snapshot.so"
thin_library="libdevmapper-event-lvm2thin.so"
}
activation {
checks=0
udev_sync=1
udev_rules=1
verify_udev_operations=0
retry_deactivation=1
missing_stripe_filler="error"
use_linear_target=1
reserved_stack=64
reserved_memory=8192
process_priority=-18
raid_region_size=512
readahead="auto"
raid_fault_policy="warn"
mirror_image_fault_policy="remove"
mirror_log_fault_policy="allocate"
snapshot_autoextend_threshold=100
snapshot_autoextend_percent=20
thin_pool_autoextend_threshold=100
thin_pool_autoextend_percent=20
use_mlockall=0
monitoring=1
polling_interval=15
activation_mode="degraded"
}
global {
umask=63
test=0
units="h"
si_unit_consistency=1
suffix=1
activation=1
proc="/proc"
etc="/etc"
locking_type=1
wait_for_locks=1
fallback_to_clustered_locking=1
fallback_to_local_locking=1
locking_dir="/run/lock/lvm"
prioritise_write_locks=1
abort_on_internal_errors=0
detect_internal_vg_cache_corruption=0
metadata_read_only=0
mirror_segtype_default="raid1"
raid10_segtype_default="raid10"
sparse_segtype_default="thin"
use_lvmetad=0
use_lvmlockd=0
system_id_source="none"
use_lvmpolld=1
}
shell {
history_size=100
}
backup {
backup=1
backup_dir="/etc/lvm/backup"
archive=1
archive_dir="/etc/lvm/archive"
retain_min=10
retain_days=30
}
log {
verbose=0
silent=0
syslog=1
overwrite=0
level=0
indent=1
command_names=0
prefix=" "
activation=0
debug_classes=["memory","devices","activation","allocation","lvmetad","metadata","cache","locking","lvmpolld"]
}
allocation {
maximise_cling=1
use_blkid_wiping=1
wipe_signatures_when_zeroing_new_lvs=1
mirror_logs_require_sep