Good morning. When DRBD syncs only a few resources, everything works fine. But when DRBD needs to sync all resources (e.g. after a host comes back up), it hangs the business application running on top of it.
All our configuration drbd settings are default, this is a resource sample: resource "vm-100-disk-3" { options { cpu-mask ""; # default on-no-data-accessible io-error; # default auto-promote yes; # default peer-ack-window 4096s; # bytes, default peer-ack-delay 100; # milliseconds, default twopc-timeout 300; # 1/10 seconds, default twopc-retry-timeout 1; # 1/10 seconds, default auto-promote-timeout 20; # 1/10 seconds, default max-io-depth 8000; # default quorum majority; on-no-quorum io-error; quorum-minimum-redundancy off; # default on-suspended-primary-outdated disconnect; # default } _this_host { node-id 0; volume 0 { device minor 1017; disk "/dev/vgthc1/vm-100-disk-3_00000"; meta-disk internal; disk { size 0s; # bytes, default on-io-error detach; # default disk-barrier no; # default disk-flushes yes; # default disk-drain yes; # default md-flushes yes; # default resync-after -1; # default al-extents 1237; # default al-updates yes; # default discard-zeroes-if-aligned yes; # default disable-write-same no; # default disk-timeout 0; # 1/10 seconds, default read-balancing prefer-local; # default rs-discard-granularity 1048576; # bytes } } } connection { _peer_node_id 2; path { _this_host ipv4 10.0.7.106:7017; _remote_host ipv4 10.100.1.3:7017; } net { transport ""; # default protocol C; # default timeout 60; # 1/10 seconds, default max-epoch-size 2048; # default connect-int 10; # seconds, default ping-int 10; # seconds, default sndbuf-size 0; # bytes, default rcvbuf-size 0; # bytes, default ko-count 7; # default allow-two-primaries no; # default cram-hmac-alg "sha1"; shared-secret "*"; after-sb-0pri disconnect; # default after-sb-1pri disconnect; # default after-sb-2pri disconnect; # default always-asbp no; # default rr-conflict disconnect; # default ping-timeout 5; # 1/10 seconds, default data-integrity-alg ""; # default tcp-cork yes; # default on-congestion block; # default congestion-fill 0s; # bytes, default congestion-extents 1237; # default csums-alg ""; # 
default csums-after-crash-only no; # default verify-alg "crct10dif-pclmul"; use-rle yes; # default socket-check-timeout 0; # default fencing dont-care; # default max-buffers 2048; # default allow-remote-read yes; # default _name "C"; } volume 0 { disk { resync-rate 250k; # bytes/second, default c-plan-ahead 20; # 1/10 seconds, default c-delay-target 10; # 1/10 seconds, default c-fill-target 100s; # bytes, default c-max-rate 102400k; # bytes/second, default c-min-rate 250k; # bytes/second, default bitmap no; } } } connection { _peer_node_id 1; path { _this_host ipv4 10.0.7.106:7017; _remote_host ipv4 10.0.7.105:7017; } net { transport ""; # default protocol C; # default timeout 60; # 1/10 seconds, default max-epoch-size 2048; # default connect-int 10; # seconds, default ping-int 10; # seconds, default sndbuf-size 0; # bytes, default rcvbuf-size 0; # bytes, default ko-count 7; # default allow-two-primaries no; # default cram-hmac-alg "sha1"; shared-secret "*"; after-sb-0pri disconnect; # default after-sb-1pri disconnect; # default after-sb-2pri disconnect; # default always-asbp no; # default rr-conflict disconnect; # default ping-timeout 5; # 1/10 seconds, default data-integrity-alg ""; # default tcp-cork yes; # default on-congestion block; # default congestion-fill 0s; # bytes, default congestion-extents 1237; # default csums-alg ""; # default csums-after-crash-only no; # default verify-alg "crct10dif-pclmul"; use-rle yes; # default socket-check-timeout 0; # default fencing dont-care; # default max-buffers 2048; # default allow-remote-read yes; # default _name "T"; } volume 0 { disk { resync-rate 250k; # bytes/second, default c-plan-ahead 20; # 1/10 seconds, default c-delay-target 10; # 1/10 seconds, default c-fill-target 100s; # bytes, default c-max-rate 102400k; # bytes/second, default c-min-rate 250k; # bytes/second, default bitmap yes; # default } } } } We have 39 defined resources using the same settings. 
All of these resources live on the same RAID array, which is backed by two physical NVMe SSD drives. We have two combined hosts and a diskless satellite host. The network card between the two hosts is a 1Gb card. I have read the following guide https://kb.linbit.com/tuning-drbds-resync-controller and I think our current installation might need to be tuned in order to avoid those application hangs. I think that I have to tune c-max-rate for all the devices, but I am not sure. Is there a way to limit the total c-max-rate globally? Or do I have to limit it for every resource so that, when summed up, they don't exceed our current physical limitations? I've seen a global_common configuration, but I don't know whether it is meant to be a global configuration for the whole DRBD system or a configuration applied to each defined resource individually. If anyone can guide me through this I'll be grateful. Thanks and regards, Ferran
_______________________________________________ Star us on GITHUB: https://github.com/LINBIT drbd-user mailing list drbd-user@lists.linbit.com https://lists.linbit.com/mailman/listinfo/drbd-user