Hi,

I'm trying to set up a Pacemaker cluster based on Active/Active
(dual-primary) DRBD and GFS2.

Everything works fine on a normal startup, but as soon as I start
testing failure scenarios I run into problems mounting the GFS2
partition that I cannot recover from.

Here is what I did and what happens (rough commands after the list):

  - Removed the network link between the two nodes.
  - Watched how the cluster behaved for a while.
  - Brought the network interface back up.
  - Since one machine had been stonithed by the other (meatware, for the
tests), I rebooted it.
  - On reboot, the cluster cannot get the Filesystem resource up and
hits the start timeout.
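
In commands, the test sequence was roughly the following (eth1 is just
a placeholder for whatever interface carries the cluster/DRBD traffic):

    # ip link set eth1 down     # cut the link between the nodes
    # crm_mon -1                # watch how the cluster reacts
    # ip link set eth1 up       # bring the link back
    # reboot                    # on the node that got stonithed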

This is what I did to get details on the mount operation:

    # strace /sbin/mount.gfs2 /dev/drbd0 /data -o rw
    ...
    socket(PF_FILE, SOCK_STREAM, 0)         = 3
    connect(3, {sa_family=AF_FILE, path=@"gfsc_sock"}, 12) = 0
    write(3, "\\o\\o\1\0\1\0\7\0\0\0\0\0\0\0`p\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 28768) = 28768
    read(3,

The trace ends there: mount.gfs2 blocks in that read from the
gfs_controld socket ("gfsc_sock") and never returns, until the
Filesystem resource start times out.

I suspect the DLM is holding one more lock than it should. The GFS2
partition was created with 2 journals (and it has to run on 2 nodes).
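
Is there a good way to confirm that? What I had in mind was something
like the following (assuming dlm_tool and gfs_control are the right
tools here; names may differ between distributions):

    # dlm_tool ls          # DLM lockspaces still known on this node
    # gfs_control ls       # mount groups known to gfs_controld.pcmk
    # gfs_control dump     # gfs_controld debug log around the failed mount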

Does anyone rely on such a setup in production? Really?
If so, can you help me debug my problem? The Pacemaker config is pretty
much the one from the docs (DRBD+GFS2). In case it matters, it is shown
below.
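
(For reference: a failed start is fatal by default, so the Filesystem
clone has to be cleaned up before Pacemaker will retry the mount,
something like:)

    # crm resource cleanup orque-fs-clone
    # crm_mon -1 -f        # check remaining failed actions / fail counts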

Thank you!

node orque \
        attributes standby="false"
node orque2 \
        attributes standby="off"
primitive drbd-data ocf:linbit:drbd \
        params drbd_resource="orque-raid" \
        op start interval="0" timeout="240s" start-delay="5s" \
        op stop interval="0" timeout="100s" \
        op monitor interval="30s" timeout="30s" start-delay="5s"
primitive dlm ocf:pacemaker:controld \
        op monitor interval="120s" \
        op start interval="0" timeout="90s" \
        op stop interval="0" timeout="100s"
primitive gfs-control ocf:pacemaker:controld \
        params daemon="gfs_controld.pcmk" args="-g 0" \
        op monitor interval="120s" \
        op start interval="0" timeout="90s" \
        op stop interval="0" timeout="100s"
primitive orque-fs ocf:heartbeat:Filesystem \
        params device="/dev/drbd/by-res/orque-raid" directory="/data" fstype="gfs2" \
        op start interval="0" timeout="60s" \
        op stop interval="0" timeout="60s"
primitive kvm-adonga ocf:heartbeat:VirtualDomain \
        params config="/etc/libvirt/qemu/adonga.xml"
hypervisor="qemu:///system" migration_transport="ssh" \
        meta allow-migrate="true" target-role="Started" is-managed="true" \
        op start interval="0" timeout="200s" \
        op stop interval="0" timeout="200s" \
        op monitor interval="10" timeout="200s" on-fail="restart" depth="0"
primitive kvm-observatoire-test ocf:heartbeat:VirtualDomain \
        params config="/etc/libvirt/qemu/observatoire-test.xml"
hypervisor="qemu:///system" migration_transport="ssh" \
        meta allow-migrate="true" target-role="Started" is-managed="true" \
        op start interval="0" timeout="200s" \
        op stop interval="0" timeout="200s" \
        op monitor interval="10" timeout="200s" on-fail="restart" depth="0"
primitive kvm-testVM ocf:heartbeat:VirtualDomain \
        params config="/etc/libvirt/qemu/testVM.xml"
hypervisor="qemu:///system" migration_transport="ssh" \
        meta allow-migrate="true" target-role="Stopped" is-managed="true" \
        op start interval="0" timeout="200s" \
        op stop interval="0" timeout="200s" \
        op monitor interval="10" timeout="200s" on-fail="restart" depth="0"
primitive orque-fencing stonith:meatware \
        params hostlist="orque" \
        meta is-managed="true"
primitive orque2-fencing stonith:meatware \
        params hostlist="orque2" \
        meta is-managed="true" target-role="Started"
ms drbd-data-clone drbd-data \
        meta master-max="2" master-node-max="1" clone-max="2" clone-node-max="1" notify="true"
clone dlm-clone dlm \
        meta interleave="true" target-role="Started"
clone gfs-clone gfs-control \
        meta interleave="true" target-role="Started"
clone orque-fs-clone orque-fs \
        meta is-managed="true" target-role="Started" interleave="true" ordered="true"
location kvm-testVM-prefers-orque kvm-testVM 50: orque
location loc-orque-fencing orque-fencing -inf: orque
location loc-orque2-fencing orque2-fencing -inf: orque2
colocation gfs-with-dlm inf: gfs-clone dlm-clone
colocation kvm-adonga-with-orque-fs inf: kvm-adonga orque-fs-clone
colocation kvm-observatoire-test-with-orque-fs inf: kvm-observatoire-test orque-fs-clone
colocation kvm-testVM-with-orque-fs inf: kvm-testVM orque-fs-clone
colocation orque-fs-with-gfs-control inf: orque-fs-clone gfs-clone
order gfs-after-dlm inf: dlm-clone gfs-clone
order kvm-adonga-after-orque-fs inf: orque-fs-clone kvm-adonga
order kvm-observatoire-test-after-orque-fs inf: orque-fs-clone kvm-observatoire-test
order kvm-testVM-after-orque-fs inf: orque-fs-clone kvm-testVM
order orque-fs-after-drbd-data inf: drbd-data-clone:promote orque-fs-clone:start
order orque-fs-after-gfs-control inf: gfs-clone orque-fs-clone
property $id="cib-bootstrap-options" \
        dc-version="1.0.9-unknown" \
        cluster-infrastructure="openais" \
        expected-quorum-votes="2" \
        stonith-enabled="true" \
        no-quorum-policy="ignore" \
        last-lrm-refresh="1299772235"
rsc_defaults $id="rsc-options" \
        resource-stickiness="100"
