On Thu, Mar 10, 2011 at 5:53 PM, Jonathan Schaeffer
<jonathan.schaef...@univ-brest.fr> wrote:
> Hi,
>
> I'm trying to set up a Pacemaker cluster based on DRBD Active/Active
> and GFS2.
>
> Everything is working fine on normal startup. But when I try to mess
> around with the cluster, I come across unrecoverable problems with the
> GFS2 partition mounting.
>
> Here is what I did and what happens:
>
>  - Remove the network link between the two nodes.
>  - Watch how the cluster behaves for a while.
>  - Bring the network interface up again.
>  - As one machine was stonithed by the other (meatware for the tests),
> I restarted the node.

Did you run the meatware confirmation command too?
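
If not, that would explain why nothing recovers: the meatware plugin
deliberately blocks until an operator confirms that the node really was
reset. Assuming the cluster-glue meatclient binary is installed, the
confirmation looks something like this, run on the surviving node once you
have power-cycled the victim by hand:

    # meatclient -c orque2    # if orque2 was the node that got fenced

Until that acknowledgement arrives, fencing never completes and the
DLM/GFS2 layers stay frozen.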

>  - on reboot, the cluster can't get the Filesystem resource up and hits
> the timeout.
>
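
When a rebooted node then cannot mount and the Filesystem resource times
out, the usual suspect is the fencing/DLM layer rather than GFS2 itself:
gfs_controld will not let a node join the mount group while dlm_controld
still thinks a fence operation is pending. Assuming the cluster3 userland
tools are installed, you can see the state with something like:

    # dlm_tool ls
    # gfs_control ls

If the lockspace or mount group is stuck in a change/wait state there, the
mount timeout is only a symptom.
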
> This is what I did to get details of the mount operation:
>
>    # strace /sbin/mount.gfs2 /dev/drbd0 /data -o rw
>    ...
>    socket(PF_FILE, SOCK_STREAM, 0)         = 3
>    connect(3, {sa_family=AF_FILE, path=@"gfsc_sock"}, 12) = 0
>    write(3,
> "\\o\\o\1\0\1\0\7\0\0\0\0\0\0\0`p\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...,
> 28768) = 28768
>    read(3,
>
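
That blocked read() is the interesting part: mount.gfs2 has sent its join
request to gfs_controld over the "gfsc_sock" socket and is waiting for the
reply, so the hang is in gfs_controld (or in dlm_controld/fencing below
it), not in the filesystem itself. If it happens again, dumping the
daemons' state from another shell should show what they are waiting for
(again assuming the cluster3 tools):

    # gfs_control dump
    # dlm_tool dump
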
> I suspect the DLM is holding one more lock than necessary. The GFS2
> partition was created with 2 journals (and has to run on 2 nodes).
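
A journal shortage is probably not it: with 2 journals and 2 nodes you are
at the limit, but running out of journals normally makes the mount fail
quickly with an explicit "no free journals" type of error rather than hang
on the control socket. If you want to double-check the journal count, this
should work on the unmounted device (gfs2-utils; device name is yours):

    # gfs2_edit -p jindex /dev/drbd0

or, on a node where /data is still mounted:

    # gfs2_tool journals /data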
>
> Does anyone rely on such a setup for production use?
> Really?
> If so, can you help me debug my problem? The Pacemaker config is pretty
> much as in the docs (DRBD+GFS2). In case it matters, the config is shown
> below.
>
> Thank you !
>
> node orque \
>        attributes standby="false"
> node orque2 \
>        attributes standby="off"
> primitive drbd-data ocf:linbit:drbd \
>        params drbd_resource="orque-raid" \
>        op start interval="0" timeout="240s" start-delay="5s" \
>        op stop interval="0" timeout="100s" \
>        op monitor interval="30s" timeout="30s" start-delay="5s"
> primitive dlm ocf:pacemaker:controld \
>        op monitor interval="120s" \
>        op start interval="0" timeout="90s" \
>        op stop interval="0" timeout="100s"
> primitive gfs-control ocf:pacemaker:controld \
>        params daemon="gfs_controld.pcmk" args="-g 0" \
>        op monitor interval="120s" \
>        op start interval="0" timeout="90s" \
>        op stop interval="0" timeout="100s"
> primitive orque-fs ocf:heartbeat:Filesystem \
>        params device="/dev/drbd/by-res/orque-raid" directory="/data" fstype="gfs2" \
>        op start interval="0" timeout="60s" \
>        op stop interval="0" timeout="60s"
> primitive kvm-adonga ocf:heartbeat:VirtualDomain \
>        params config="/etc/libvirt/qemu/adonga.xml" hypervisor="qemu:///system" migration_transport="ssh" \
>        meta allow-migrate="true" target-role="Started" is-managed="true" \
>        op start interval="0" timeout="200s" \
>        op stop interval="0" timeout="200s" \
>        op monitor interval="10" timeout="200s" on-fail="restart" depth="0"
> primitive kvm-observatoire-test ocf:heartbeat:VirtualDomain \
>        params config="/etc/libvirt/qemu/observatoire-test.xml" hypervisor="qemu:///system" migration_transport="ssh" \
>        meta allow-migrate="true" target-role="Started" is-managed="true" \
>        op start interval="0" timeout="200s" \
>        op stop interval="0" timeout="200s" \
>        op monitor interval="10" timeout="200s" on-fail="restart" depth="0"
> primitive kvm-testVM ocf:heartbeat:VirtualDomain \
>        params config="/etc/libvirt/qemu/testVM.xml" hypervisor="qemu:///system" migration_transport="ssh" \
>        meta allow-migrate="true" target-role="Stopped" is-managed="true" \
>        op start interval="0" timeout="200s" \
>        op stop interval="0" timeout="200s" \
>        op monitor interval="10" timeout="200s" on-fail="restart" depth="0"
> primitive orque-fencing stonith:meatware \
>        params hostlist="orque" \
>        meta is-managed="true"
> primitive orque2-fencing stonith:meatware \
>        params hostlist="orque2" \
>        meta is-managed="true" target-role="Started"
> ms drbd-data-clone drbd-data \
>        meta master-max="2" master-node-max="1" clone-max="2" clone-node-max="1" notify="true"
> clone dlm-clone dlm \
>        meta interleave="true" target-role="Started"
> clone gfs-clone gfs-control \
>        meta interleave="true" target-role="Started"
> clone orque-fs-clone orque-fs \
>        meta is-managed="true" target-role="Started" interleave="true" ordered="true"
> location kvm-testVM-prefers-orque kvm-testVM 50: orque
> location loc-orque-fencing orque-fencing -inf: orque
> location loc-orque2-fencing orque2-fencing -inf: orque2
> colocation gfs-with-dlm inf: gfs-clone dlm-clone
> colocation kvm-adonga-with-orque-fs inf: kvm-adonga orque-fs-clone
> colocation kvm-observatoire-test-with-orque-fs inf: kvm-observatoire-test orque-fs-clone
> colocation kvm-testVM-with-orque-fs inf: kvm-testVM orque-fs-clone
> colocation orque-fs-with-gfs-control inf: orque-fs-clone gfs-clone
> order gfs-after-dlm inf: dlm-clone gfs-clone
> order kvm-adonga-after-orque-fs inf: orque-fs-clone kvm-adonga
> order kvm-observatoire-test-after-orque-fs inf: orque-fs-clone kvm-observatoire-test
> order kvm-testVM-after-orque-fs inf: orque-fs-clone kvm-testVM
> order orque-fs-after-drbd-data inf: drbd-data-clone:promote orque-fs-clone:start
> order orque-fs-after-gfs-control inf: gfs-clone orque-fs-clone
> property $id="cib-bootstrap-options" \
>        dc-version="1.0.9-unknown" \
>        cluster-infrastructure="openais" \
>        expected-quorum-votes="2" \
>        stonith-enabled="true" \
>        no-quorum-policy="ignore" \
>        last-lrm-refresh="1299772235"
> rsc_defaults $id="rsc-options" \
>        resource-stickiness="100"
>
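
One more thing I noticed in the config (unless it is hidden by the line
wrapping): you have an order constraint from the DRBD promote to the
filesystem start, but no colocation tying orque-fs-clone to the Master
role of drbd-data-clone. Without it, Pacemaker may try to start the
filesystem on a node where DRBD is not (yet) Primary. A sketch of what I
would add, using your resource names:

    colocation orque-fs-with-drbd-master inf: orque-fs-clone drbd-data-clone:Master

It may not be the cause of this particular hang, but it is worth having.
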
_______________________________________________
Linux-HA mailing list
Linux-HA@lists.linux-ha.org
http://lists.linux-ha.org/mailman/listinfo/linux-ha
See also: http://linux-ha.org/ReportingProblems
