[ClusterLabs] Pacemaker not reacting as I would expect when two resources fail at the same time

Harvey Shepherd Thu, 30 May 2019 16:40:53 -0700

Hi All,


I'm running Pacemaker 2.0.1 on a cluster containing two nodes; one master and 
one slave. I have a main master/slave resource (m_main_system), a group of 
resources that run in active-active mode (active_active - i.e. run on both 
nodes), and a group that runs in active-disabled mode (snmp_active_disabled - 
resources only run on the current promoted master). The snmp_active_disabled 
group is configured to be co-located with the master of m_main_system, so only 
a failure of the master m_main_system resource can trigger a failover. The 
constraints specify that m_main_system must be started before 
snmp_active_disabled.


The problem I'm having is this: when a resource in the snmp_active_disabled 
group fails and gets stuck in a cycle where Pacemaker keeps trying to restart 
it, and I then kill m_main_system on the master, Pacemaker continues to retry 
the failed snmp_active_disabled resource and ignores the failure of the more 
important m_main_system resource, which should be triggering a failover. Once I 
stabilise the snmp_active_disabled resource, Pacemaker finally acts on the 
m_main_system failure. I hope I've described this well enough, but I've 
included a cut-down version of my CIB config below in case it helps!


Is this a bug or an error in my config? Perhaps the order in which the groups 
are defined in the CIB matters despite the constraints? Any help would be 
gratefully received.


Thanks,

Harvey


<configuration>
  <!-- Cluster-wide options. -->
  <crm_config>
    <cluster_property_set id="cib-bootstrap-options">
      <nvpair name="stonith-enabled" value="false" id="cib-bootstrap-options-stonith-enabled"/>
      <nvpair name="no-quorum-policy" value="ignore" id="cib-bootstrap-options-no-quorum-policy"/>
      <nvpair name="have-watchdog" value="false" id="cib-bootstrap-options-have-watchdog"/>
      <nvpair name="cluster-name" value="lbcluster" id="cib-bootstrap-options-cluster-name"/>
      <nvpair name="start-failure-is-fatal"
              value="false"
              id="cib-bootstrap-options-start-failure-is-fatal"/>
      <nvpair name="cluster-recheck-interval"
              value="0s"
              id="cib-bootstrap-options-cluster-recheck-interval"/>
    </cluster_property_set>
  </crm_config>

  <!-- The two cluster members. -->
  <nodes>
    <node id="1" uname="primary"/>
    <node id="2" uname="secondary"/>
  </nodes>

  <resources>
    <!-- SNMP group; colocated with the Master role of m_main_system by the
         rsc_colocation constraint further down. -->
    <group id="snmp_active_disabled">
      <primitive id="snmpd" class="lsb" type="snmpd">
        <operations>
          <op name="monitor" interval="10s" id="snmpd-monitor-10s"/>
          <op name="start" interval="0" timeout="30s" id="snmpd-start-30s"/>
          <op name="stop" interval="0" timeout="30s" id="snmpd-stop-30s"/>
        </operations>
      </primitive>
      <primitive id="snmp-auxiliaries" class="lsb" type="snmp-auxiliaries">
        <operations>
          <op name="monitor" interval="10s" id="snmp-auxiliaries-monitor-10s"/>
          <op name="start" interval="0" timeout="30s" id="snmp-auxiliaries-start-30s"/>
          <op name="stop" interval="0" timeout="30s" id="snmp-auxiliaries-stop-30s"/>
        </operations>
      </primitive>
    </group>

    <!-- Cloned group of LSB services (globally-unique=false). -->
    <clone id="clone_active_active">
      <meta_attributes id="clone_active_active_meta_attributes">
        <nvpair id="group-unique" name="globally-unique" value="false"/>
      </meta_attributes>
      <group id="active_active">
        <primitive id="logd" class="lsb" type="logd">
          <operations>
            <op name="monitor" interval="10s" id="logd-monitor-10s"/>
            <op name="start" interval="0" timeout="30s" id="logd-start-30s"/>
            <op name="stop" interval="0" timeout="30s" id="logd-stop-30s"/>
          </operations>
        </primitive>
        <primitive id="serviced" class="lsb" type="serviced">
          <operations>
            <op name="monitor" interval="10s" id="serviced-monitor-10s"/>
            <op name="start" interval="0" timeout="30s" id="serviced-start-30s"/>
            <op name="stop" interval="0" timeout="30s" id="serviced-stop-30s"/>
          </operations>
        </primitive>
      </group>
    </clone>

    <!-- Main master/slave resource: two instances, at most one promoted. -->
    <master id="m_main_system">
      <meta_attributes id="m_main_system-meta_attributes">
        <nvpair name="notify" value="true" id="m_main_system-meta_attributes-notify"/>
        <nvpair name="clone-max" value="2" id="m_main_system-meta_attributes-clone-max"/>
        <nvpair name="promoted-max" value="1" id="m_main_system-meta_attributes-promoted-max"/>
        <nvpair name="promoted-node-max"
                value="1"
                id="m_main_system-meta_attributes-promoted-node-max"/>
      </meta_attributes>
      <primitive id="main_system" class="ocf" provider="acme" type="main-system-ocf">
        <operations>
          <op name="start" interval="0" timeout="120s" id="main_system-start-0"/>
          <op name="stop" interval="0" timeout="120s" id="main_system-stop-0"/>
          <op name="promote" interval="0" timeout="120s" id="main_system-promote-0"/>
          <op name="demote" interval="0" timeout="120s" id="main_system-demote-0"/>
          <!-- Separate monitor operations per role, with different intervals. -->
          <op name="monitor"
              interval="10s"
              timeout="10s"
              role="Master"
              id="main_system-monitor-10s"/>
          <op name="monitor"
              interval="11s"
              timeout="10s"
              role="Slave"
              id="main_system-monitor-11s"/>
          <op name="notify" interval="0" timeout="60s" id="main_system-notify-0"/>
        </operations>
      </primitive>
    </master>
  </resources>

  <constraints>
    <!-- Keep the SNMP group with the Master instance of m_main_system. -->
    <rsc_colocation id="master_only_snmp_rscs_with_main_system"
                    score="INFINITY"
                    rsc="snmp_active_disabled"
                    with-rsc="m_main_system"
                    with-rsc-role="Master"/>
    <!-- m_main_system is ordered before both dependent resources. -->
    <rsc_order id="snmp_active_disabled_after_main_system"
               kind="Mandatory"
               first="m_main_system"
               then="snmp_active_disabled"/>
    <rsc_order id="active_active_after_main_system"
               kind="Mandatory"
               first="m_main_system"
               then="clone_active_active"/>
  </constraints>

  <!-- Defaults applied to every resource unless overridden. -->
  <rsc_defaults>
    <meta_attributes id="rsc-options">
      <nvpair name="resource-stickiness" value="1" id="rsc-options-resource-stickiness"/>
      <nvpair name="migration-threshold" value="0" id="rsc-options-migration-threshold"/>
      <nvpair name="requires" value="nothing" id="rsc-options-requires"/>
    </meta_attributes>
  </rsc_defaults>
</configuration>

_______________________________________________
Manage your subscription:
https://lists.clusterlabs.org/mailman/listinfo/users

ClusterLabs home: https://www.clusterlabs.org/

[ClusterLabs] Pacemaker not reacting as I would expect when two resources fail at the same time

Reply via email to