On 07/16/2012 01:34 PM, Phil Frost wrote:
I've been doing some study of the iscsi RA since my first post, and it
seems to me now that the "failure" in the monitor action isn't
actually in the monitor action at all. Rather, it appears that for
*all* actions, the RA does a "discovery" step, and that's what is
failing. I'm not really sure what this is, or why I need it. Is it
simply to find an unspecified portal for a given IQN? Is it therefore
useless in my case, since I've explicitly specified the portal in the
resource parameters?
If I were to disable the "discovery" step, what are people's thoughts
on the case where the target is operational, but the initiator for
some reason (e.g. a network failure) can't reach it? In this case, assume
Pacemaker knows the target is up; is there a way to encourage it to
attempt migrating the initiator to another node?
Well, after reading through the iscsi RA a dozen times, I could not
formulate any reasonable idea of why the discovery step might be
necessary. The portal parameter is required, so it couldn't be to locate
the portal. And, there is logic in the discovery function to handle the
case when a target returns multiple portals for the same target -- by
finding the one that was specified in the portal parameter. So it can't
really be discovering anything. It does raise an error in this case if
the portal parameter isn't specified, but then the portal parameter
isn't optional, so that case could never occur. It smelled like rotten
code to me.
So, given all that, and given how it introduces a nasty race condition
in the case that the target isn't running (or is just in the process of
migrating to another node), I decided it was better to just get rid of
it. Patch attached. I suppose I've introduced a different failure mode, in
that an initiator that can't contact a running target won't be migrated,
but I'd rather have one of my VMs trying to run, unsuccessfully, and
able to automatically recover when the fault is cleared, than have an
entire VM host shot in the head on the basis of a race condition in
non-failure situations.
I did observe one minor nastiness with my patch: if the portal isn't
specified exactly as udev will format it, then the RA will wait forever
for the device node to appear, because it is expecting the wrong device
filename. Maybe canonicalizing the portal was one useful side effect of
the discovery step, but in my opinion it is not worth the other problems.
--- heartbeat/iscsi 2012-07-16 13:10:14.000000000 -0400
+++ macpros/iscsi 2012-07-16 14:50:57.000000000 -0400
@@ -31,7 +31,6 @@
# OCF_RESKEY_portal: the iSCSI portal address or host name (required)
# OCF_RESKEY_target: the iSCSI target (required)
# OCF_RESKEY_iscsiadm: iscsiadm program path (optional)
-# OCF_RESKEY_discovery_type: discovery type (optional; default: sendtargets)
#
# Initialization:
@@ -41,11 +40,9 @@
# Defaults
OCF_RESKEY_udev_default="yes"
OCF_RESKEY_iscsiadm_default="iscsiadm"
-OCF_RESKEY_discovery_type_default="sendtargets"
: ${OCF_RESKEY_udev=${OCF_RESKEY_udev_default}}
: ${OCF_RESKEY_iscsiadm=${OCF_RESKEY_iscsiadm_default}}
-: ${OCF_RESKEY_discovery_type=${OCF_RESKEY_discovery_type_default}}
usage() {
methods=`iscsi_methods`
@@ -96,15 +93,6 @@
<content type="string" />
</parameter>
-<parameter name="discovery_type" unique="0" required="0">
-<longdesc lang="en">
-Target discovery type. Check the open-iscsi documentation for
-supported discovery types.
-</longdesc>
-<shortdesc lang="en">Target discovery type</shortdesc>
-<content type="string" default="${OCF_RESKEY_discovery_type_default}" />
-</parameter>
-
<parameter name="iscsiadm" unique="0" required="0">
<longdesc lang="en">
open-iscsi administration utility binary.
@@ -128,8 +116,8 @@
</parameters>
<actions>
-<action name="start" timeout="120" />
-<action name="stop" timeout="120" />
+<action name="start" timeout="60" />
+<action name="stop" timeout="60" />
<action name="status" timeout="30" />
<action name="monitor" depth="0" timeout="30" interval="120" />
<action name="validate-all" timeout="5" />
@@ -166,7 +154,6 @@
fi
}
open_iscsi_setup() {
- discovery=open_iscsi_discovery
add_disk=open_iscsi_add
remove_disk=open_iscsi_remove
disk_status=open_iscsi_status
@@ -179,72 +166,6 @@
return $OCF_ERR_INSTALLED
}
-#
-# discovery return codes:
-# 0: ok (variable portal set)
-# 1: target not found
-# 2: target found but can't connect it unambigously
-# 3: iscsiadm returned error
-#
-# open-iscsi >= "2.0-872" changed discovery semantics
-# see http://www.mail-archive.com/open-iscsi@googlegroups.com/msg04883.html
-# there's a new discoverydb command which should be used instead discovery
-
-open_iscsi_discovery() {
- local output
- local severity=err
- local discovery_variant="discovery"
- local options=""
- local cmd
- local version=`$iscsiadm --version | awk '{print $3}'`
-
- ocf_version_cmp "$version" "2.0-871"
- if [ $? -eq 2 ]; then # newer than 2.0-871?
- discovery_variant="discoverydb"
- [ "$discovery_type" = "sendtargets" ] &&
- options="-D"
- fi
- cmd="$iscsiadm -m $discovery_variant -p $OCF_RESKEY_portal -t $discovery_type $options"
- ocf_is_probe && severity=info
- output=`$cmd`
- if [ $? -ne 0 -o x = "x$output" ]; then
- [ x != "x$output" ] && {
- ocf_log $severity "$cmd FAILED"
- echo "$output"
- }
- return 3
- fi
- portal=`echo "$output" |
- awk -v target="$OCF_RESKEY_target" '
- $NF==target{
- if( NF==3 ) portal=$2; # sles compat mode
- else portal=$1;
- sub(",.*","",portal);
- print portal;
- }'`
-
- case `echo "$portal" | wc -w` in
- 0) #target not found
- echo "$output"
- ocf_log $severity "target $OCF_RESKEY_target not found at portal $OCF_RESKEY_portal"
- return 1
- ;;
- 1) #we're ok
- return 0
- ;;
- *) # handle multihome hosts reporting multiple portals
- for p in $portal; do
- if [ "$OCF_RESKEY_portal" = "$p" ]; then
- portal="$OCF_RESKEY_portal"
- return 0
- fi
- done
- echo "$output"
- ocf_log err "sorry, can't handle multihomed hosts unless you specify the portal exactly"
- return 2
- ;;
- esac
-}
open_iscsi_add() {
$iscsiadm -m node -p $1 -T $2 -l
}
@@ -259,7 +180,7 @@
# NB: this is udev specific!
#
wait_for_udev() {
- dev=/dev/disk/by-path/ip-$portal-iscsi-$OCF_RESKEY_target
+ dev=/dev/disk/by-path/ip-${OCF_RESKEY_portal}-iscsi-$OCF_RESKEY_target
while :; do
ls $dev* >/dev/null 2>&1 && break
ocf_log warning "waiting for udev to create $dev"
@@ -267,7 +188,7 @@
done
}
iscsi_status() {
- if $disk_status $portal $OCF_RESKEY_target; then
+ if $disk_status ${OCF_RESKEY_portal} $OCF_RESKEY_target; then
return $OCF_SUCCESS
else
return $OCF_NOT_RUNNING
@@ -275,10 +196,10 @@
}
iscsi_start() {
if iscsi_status; then
- ocf_log info "iscsi $portal $OCF_RESKEY_target already running"
+ ocf_log info "iscsi ${OCF_RESKEY_portal} $OCF_RESKEY_target already running"
return $OCF_SUCCESS
else
- $add_disk $portal $OCF_RESKEY_target ||
+ $add_disk ${OCF_RESKEY_portal} $OCF_RESKEY_target ||
return $OCF_ERR_GENERIC
case "$udev" in
[Yy]es) wait_for_udev ||
@@ -295,7 +216,7 @@
}
iscsi_stop() {
if iscsi_status; then
- $remove_disk $portal $OCF_RESKEY_target ||
+ $remove_disk ${OCF_RESKEY_portal} $OCF_RESKEY_target ||
return $OCF_ERR_GENERIC
if iscsi_status; then
return $OCF_ERR_GENERIC
@@ -303,13 +224,13 @@
return $OCF_SUCCESS
fi
else
- ocf_log info "iscsi $portal $OCF_RESKEY_target already stopped"
+ ocf_log info "iscsi ${OCF_RESKEY_portal} $OCF_RESKEY_target already stopped"
return $OCF_SUCCESS
fi
}
iscsi_monitor() {
- if $disk_status $portal $OCF_RESKEY_target; then
+ if $disk_status ${OCF_RESKEY_portal} $OCF_RESKEY_target; then
return $OCF_SUCCESS
else
return $OCF_NOT_RUNNING
@@ -371,9 +292,7 @@
exit $OCF_ERR_PERM
fi
-discovery_type=${OCF_RESKEY_discovery_type}
udev=${OCF_RESKEY_udev}
-$discovery # discover and setup the real portal string (address)
case $? in
0) ;;
1) [ "$1" = stop ] && exit $OCF_SUCCESS
_______________________________________________
Pacemaker mailing list: Pacemaker@oss.clusterlabs.org
http://oss.clusterlabs.org/mailman/listinfo/pacemaker
Project Home: http://www.clusterlabs.org
Getting started: http://www.clusterlabs.org/doc/Cluster_from_Scratch.pdf
Bugs: http://bugs.clusterlabs.org