On 07/16/2012 01:34 PM, Phil Frost wrote:
I've been doing some study of the iscsi RA since my first post, and it seems to me now that the "failure" in the monitor action isn't actually in the monitor action at all. Rather, it appears that for *all* actions, the RA does a "discovery" step, and that's what is failing. I'm not really sure what this is, or why I need it. Is it simply to find an unspecified portal for a given IQN? Is it therefore useless in my case, since I've explicitly specified the portal in the resource parameters?

If I were to disable the "discovery" step, what are people's thoughts on the case where the target is operational, but the initiator for some reason (network failure) can't reach it? In this case, assume Pacemaker knows the target is up; is there a way to encourage it to decide to attempt migrating the initiator to another node?

Well, after reading through the iscsi RA a dozen times, I could not formulate any reasonable idea of why the discovery step might be necessary. The portal parameter is required, so it couldn't be to locate the portal. And, there is logic in the discovery function to handle the case when a target returns multiple portals for the same target -- by finding the one that was specified in the portal parameter. So it can't really be discovering anything. It does raise an error in this case if the portal parameter isn't specified, but then the portal parameter isn't optional, so that case could never occur. It smelled like rotten code to me.

So, given all that, and given how it introduces a nasty race condition in the case that the target isn't running (or is just in the process of migrating to another node), I decided it was better to just get rid of it. Patch attached. I suppose I've introduced a different failure in that an initiator that can't contact a running target won't be migrated, but I'd rather have one of my VMs trying to run, unsuccessfully, and able to automatically recover when the fault is cleared, than have an entire VM host shot in the head on the basis of a race condition in non-failure situations.

One minor nastiness was observed with my patch: if the portal isn't specified exactly as udev will format it, then the RA will wait forever for the device node to appear, expecting the wrong device filename. Maybe canonicalizing the portal was one useful function of the discovery function, but in my opinion, not worth the other problems.
--- heartbeat/iscsi	2012-07-16 13:10:14.000000000 -0400
+++ macpros/iscsi	2012-07-16 14:50:57.000000000 -0400
@@ -31,7 +31,6 @@
 #	OCF_RESKEY_portal: the iSCSI portal address or host name (required)
 #	OCF_RESKEY_target: the iSCSI target (required)
 #	OCF_RESKEY_iscsiadm: iscsiadm program path (optional)
-#	OCF_RESKEY_discovery_type: discovery type (optional; default: sendtargets)
 #
 # Initialization:
 
@@ -41,11 +40,9 @@
 # Defaults
 OCF_RESKEY_udev_default="yes"
 OCF_RESKEY_iscsiadm_default="iscsiadm"
-OCF_RESKEY_discovery_type_default="sendtargets"
 
 : ${OCF_RESKEY_udev=${OCF_RESKEY_udev_default}}
 : ${OCF_RESKEY_iscsiadm=${OCF_RESKEY_iscsiadm_default}}
-: ${OCF_RESKEY_discovery_type=${OCF_RESKEY_discovery_type_default}}
 
 usage() {
   methods=`iscsi_methods`
@@ -96,15 +93,6 @@
 <content type="string" />
 </parameter>
 
-<parameter name="discovery_type" unique="0" required="0">
-<longdesc lang="en">
-Target discovery type. Check the open-iscsi documentation for
-supported discovery types.
-</longdesc>
-<shortdesc lang="en">Target discovery type</shortdesc>
-<content type="string" default="${OCF_RESKEY_discovery_type_default}" />
-</parameter>
-
 <parameter name="iscsiadm" unique="0" required="0">
 <longdesc lang="en">
 open-iscsi administration utility binary.
@@ -128,8 +116,8 @@
 </parameters>
 
 <actions>
-<action name="start" timeout="120" />
-<action name="stop" timeout="120" />
+<action name="start" timeout="60" />
+<action name="stop" timeout="60" />
 <action name="status" timeout="30" />
 <action name="monitor" depth="0" timeout="30" interval="120" />
 <action name="validate-all" timeout="5" />
@@ -166,7 +154,6 @@
 	fi
 }
 open_iscsi_setup() {
-	discovery=open_iscsi_discovery
 	add_disk=open_iscsi_add
 	remove_disk=open_iscsi_remove
 	disk_status=open_iscsi_status
@@ -179,72 +166,6 @@
 		return $OCF_ERR_INSTALLED
 }
 
-#
-# discovery return codes:
-#   0: ok (variable portal set)
-#   1: target not found
-#   2: target found but can't connect it unambigously
-#   3: iscsiadm returned error
-#
-# open-iscsi >= "2.0-872" changed discovery semantics
-# see http://www.mail-archive.com/open-iscsi@googlegroups.com/msg04883.html
-# there's a new discoverydb command which should be used instead discovery
- 
-open_iscsi_discovery() {
-	local output
-	local severity=err
-	local discovery_variant="discovery"
-	local options=""
-	local cmd
-	local version=`$iscsiadm --version | awk '{print $3}'`
-
-	ocf_version_cmp "$version" "2.0-871"
-	if [ $? -eq 2 ]; then # newer than 2.0-871?
-		discovery_variant="discoverydb"
-		[ "$discovery_type" = "sendtargets" ] &&
-			options="-D"
-	fi
-	cmd="$iscsiadm -m $discovery_variant -p $OCF_RESKEY_portal -t $discovery_type $options"
-	ocf_is_probe && severity=info
-	output=`$cmd`
-	if [ $? -ne 0 -o x = "x$output" ]; then
-		[ x != "x$output" ] && {
-			ocf_log $severity "$cmd FAILED"
-			echo "$output"
-		}
-		return 3
-	fi
-	portal=`echo "$output" |
-		awk -v target="$OCF_RESKEY_target" '
-		$NF==target{
-			if( NF==3 ) portal=$2; # sles compat mode
-			else portal=$1;
-			sub(",.*","",portal);
-			print portal;
-		}'`
-
-	case `echo "$portal" | wc -w` in
-	0) #target not found
-		echo "$output"
-		ocf_log $severity "target $OCF_RESKEY_target not found at portal $OCF_RESKEY_portal"
-		return 1
-	;;
-	1) #we're ok
-		return 0
-	;;
-	*) # handle multihome hosts reporting multiple portals
-		for p in $portal; do
-			if [ "$OCF_RESKEY_portal" = "$p" ]; then
-				portal="$OCF_RESKEY_portal"
-				return 0
-			fi
-		done
-		echo "$output"
-		ocf_log err "sorry, can't handle multihomed hosts unless you specify the portal exactly"
-		return 2
-	;;
-	esac
-}
 open_iscsi_add() {
 	$iscsiadm -m node -p $1 -T $2 -l
 }
@@ -259,7 +180,7 @@
 # NB: this is udev specific!
 #
 wait_for_udev() {
-	dev=/dev/disk/by-path/ip-$portal-iscsi-$OCF_RESKEY_target
+	dev=/dev/disk/by-path/ip-${OCF_RESKEY_portal}-iscsi-$OCF_RESKEY_target
 	while :; do
 		ls $dev* >/dev/null 2>&1 && break
 		ocf_log warning "waiting for udev to create $dev" 
@@ -267,7 +188,7 @@
 	done
 }
 iscsi_status() {
-	if $disk_status $portal $OCF_RESKEY_target; then
+	if $disk_status ${OCF_RESKEY_portal} $OCF_RESKEY_target; then
 		return $OCF_SUCCESS
 	else
 		return $OCF_NOT_RUNNING
@@ -275,10 +196,10 @@
 }
 iscsi_start() {
 	if iscsi_status; then
-		ocf_log info "iscsi $portal $OCF_RESKEY_target already running"
+		ocf_log info "iscsi ${OCF_RESKEY_portal} $OCF_RESKEY_target already running"
 		return $OCF_SUCCESS
 	else
-		$add_disk $portal $OCF_RESKEY_target ||
+		$add_disk ${OCF_RESKEY_portal} $OCF_RESKEY_target ||
 			return $OCF_ERR_GENERIC
 		case "$udev" in
 		[Yy]es) wait_for_udev ||
@@ -295,7 +216,7 @@
 }
 iscsi_stop() {
 	if iscsi_status; then
-		$remove_disk $portal $OCF_RESKEY_target ||
+		$remove_disk ${OCF_RESKEY_portal} $OCF_RESKEY_target ||
 			return $OCF_ERR_GENERIC
 		if iscsi_status; then
 			return $OCF_ERR_GENERIC
@@ -303,13 +224,13 @@
 			return $OCF_SUCCESS
 		fi
 	else
-		ocf_log info "iscsi $portal $OCF_RESKEY_target already stopped"
+		ocf_log info "iscsi ${OCF_RESKEY_portal} $OCF_RESKEY_target already stopped"
 		return $OCF_SUCCESS
 	fi
 }
 
 iscsi_monitor() {
-	if $disk_status $portal $OCF_RESKEY_target; then
+	if $disk_status ${OCF_RESKEY_portal} $OCF_RESKEY_target; then
 		return $OCF_SUCCESS
     else
 		return $OCF_NOT_RUNNING
@@ -371,9 +292,7 @@
 	exit $OCF_ERR_PERM
 fi
 
-discovery_type=${OCF_RESKEY_discovery_type}
 udev=${OCF_RESKEY_udev}
-$discovery  # discover and setup the real portal string (address)
 case $? in
 0) ;;
 1) [ "$1" = stop ] && exit $OCF_SUCCESS
_______________________________________________
Pacemaker mailing list: Pacemaker@oss.clusterlabs.org
http://oss.clusterlabs.org/mailman/listinfo/pacemaker

Project Home: http://www.clusterlabs.org
Getting started: http://www.clusterlabs.org/doc/Cluster_from_Scratch.pdf
Bugs: http://bugs.clusterlabs.org

Reply via email to