Hi Kazutomo-san, On Wed, Feb 10, 2010 at 11:11:42AM +0900, nakah...@intellilink.co.jp wrote: > Hi, Dejan > > Thank you for your comment for my patch. > And sorry for not getting back to you any earlier. > > I agree to all your points. > Could you commit patches to the agents repository?
It turns out that the patches need quite a bit of work and I'm afraid I can't afford to spend the time right now. Can you please do that yourself: fix spelling mistakes, don't log unnecessarily in the monitor operations, and the status operation should output to stdout and not log. Thanks, Dejan > Best Regards, > NAKAHIRA Kazutomo > > Quoting Dejan Muhamedagic <deja...@fastmail.fm>: > > > Hi Kazutomo-san, > > > > On Fri, Jan 22, 2010 at 06:31:15PM +0900, NAKAHIRA Kazutomo wrote: > >> Hi, all > >> > >> I improved logging of the oracle/oralsnr RA. > >> This patch has aimed to record the execution result of the sqlplus > >> (and other commands) and output a detailed failure log using ocf_log > >> when some problems occurred in RA operation. > > > > Thanks for the patch. There are a few spelling problems, but I'll > > fix those. Also, the status operation must output to stdout, > > that's not meant for the logs. > > > > Cheers, > > > > Dejan > > > >> Best Regards, > >> NAKAHIRA Kazutomo > >> > >> -- > >> ---------------------------------------- > >> NAKAHIRA Kazutomo > >> NTT DATA INTELLILINK CORPORATION > >> Open Source Business Unit > >> Software Services Integration Business Division > > > >> # HG changeset patch > >> # User r...@prec370b > >> # Date 1264145021 -32400 > >> # Node ID 0ecef9560522601936888ce168dae5f563662402 > >> # Parent 3024963150433960c51aa1bdccde39839efb09b7 > >> oracle: improve logging > >> > >> diff -r 302496315043 -r 0ecef9560522 heartbeat/oracle > >> --- a/heartbeat/oracle Thu Jan 21 16:42:40 2010 +0100 > >> +++ b/heartbeat/oracle Fri Jan 22 16:23:41 2010 +0900 > >> @@ -231,11 +231,23 @@ ora_info() { > >> > >> testoraenv() { > >> # Let's make sure a few important things are set... > >> - [ x != "x$ORACLE_HOME" -a x != "x$ORACLE_OWNER" ] || > >> + if [ x == "x$ORACLE_HOME" -o x == "x$ORACLE_OWNER" ]; then > >> + ocf_log err "Either of ORACLE_HOME or ORACLE_OWNER is NULL. > >> ORACLE_HOME=$ORACLE_HOME, ORACLE_OWNER=$ORACLE_OWNER." > >> return 1 > >> + fi > >> # and some important things are there > >> - [ -x "$sqlplus" -a -x "$lsnrctl" -a -x "$tnsping" ] || > >> + if [ ! -x "$sqlplus" ]; then > >> + ocf_log err "Executeble sqlplus command($sqlplus) dose not > >> exist." > >> return 1 > >> + fi > >> + if [ ! -x "$lsnrctl" ]; then > >> + ocf_log err "Executeble lsnrctl command($lsnrctl) dose not > >> exist." > >> + return 1 > >> + fi > >> + if [ ! -x "$tnsping" ]; then > >> + ocf_log err "Executeble tnsping command($tnsping) dose not > >> exist." > >> + return 1 > >> + fi > >> return 0 > >> } > >> > >> @@ -367,13 +379,20 @@ showdbstat() { > >> # Part 1: Oracle > >> dumpinstipc() { > >> local dumpdest=`dbasql getdumpdest` > >> - [ "x$dumpdest" != x -a -d "$dumpdest" ] || return 1 > >> + if [ "x$dumpdest" == x -o ! -d "$dumpdest" ]; then > >> + ocf_log warn "dumpdest($dumpdest) is not a regular directory." > >> + return 1 > >> + fi > >> local -i fcount=`ls -rt $dumpdest | wc -l` > >> - dbasql getipc >/dev/null 2>&1 > >> + output=`dbasql getipc` > >> local lastf=`ls -rt $dumpdest | grep -v '^\.*$' | tail -1` > >> local -i fcount2=`ls -rt $dumpdest | wc -l` > >> - [ $((fcount+1)) -eq $fcount2 ] || return 1 # more than one file created > >> - echo $dumpdest/$lastf > >> + if [ $((fcount+1)) -eq $fcount2 ]; then > >> + echo $dumpdest/$lastf > >> + else > >> + ocf_log warn "dumpinstipc failed bacause the number of output > >> files is wrong. before dump file count=$fcount, after dump file > >> count=$fcount2, getipc result=$output" > >> + return 1 > >> + fi > >> } > >> parseipc() { > >> local inf=$1 > >> @@ -440,7 +459,13 @@ is_oracle_up() { > >> } > >> # instance in OPEN state? > >> instance_live() { > >> - [ "`dbasql dbstat`" = OPEN ] > >> + output=`dbasql dbstat` > >> + if [ "$output" = OPEN ]; then > >> + return 0 > >> + else > >> + ocf_log info "Instance state is not OPEN. dbstat result=$output" > >> + return 1 > >> + fi > >> } > >> > >> ora_cleanup() { > >> @@ -498,6 +523,7 @@ oracle_start() { > >> # try to cleanup in case of > >> # ORA-01081: cannot start already-running ORACLE - shut it down > >> first > >> if echo "$output" | grep ORA-01081 >/dev/null 2>&1; then > >> + ocf_log info "ORA-01081 error was found. try to cleanup > >> oracle. > >> DB start output=$output" > >> ora_cleanup > >> output=`dbasql dbstart_mount` > >> fi > >> @@ -510,7 +536,7 @@ oracle_start() { > >> ;; > >> *) > >> : error!! > >> - ocf_log error "Oracle $ORACLE_SID can not mount." > >> + ocf_log error "Oracle $ORACLE_SID can not mount. DB > >> status=$status, DB start output=$output" > >> return $OCF_ERR_GENERIC > >> ;; > >> esac > >> @@ -523,13 +549,16 @@ oracle_start() { > >> fi > >> output=`dbasql dbopen` > >> > >> - if is_oracle_up && instance_live; then > >> + if ! is_oracle_up; then > >> + ocf_log err "Oracle process is not started: $output" > >> + return $OCF_ERR_GENERIC > >> + elif ! instance_live; then > >> + ocf_log err "Oracle instance $ORACLE_SID not started: $output" > >> + return $OCF_ERR_GENERIC > >> + else > >> : cool, we are up and running > >> ocf_log info "Oracle instance $ORACLE_SID started: $output" > >> return $OCF_SUCCESS > >> - else > >> - ocf_log err "Oracle instance $ORACLE_SID not started: $output" > >> - return $OCF_ERR_GENERIC > >> fi > >> } > >> > >> @@ -562,13 +591,21 @@ killprocs() { > >> killprocs() { > >> local sig=$1 > >> shift 1 > >> - kill -$sig $* >/dev/null 2>&1 > >> + # Record stderr > >> + kill -$sig $* >/dev/null > >> } > >> ora_kill() { > >> - killprocs TERM `eval $procs | awk '{print $1}'` > >> + oraprocs=`eval $procs | awk '{print $1}'` > >> + if [ -z "$oraprocs" ]; then > >> + ocf_log debug "All oracle processes are already stopped." > >> + return > >> + fi > >> + killprocs TERM $oraprocs > >> for i in 1 2 3 4 5; do > >> - killprocs 0 `eval $procs | awk '{print $1}'` || > >> + if [ -z "`eval $procs | awk '{print $1}'`" ]; then > >> + ocf_log debug "All oracle processes are killed." > >> return > >> + fi > >> sleep 5 > >> done > >> killprocs KILL `eval $procs | awk '{print $1}'` > >> @@ -578,14 +615,16 @@ ora_kill() { > >> # oracle_monitor: Can the Oracle instance do anything useful? > >> # > >> oracle_monitor() { > >> - if is_oracle_up && instance_live > >> - then > >> - #ocf_log info "Oracle instance $ORACLE_SID is alive" > >> - return $OCF_SUCCESS > >> - else > >> + if ! is_oracle_up; then > >> + ocf_log info "Oracle process is down" > >> + return $OCF_NOT_RUNNING > >> + fi > >> + if ! instance_live; then > >> ocf_log info "Oracle instance $ORACLE_SID is down" > >> return $OCF_NOT_RUNNING > >> fi > >> + #ocf_log info "Oracle instance $ORACLE_SID is alive" > >> + return $OCF_SUCCESS > >> } > >> > >> # > >> @@ -675,10 +714,10 @@ case "$1" in > >> > >> status) if is_oracle_up > >> then > >> - echo Oracle instance $ORACLE_SID is running > >> + ocf_log info "Oracle instance $ORACLE_SID is running" > >> exit $OCF_SUCCESS > >> else > >> - echo Oracle instance $ORACLE_SID is stopped > >> + ocf_log info "Oracle instance $ORACLE_SID is stopped" > >> exit $OCF_NOT_RUNNING > >> fi > >> ;; > > > >> # HG changeset patch > >> # User r...@prec370b > >> # Date 1264145083 -32400 > >> # Node ID b36abb1554500f4bdf33858989205d02606c609d > >> # Parent 0ecef9560522601936888ce168dae5f563662402 > >> oralsnr: improve logging > >> > >> diff -r 0ecef9560522 -r b36abb155450 heartbeat/oralsnr > >> --- a/heartbeat/oralsnr Fri Jan 22 16:23:41 2010 +0900 > >> +++ b/heartbeat/oralsnr Fri Jan 22 16:24:43 2010 +0900 > >> @@ -158,11 +158,23 @@ ora_info() { > >> > >> testoraenv() { > >> # Let's make sure a few important things are set... > >> - [ x != "x$ORACLE_HOME" -a x != "x$ORACLE_OWNER" ] || > >> + if [ x == "x$ORACLE_HOME" -o x == "x$ORACLE_OWNER" ]; then > >> + ocf_log err "Either of ORACLE_HOME or ORACLE_OWNER is NULL. > >> ORACLE_HOME=$ORACLE_HOME, ORACLE_OWNER=$ORACLE_OWNER." > >> return 1 > >> + fi > >> # and some important things are there > >> - [ -x "$sqlplus" -a -x "$lsnrctl" -a -x "$tnsping" ] || > >> + if [ ! -x "$sqlplus" ]; then > >> + ocf_log err "Executeble sqlplus command($sqlplus) dose not > >> exist." > >> return 1 > >> + fi > >> + if [ ! -x "$lsnrctl" ]; then > >> + ocf_log err "Executeble lsnrctl command($lsnrctl) dose not > >> exist." > >> + return 1 > >> + fi > >> + if [ ! -x "$tnsping" ]; then > >> + ocf_log err "Executeble tnsping command($tnsping) dose not > >> exist." > >> + return 1 > >> + fi > >> return 0 > >> } > >> > >> @@ -247,10 +259,17 @@ oralsnr_stop() { > >> # kill the listener procs > >> # give them 10 secs to exit cleanly (5 times 2) > >> oralsnr_kill() { > >> - killprocs TERM `eval $procs | awk '{print $1}'` > >> + oraprocs=`eval $procs | awk '{print $1}'` > >> + if [ -z "$oraprocs" ]; then > >> + ocf_log debug "All oralsnr processes are already stopped." > >> + return > >> + fi > >> + killprocs TERM $oraprocs > >> for i in 1 2 3 4 5; do > >> - killprocs 0 `eval $procs | awk '{print $1}'` || > >> + if [ -z "`eval $procs | awk '{print $1}'`" ]; then > >> + ocf_log debug "All oralsnr processes are killed." > >> return > >> + fi > >> sleep 2 > >> done > >> killprocs KILL `eval $procs | awk '{print $1}'` > >> @@ -258,7 +277,8 @@ killprocs() { > >> killprocs() { > >> sig=$1 > >> shift 1 > >> - kill -$sig $* >/dev/null 2>&1 > >> + # Record stderr > >> + kill -$sig $* >/dev/null > >> } > >> > >> # > >> @@ -269,11 +289,23 @@ is_oralsnr_up() { > >> [ x != "x`eval $procs`" ] > >> } > >> oralsnr_status() { > >> - $lsnrctl status $listener | tail -1 | grep -qs 'completed successfully' > >> + output=`$lsnrctl status $listener` > >> + echo "$output" | tail -1 | grep -qs 'completed successfully' > >> + RET=$? > >> + if [ $RET -ne 0 ]; then > >> + ocf_log info "$listener status failed: $output" > >> + fi > >> + return $RET > >> } > >> # and does it work? > >> tnsping() { > >> - $tnsping $ORACLE_SID | tail -1 | grep -qs '^OK' > >> + output=`$tnsping $ORACLE_SID` > >> + echo "$output" | tail -1 | grep -qs '^OK' > >> + RET=$? > >> + if [ $RET -ne 0 ]; then > >> + ocf_log info "$tnsping $ORACLE_SID failed: $output" > >> + fi > >> + return $RET > >> } > >> > >> # > >> @@ -285,10 +317,10 @@ oralsnr_monitor() { > >> : good > >> #ocf_log info "Listener $listener running" > >> return $OCF_SUCCESS > >> - else > >> + else > >> ocf_log info "Listener $listener not running" > >> return $OCF_NOT_RUNNING > >> - fi > >> + fi > >> } > >> > >> # > >> @@ -368,10 +400,10 @@ case "$1" in > >> > >> status) if oralsnr_status > >> then > >> - echo Listener $listener is running > >> + ocf_log info "Listener $listener is running" > >> exit $OCF_SUCCESS > >> else > >> - echo Listener $listener is stopped > >> + ocf_log info "Listener $listener is stopped" > >> exit $OCF_NOT_RUNNING > >> fi > >> ;; > > > >> _______________________________________________________ > >> Linux-HA-Dev: Linux-HA-Dev@lists.linux-ha.org > >> http://lists.linux-ha.org/mailman/listinfo/linux-ha-dev > >> Home Page: http://linux-ha.org/ > > > > _______________________________________________________ > > Linux-HA-Dev: Linux-HA-Dev@lists.linux-ha.org > > http://lists.linux-ha.org/mailman/listinfo/linux-ha-dev > > Home Page: http://linux-ha.org/ > > > > > > > > _______________________________________________________ > Linux-HA-Dev: Linux-HA-Dev@lists.linux-ha.org > http://lists.linux-ha.org/mailman/listinfo/linux-ha-dev > Home Page: http://linux-ha.org/ _______________________________________________________ Linux-HA-Dev: Linux-HA-Dev@lists.linux-ha.org http://lists.linux-ha.org/mailman/listinfo/linux-ha-dev Home Page: http://linux-ha.org/