Jcrespo has submitted this change and it was merged. Change subject: Use heartbeat when possible to check slave lag ......................................................................
Use heartbeat when possible to check slave lag Change current check_mariadb.pl script to try to use heartbeat first, then fall back to the current system. This will allow more accurate measurements, regardless of whether the system has replication running or not. FIXME: for full backwards compatibility, this uses the immediate master as the original master, something that will have to be fixed afterwards by indicating which host is the real shard master (ready in the script, but has to be added to puppet in a subsequent commit). Bug: T112473 T114752 Change-Id: I2d229173a0fdf5d91e6c95b4a6cb1335fa56f317 --- M files/icinga/check_mariadb.pl 1 file changed, 51 insertions(+), 21 deletions(-) Approvals: Jcrespo: Verified; Looks good to me, approved diff --git a/files/icinga/check_mariadb.pl b/files/icinga/check_mariadb.pl index af0f0d5..84b0612 100755 --- a/files/icinga/check_mariadb.pl +++ b/files/icinga/check_mariadb.pl @@ -20,12 +20,15 @@ my $user = ""; my $pass = ""; my $sock = ""; +my $master_server_id = ""; my $sql_lag_warn = 30; my $sql_lag_crit = 60; # Warn when IO or SQL stopped cleanly (no errno) my $warn_stopped = 0; + +my $heartbeat_table = 'heartbeat.heartbeat'; my @vars = (); @@ -70,6 +73,10 @@ elsif ($arg =~ /^--no-warn-stopped$/) { $warn_stopped = 0; + } + elsif ($arg =~ /^--master-server-id=(.+)$/) + { + $master_server_id = $1; } elsif ($arg =~ /^--set=(.+)$/) { @@ -162,8 +169,18 @@ if ($check eq "slave_sql_lag") { - # TODO: Make this check heartbeat - +# The slave lag is checked using the $heartbeat_table table, +# usually created and updated by running pt-heartbeat on the +# master. +# For that, --master-server-id is strongly suggested to be +# set. In case it is not, the lag from its direct master is +# reported. If the heartbeat table does not exist, the record +# for the master is not found or any other errors happens, +# it failbacks to using Seconds_Behind_Master. +# If the server is not a slave, it returns OK. 
If lag cannot +# be determined neither by using heartbeat nor seconds behind +# master, it returns unknown, unless the replication is +# stopped manually- reporting optionally a warning. my $status = $db->selectrow_hashref("show slave status"); unless ($status) { @@ -171,36 +188,49 @@ exit($EOK); } - # Either IO or SQL threads stopped? WARN - if ($status->{Slave_IO_Running} ne "Yes" || $status->{Slave_SQL_Running} ne "Yes") { - if ($warn_stopped == 1) { - printf("%s %s Slave_IO_Running: %s, Slave_SQL_Running: %s\n", - $WARN, $check, $status->{Slave_IO_Running}, $status->{Slave_SQL_Running}); - exit($EWARN); - } - printf("%s %s Slave_IO_Running: %s, Slave_SQL_Running: %s, (no error; intentional)\n", - $OK, $check, $status->{Slave_IO_Running}, $status->{Slave_SQL_Running}); - exit($EOK); + if ($master_server_id eq "") { + $master_server_id = $status->{Master_Server_Id}; } + my $heartbeat = $db->selectrow_hashref("SELECT TIMESTAMPDIFF(MICROSECOND,ts,UTC_TIMESTAMP(6)) AS lag FROM heartbeat.heartbeat WHERE server_id = $master_server_id"); + my $lag = $heartbeat->{lag}?$heartbeat->{lag}/1000000:$status->{Seconds_Behind_Master}; + + if ($lag eq "NULL") { + # Either IO or SQL threads stopped? WARN + if ($status->{Slave_IO_Running} ne "Yes" || $status->{Slave_SQL_Running} ne "Yes") { + if ($warn_stopped == 1) { + printf("%s %s Slave_IO_Running: %s, Slave_SQL_Running: %s\n", + $WARN, $check, $status->{Slave_IO_Running}, $status->{Slave_SQL_Running}); + exit($EWARN); + } + printf("%s %s Slave_IO_Running: %s, Slave_SQL_Running: %s, (no error; intentional)\n", + $OK, $check, $status->{Slave_IO_Running}, $status->{Slave_SQL_Running}); + exit($EOK); + } + # lag could not be determined + printf("%s %s lag could not be determined\n", $UNKN, $check); + exit($EUNKN); + + } # Small lag? 
OK - if ($status->{Seconds_Behind_Master} < $sql_lag_warn) { - printf("%s %s Seconds_Behind_Master: %s\n", - $OK, $check, $status->{Seconds_Behind_Master}); + if ($lag < $sql_lag_warn) { + printf("%s %s Replication lag: %s seconds\n", + $OK, $check, $lag); exit($EOK); } # Medium lag? WARN - if ($status->{Seconds_Behind_Master} < $sql_lag_crit) { - printf("%s %s Seconds_Behind_Master: %s\n", - $WARN, $check, $status->{Seconds_Behind_Master}); + if ($lag < $sql_lag_crit) { + printf("%s %s Replication lag: %s seconds\n", + $WARN, $check, $lag); exit($EWARN); } - printf("%s %s Seconds_Behind_Master: %s\n", - $CRIT, $check, $status->{Seconds_Behind_Master}); + printf("%s %s Replication lag: %s seconds\n", + $CRIT, $check, $lag); exit($ECRIT); } printf("%s %s invalid check: %s\n", $UNKN, $check, $check); -exit($EUNKN); \ No newline at end of file +exit($EUNKN); + -- To view, visit https://gerrit.wikimedia.org/r/253665 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I2d229173a0fdf5d91e6c95b4a6cb1335fa56f317 Gerrit-PatchSet: 5 Gerrit-Project: operations/puppet Gerrit-Branch: production Gerrit-Owner: Jcrespo <jcre...@wikimedia.org> Gerrit-Reviewer: Jcrespo <jcre...@wikimedia.org> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits