Jcrespo has submitted this change and it was merged.

Change subject: Use heartbeat when possible to check slave lag
......................................................................


Use heartbeat when possible to check slave lag

Change current check_mariadb.pl script to try to use heartbeat
first, then fall back to the current system. This will allow more
accurate measurements, regardless of whether the system has
replication running or not.

FIXME: for full backwards compatibility, this uses the immediate
master as the original master, something that will have to be fixed
afterwards by indicating which host is the real shard master (this
is already supported in the script, but has to be added in puppet
in a subsequent commit).

Bug: T112473 T114752
Change-Id: I2d229173a0fdf5d91e6c95b4a6cb1335fa56f317
---
M files/icinga/check_mariadb.pl
1 file changed, 51 insertions(+), 21 deletions(-)

Approvals:
  Jcrespo: Verified; Looks good to me, approved



diff --git a/files/icinga/check_mariadb.pl b/files/icinga/check_mariadb.pl
index af0f0d5..84b0612 100755
--- a/files/icinga/check_mariadb.pl
+++ b/files/icinga/check_mariadb.pl
@@ -20,12 +20,15 @@
 my $user = "";
 my $pass = "";
 my $sock = "";
+my $master_server_id = "";
 
 my $sql_lag_warn = 30;
 my $sql_lag_crit = 60;
 
 # Warn when IO or SQL stopped cleanly (no errno)
 my $warn_stopped = 0;
+
+my $heartbeat_table = 'heartbeat.heartbeat';
 
 my @vars = ();
 
@@ -70,6 +73,10 @@
        elsif ($arg =~ /^--no-warn-stopped$/)
        {
                $warn_stopped = 0;
+       }
+       elsif ($arg =~ /^--master-server-id=(.+)$/)
+       {
+               $master_server_id = $1;
        }
        elsif ($arg =~ /^--set=(.+)$/)
        {
@@ -162,8 +169,18 @@
 
 if ($check eq "slave_sql_lag")
 {
-       # TODO: Make this check heartbeat
-
+# The slave lag is checked using the $heartbeat_table table,
+# usually created and updated by running pt-heartbeat on the
+# master.
+# For that, --master-server-id is strongly suggested to be
+# set. In case it is not, the lag from its direct master is
+# reported. If the heartbeat table does not exist, the record
+# for the master is not found, or any other error happens,
+# it falls back to using Seconds_Behind_Master.
+# If the server is not a slave, it returns OK. If lag cannot
+# be determined either by using heartbeat or seconds behind
+# master, it returns unknown, unless the replication was
+# stopped manually, in which case it optionally reports a warning.
        my $status = $db->selectrow_hashref("show slave status");
 
        unless ($status) {
@@ -171,36 +188,49 @@
                exit($EOK);
        }
 
-       # Either IO or SQL threads stopped? WARN
-       if ($status->{Slave_IO_Running} ne "Yes" || 
$status->{Slave_SQL_Running} ne "Yes") {
-               if ($warn_stopped == 1) {
-                       printf("%s %s Slave_IO_Running: %s, Slave_SQL_Running: 
%s\n",
-                               $WARN, $check, $status->{Slave_IO_Running}, 
$status->{Slave_SQL_Running});
-                       exit($EWARN);
-               }
-               printf("%s %s Slave_IO_Running: %s, Slave_SQL_Running: %s, (no 
error; intentional)\n",
-                       $OK, $check, $status->{Slave_IO_Running}, 
$status->{Slave_SQL_Running});
-               exit($EOK);
+       if ($master_server_id eq "") {
+               $master_server_id = $status->{Master_Server_Id};
        }
+       my $heartbeat = $db->selectrow_hashref("SELECT 
TIMESTAMPDIFF(MICROSECOND,ts,UTC_TIMESTAMP(6)) AS lag FROM heartbeat.heartbeat 
WHERE server_id = $master_server_id");
 
+       my $lag = 
$heartbeat->{lag}?$heartbeat->{lag}/1000000:$status->{Seconds_Behind_Master};
+
+       if ($lag eq "NULL") {
+               # Either IO or SQL threads stopped? WARN
+               if ($status->{Slave_IO_Running} ne "Yes" || 
$status->{Slave_SQL_Running} ne "Yes") {
+                       if ($warn_stopped == 1) {
+                               printf("%s %s Slave_IO_Running: %s, 
Slave_SQL_Running: %s\n",
+                                       $WARN, $check, 
$status->{Slave_IO_Running}, $status->{Slave_SQL_Running});
+                               exit($EWARN);
+                       }
+                       printf("%s %s Slave_IO_Running: %s, Slave_SQL_Running: 
%s, (no error; intentional)\n",
+                               $OK, $check, $status->{Slave_IO_Running}, 
$status->{Slave_SQL_Running});
+                       exit($EOK);
+               }
+               # lag could not be determined
+               printf("%s %s lag could not be determined\n", $UNKN, $check);
+               exit($EUNKN);
+
+       }
        # Small lag? OK
-       if ($status->{Seconds_Behind_Master} < $sql_lag_warn) {
-               printf("%s %s Seconds_Behind_Master: %s\n",
-                       $OK, $check, $status->{Seconds_Behind_Master});
+       if ($lag < $sql_lag_warn) {
+               printf("%s %s Replication lag: %s seconds\n",
+                       $OK, $check, $lag);
                exit($EOK);
        }
 
        # Medium lag? WARN
-       if ($status->{Seconds_Behind_Master} < $sql_lag_crit) {
-               printf("%s %s Seconds_Behind_Master: %s\n",
-                       $WARN, $check, $status->{Seconds_Behind_Master});
+       if ($lag < $sql_lag_crit) {
+               printf("%s %s Replication lag: %s seconds\n",
+                       $WARN, $check, $lag);
                exit($EWARN);
        }
 
-       printf("%s %s Seconds_Behind_Master: %s\n",
-               $CRIT, $check, $status->{Seconds_Behind_Master});
+       printf("%s %s Replication lag: %s seconds\n",
+               $CRIT, $check, $lag);
        exit($ECRIT);
 }
 
 printf("%s %s invalid check: %s\n", $UNKN, $check, $check);
-exit($EUNKN);
\ No newline at end of file
+exit($EUNKN);
+

-- 
To view, visit https://gerrit.wikimedia.org/r/253665
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I2d229173a0fdf5d91e6c95b4a6cb1335fa56f317
Gerrit-PatchSet: 5
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Jcrespo <jcre...@wikimedia.org>
Gerrit-Reviewer: Jcrespo <jcre...@wikimedia.org>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to