We have had problems with gmetad where down nodes or other network
errors cause connection timeouts when gmetad is trying to get data
from a source.  When the first node, or the first few nodes, listed in
a data_source are unreachable, the result is large gaps in the rrd
plots for the affected cluster.

I thought it would be nice if gmetad could remember which host it
connected to successfully the last time and try the same host again
first, the next time it attempts to read that data source.

Attached is a patch which does exactly this.  I have tested it here and
it appears to be working well.  I have attached two versions of the same
patch: one for the current stable version [2.5.6], and another for Matt's
latest 2.6.0 development version.

~Jason


-- 
/------------------------------------------------------------------\
|  Jason A. Smith                          Email:  [EMAIL PROTECTED] |
|  Atlas Computing Facility, Bldg. 510M    Phone:  (631)344-4226   |
|  Brookhaven National Lab, P.O. Box 5000  Fax:    (631)344-7616   |
|  Upton, NY 11973-5000                                            |
\------------------------------------------------------------------/

diff -uNr ganglia-monitor-core-2.5.6-dist/gmetad/conf.c ganglia-monitor-core-2.5.6/gmetad/conf.c
--- ganglia-monitor-core-2.5.6-dist/gmetad/conf.c	2003-05-27 18:09:55.000000000 -0400
+++ ganglia-monitor-core-2.5.6/gmetad/conf.c	2004-06-29 15:03:03.000000000 -0400
@@ -106,6 +106,7 @@
       err_quit("Unable to malloc sources array");
 
    dslist->num_sources = 0;
+   dslist->last_good_index = -1;
 
    for ( ; i< cmd->arg_count; i++)
       {
diff -uNr ganglia-monitor-core-2.5.6-dist/gmetad/data_thread.c ganglia-monitor-core-2.5.6/gmetad/data_thread.c
--- ganglia-monitor-core-2.5.6-dist/gmetad/data_thread.c	2003-09-11 16:58:46.000000000 -0400
+++ ganglia-monitor-core-2.5.6/gmetad/data_thread.c	2004-06-29 15:55:55.000000000 -0400
@@ -54,13 +54,25 @@
    for (;;)
       {
          gettimeofday(&start, NULL);
-         for(i=0; i < d->num_sources; i++)
-            {
-               /* Find first viable source in list. */
-               sock = g_tcp_socket_new ( d->sources[i] );
-               if( sock )
-                  break;
-            }
+	 
+	 /* If we successfully read from a good data source last time then try the same host again first. */
+	 if(d->last_good_index >= 0)
+	   sock = g_tcp_socket_new ( d->sources[d->last_good_index] );
+
+	 /* If there was no good connection last time or the above connect failed then try each host in the list. */
+	 if(!sock)
+           {
+             for(i=0; i < d->num_sources; i++)
+               {
+                 /* Find first viable source in list. */
+                 sock = g_tcp_socket_new ( d->sources[i] );
+                 if( sock )
+                   {
+                     d->last_good_index = i;
+                     break;
+                   }
+               }
+           }
 
          if(!sock)
             {
@@ -157,6 +169,10 @@
        take_a_break:
          g_tcp_socket_delete(sock);
 
+	 /* Don't remember this host if there was a problem */
+	 if(d->dead)
+           d->last_good_index = -1;
+
          gettimeofday(&end, NULL);
          /* Sleep somewhere between (step +/- 5sec.) */
          sleep_time = (d->step - 5) + (10 * (rand()/(float)RAND_MAX)) - (end.tv_sec - start.tv_sec);
diff -uNr ganglia-monitor-core-2.5.6-dist/gmetad/gmetad.h ganglia-monitor-core-2.5.6/gmetad/gmetad.h
--- ganglia-monitor-core-2.5.6-dist/gmetad/gmetad.h	2003-05-27 18:09:56.000000000 -0400
+++ ganglia-monitor-core-2.5.6/gmetad/gmetad.h	2004-06-29 15:05:00.000000000 -0400
@@ -96,6 +96,7 @@
       g_inet_addr **sources;
       long double timestamp;   /* added by swagner */
       int dead;
+      int last_good_index;
    }
 data_source_list_t;
 
diff -uNr ganglia-2.6.0-20040609-dist/gmetad/conf.c ganglia-2.6.0-20040609/gmetad/conf.c
--- ganglia-2.6.0-20040609-dist/gmetad/conf.c	2004-06-09 21:04:37.000000000 -0400
+++ ganglia-2.6.0-20040609/gmetad/conf.c	2004-06-29 13:25:57.000000000 -0400
@@ -153,6 +153,7 @@
       err_quit("Unable to malloc ports array");
 
    dslist->num_sources = 0;
+   dslist->last_good_index = -1;
 
    for ( ; i< cmd->arg_count; i++)
       {
diff -uNr ganglia-2.6.0-20040609-dist/gmetad/data_thread.c ganglia-2.6.0-20040609/gmetad/data_thread.c
--- ganglia-2.6.0-20040609-dist/gmetad/data_thread.c	2004-05-26 16:15:22.000000000 -0400
+++ ganglia-2.6.0-20040609/gmetad/data_thread.c	2004-06-29 14:54:23.000000000 -0400
@@ -59,13 +59,24 @@
     
        /* Find the first viable source in list */
        sock = -1;
-       for(i=0; i < d->num_sources; i++)
+       /* If we successfully read from a good data source last time then try the same host again first. */
+       if(d->last_good_index >= 0)
+         sock = tcp_connect( d->names[d->last_good_index], d->ports[d->last_good_index]);
+       
+       /* If there was no good connection last time or the above connect failed then try each host in the list. */
+       if(sock < 0)
          {
-           sock = tcp_connect( d->names[i], d->ports[i]);
-           if(sock >= 0)
-             break; /* success */
+           for(i=0; i < d->num_sources; i++)
+             {
+               sock = tcp_connect( d->names[i], d->ports[i]);
+               if(sock >= 0)
+                 {
+                   d->last_good_index = i;
+                   break; /* success */
+                 }
+             }
          }
-    
+     
        if(sock < 0)
          {
            err_msg("data_thread() got no answer from any [%s] datasource", d->name);
@@ -167,6 +178,10 @@
            ganglia_gzclose(gz);
            gz= NULL;
          }
+       
+       /* Don't remember this host if there was a problem */
+       if(d->dead)
+         d->last_good_index = -1;
      
        gettimeofday(&end, NULL);
     
diff -uNr ganglia-2.6.0-20040609-dist/gmetad/gmetad.h ganglia-2.6.0-20040609/gmetad/gmetad.h
--- ganglia-2.6.0-20040609-dist/gmetad/gmetad.h	2004-06-09 18:05:17.000000000 -0400
+++ ganglia-2.6.0-20040609/gmetad/gmetad.h	2004-06-29 13:10:49.000000000 -0400
@@ -102,6 +102,7 @@
       long double timestamp;   /* added by swagner */
       int dead;
       int last_heard_from;
+      int last_good_index;
    }
 data_source_list_t;
 

Reply via email to