I found a bug in my patch that affects only the current stable version
[2.5.6]; it doesn't affect the 2.6.0 development version of the patch.  I
have attached a new patch which fixes the bug.  Has anyone else tried this?
Any chance it will be included in future versions?  I haven't heard any
comments yet.

~Jason


On Tue, 2004-06-29 at 16:22, Jason A. Smith wrote:
> We have had problems with gmetad where things like nodes down or other
> network errors have caused connection timeouts when gmetad is trying to
> get data from a source.  When the first node, or the first few nodes,
> listed in your data_source are unreachable, the result is large gaps in
> the rrd plots for the affected cluster.
> 
> I thought it would be nice if gmetad could remember which host it
> connected to successfully the last time and try the same host again
> first, the next time it attempts to read that data source.
> 
> Attached is a patch which does exactly this.  I have tested it here and
> it appears to be working well.  I attached 2 versions of the same patch,
> one for the current stable version [2.5.6], and another for Matt's
> latest 2.6.0 development version.
> 
> ~Jason
-- 
/------------------------------------------------------------------\
|  Jason A. Smith                          Email:  [EMAIL PROTECTED] |
|  Atlas Computing Facility, Bldg. 510M    Phone:  (631)344-4226   |
|  Brookhaven National Lab, P.O. Box 5000  Fax:    (631)344-7616   |
|  Upton, NY 11973-5000                                            |
\------------------------------------------------------------------/

diff -uNr ganglia-monitor-core-2.5.6-dist/gmetad/conf.c ganglia-monitor-core-2.5.6/gmetad/conf.c
--- ganglia-monitor-core-2.5.6-dist/gmetad/conf.c	2003-05-27 18:09:55.000000000 -0400
+++ ganglia-monitor-core-2.5.6/gmetad/conf.c	2004-06-29 15:03:03.000000000 -0400
@@ -106,6 +106,7 @@
       err_quit("Unable to malloc sources array");
 
    dslist->num_sources = 0;
+   dslist->last_good_index = -1;
 
    for ( ; i< cmd->arg_count; i++)
       {
diff -uNr ganglia-monitor-core-2.5.6-dist/gmetad/data_thread.c ganglia-monitor-core-2.5.6/gmetad/data_thread.c
--- ganglia-monitor-core-2.5.6-dist/gmetad/data_thread.c	2003-09-11 16:58:46.000000000 -0400
+++ ganglia-monitor-core-2.5.6/gmetad/data_thread.c	2004-07-28 17:07:44.000000000 -0400
@@ -54,13 +54,26 @@
    for (;;)
       {
          gettimeofday(&start, NULL);
-         for(i=0; i < d->num_sources; i++)
-            {
-               /* Find first viable source in list. */
-               sock = g_tcp_socket_new ( d->sources[i] );
-               if( sock )
-                  break;
-            }
+	 sock = NULL;
+	 
+	 /* If we successfully read from a good data source last time then try the same host again first. */
+	 if(d->last_good_index >= 0)
+	   sock = g_tcp_socket_new ( d->sources[d->last_good_index] );
+
+	 /* If there was no good connection last time or the above connect failed then try each host in the list. */
+	 if(!sock)
+           {
+             for(i=0; i < d->num_sources; i++)
+               {
+                 /* Find first viable source in list. */
+                 sock = g_tcp_socket_new ( d->sources[i] );
+                 if( sock )
+                   {
+                     d->last_good_index = i;
+                     break;
+                   }
+               }
+           }
 
          if(!sock)
             {
@@ -80,14 +93,14 @@
                if( rval < 0 )
                   {
                      /* Error */
-                     err_msg("poll() error in data_thread");
+		    err_msg("poll() error in data_thread for [%s] data source after %d bytes read", d->name, read_index);
                      d->dead = 1;
                      goto take_a_break;
                   }
                else if (rval == 0)
                   {
                      /* No revents during timeout period */
-                     err_msg("poll() timeout");
+                     err_msg("poll() timeout for [%s] data source after %d bytes read", d->name, read_index);
                      d->dead = 1;
                      goto take_a_break; 
                   }
@@ -120,19 +133,19 @@
                         }
                      if( struct_poll.revents & POLLHUP )
                         {
-                           err_msg("The remote machine closed connection");
+                           err_msg("The remote machine closed connection for [%s] data source after %d bytes read", d->name, read_index);
                            d->dead = 1;
                            goto take_a_break;
                         }
                      if( struct_poll.revents & POLLERR )
                         {
-                           err_msg("POLLERR!");
+                           err_msg("POLLERR! for [%s] data source after %d bytes read", d->name, read_index);
                            d->dead = 1;
                            goto take_a_break;
                         }
                      if( struct_poll.revents & POLLNVAL )
                         {
-                           err_msg("POLLNVAL!");
+                           err_msg("POLLNVAL! for [%s] data source after %d bytes read", d->name, read_index);
                            d->dead = 1;
                            goto take_a_break;
                         }
@@ -157,6 +170,10 @@
        take_a_break:
          g_tcp_socket_delete(sock);
 
+	 /* Don't remember this host if there was a problem */
+	 if(d->dead)
+           d->last_good_index = -1;
+
          gettimeofday(&end, NULL);
          /* Sleep somewhere between (step +/- 5sec.) */
          sleep_time = (d->step - 5) + (10 * (rand()/(float)RAND_MAX)) - (end.tv_sec - start.tv_sec);
diff -uNr ganglia-monitor-core-2.5.6-dist/gmetad/gmetad.h ganglia-monitor-core-2.5.6/gmetad/gmetad.h
--- ganglia-monitor-core-2.5.6-dist/gmetad/gmetad.h	2003-05-27 18:09:56.000000000 -0400
+++ ganglia-monitor-core-2.5.6/gmetad/gmetad.h	2004-06-29 15:05:00.000000000 -0400
@@ -96,6 +96,7 @@
       g_inet_addr **sources;
       long double timestamp;   /* added by swagner */
       int dead;
+      int last_good_index;
    }
 data_source_list_t;
 
diff -uNr ganglia-2.6.0-20040609-dist/gmetad/conf.c ganglia-2.6.0-20040609/gmetad/conf.c
--- ganglia-2.6.0-20040609-dist/gmetad/conf.c	2004-06-09 21:04:37.000000000 -0400
+++ ganglia-2.6.0-20040609/gmetad/conf.c	2004-06-29 13:25:57.000000000 -0400
@@ -153,6 +153,7 @@
       err_quit("Unable to malloc ports array");
 
    dslist->num_sources = 0;
+   dslist->last_good_index = -1;
 
    for ( ; i< cmd->arg_count; i++)
       {
diff -uNr ganglia-2.6.0-20040609-dist/gmetad/data_thread.c ganglia-2.6.0-20040609/gmetad/data_thread.c
--- ganglia-2.6.0-20040609-dist/gmetad/data_thread.c	2004-05-26 16:15:22.000000000 -0400
+++ ganglia-2.6.0-20040609/gmetad/data_thread.c	2004-06-29 14:54:23.000000000 -0400
@@ -59,13 +59,24 @@
     
        /* Find the first viable source in list */
        sock = -1;
-       for(i=0; i < d->num_sources; i++)
+       /* If we successfully read from a good data source last time then try the same host again first. */
+       if(d->last_good_index >= 0)
+         sock = tcp_connect( d->names[d->last_good_index], d->ports[d->last_good_index]);
+       
+       /* If there was no good connection last time or the above connect failed then try each host in the list. */
+       if(sock < 0)
          {
-           sock = tcp_connect( d->names[i], d->ports[i]);
-           if(sock >= 0)
-             break; /* success */
+           for(i=0; i < d->num_sources; i++)
+             {
+               sock = tcp_connect( d->names[i], d->ports[i]);
+               if(sock >= 0)
+                 {
+                   d->last_good_index = i;
+                   break; /* success */
+                 }
+             }
          }
-    
+     
        if(sock < 0)
          {
            err_msg("data_thread() got no answer from any [%s] datasource", d->name);
@@ -167,6 +178,10 @@
            ganglia_gzclose(gz);
            gz= NULL;
          }
+       
+       /* Don't remember this host if there was a problem */
+       if(d->dead)
+         d->last_good_index = -1;
      
        gettimeofday(&end, NULL);
     
diff -uNr ganglia-2.6.0-20040609-dist/gmetad/gmetad.h ganglia-2.6.0-20040609/gmetad/gmetad.h
--- ganglia-2.6.0-20040609-dist/gmetad/gmetad.h	2004-06-09 18:05:17.000000000 -0400
+++ ganglia-2.6.0-20040609/gmetad/gmetad.h	2004-06-29 13:10:49.000000000 -0400
@@ -102,6 +102,7 @@
       long double timestamp;   /* added by swagner */
       int dead;
       int last_heard_from;
+      int last_good_index;
    }
 data_source_list_t;
 

Reply via email to