I found a bug in my patch that affects only the version for the current stable release [2.5.6]; it doesn't affect the 2.6.0 development version of the patch. I have attached a new patch which fixes the bug. Has anyone else tried this? Is there any chance it will be included in future versions? I haven't heard any comments yet.
~Jason On Tue, 2004-06-29 at 16:22, Jason A. Smith wrote: > We have had problems with gmetad where things like nodes down or other > network errors have caused connection timeouts when gmetad is trying to > get data from a source. The effect of having the first or a few of the > first nodes listed in your data_source unreachable is large gaps in the > rrd plots for the affected cluster. > > I thought it would be nice if gmetad could remember which host it > connected to successfully the last time and try the same host again > first, the next time it attempts to read that data source. > > Attached is a patch which does exactly this. I have tested it here and > it appears to be working well. I attached 2 versions of the same patch, > one for the current stable version [2.5.6], and another for Matt's > latest 2.6.0 development version. > > ~Jason -- /------------------------------------------------------------------\ | Jason A. Smith Email: [EMAIL PROTECTED] | | Atlas Computing Facility, Bldg. 510M Phone: (631)344-4226 | | Brookhaven National Lab, P.O. Box 5000 Fax: (631)344-7616 | | Upton, NY 11973-5000 | \------------------------------------------------------------------/
diff -uNr ganglia-monitor-core-2.5.6-dist/gmetad/conf.c ganglia-monitor-core-2.5.6/gmetad/conf.c --- ganglia-monitor-core-2.5.6-dist/gmetad/conf.c 2003-05-27 18:09:55.000000000 -0400 +++ ganglia-monitor-core-2.5.6/gmetad/conf.c 2004-06-29 15:03:03.000000000 -0400 @@ -106,6 +106,7 @@ err_quit("Unable to malloc sources array"); dslist->num_sources = 0; + dslist->last_good_index = -1; for ( ; i< cmd->arg_count; i++) { diff -uNr ganglia-monitor-core-2.5.6-dist/gmetad/data_thread.c ganglia-monitor-core-2.5.6/gmetad/data_thread.c --- ganglia-monitor-core-2.5.6-dist/gmetad/data_thread.c 2003-09-11 16:58:46.000000000 -0400 +++ ganglia-monitor-core-2.5.6/gmetad/data_thread.c 2004-07-28 17:07:44.000000000 -0400 @@ -54,13 +54,26 @@ for (;;) { gettimeofday(&start, NULL); - for(i=0; i < d->num_sources; i++) - { - /* Find first viable source in list. */ - sock = g_tcp_socket_new ( d->sources[i] ); - if( sock ) - break; - } + sock = NULL; + + /* If we successfully read from a good data source last time then try the same host again first. */ + if(d->last_good_index >= 0) + sock = g_tcp_socket_new ( d->sources[d->last_good_index] ); + + /* If there was no good connection last time or the above connect failed then try each host in the list. */ + if(!sock) + { + for(i=0; i < d->num_sources; i++) + { + /* Find first viable source in list. 
*/ + sock = g_tcp_socket_new ( d->sources[i] ); + if( sock ) + { + d->last_good_index = i; + break; + } + } + } if(!sock) { @@ -80,14 +93,14 @@ if( rval < 0 ) { /* Error */ - err_msg("poll() error in data_thread"); + err_msg("poll() error in data_thread for [%s] data source after %d bytes read", d->name, read_index); d->dead = 1; goto take_a_break; } else if (rval == 0) { /* No revents during timeout period */ - err_msg("poll() timeout"); + err_msg("poll() timeout for [%s] data source after %d bytes read", d->name, read_index); d->dead = 1; goto take_a_break; } @@ -120,19 +133,19 @@ } if( struct_poll.revents & POLLHUP ) { - err_msg("The remote machine closed connection"); + err_msg("The remote machine closed connection for [%s] data source after %d bytes read", d->name, read_index); d->dead = 1; goto take_a_break; } if( struct_poll.revents & POLLERR ) { - err_msg("POLLERR!"); + err_msg("POLLERR! for [%s] data source after %d bytes read", d->name, read_index); d->dead = 1; goto take_a_break; } if( struct_poll.revents & POLLNVAL ) { - err_msg("POLLNVAL!"); + err_msg("POLLNVAL! for [%s] data source after %d bytes read", d->name, read_index); d->dead = 1; goto take_a_break; } @@ -157,6 +170,10 @@ take_a_break: g_tcp_socket_delete(sock); + /* Don't remember this host if there was a problem */ + if(d->dead) + d->last_good_index = -1; + gettimeofday(&end, NULL); /* Sleep somewhere between (step +/- 5sec.) */ sleep_time = (d->step - 5) + (10 * (rand()/(float)RAND_MAX)) - (end.tv_sec - start.tv_sec); diff -uNr ganglia-monitor-core-2.5.6-dist/gmetad/gmetad.h ganglia-monitor-core-2.5.6/gmetad/gmetad.h --- ganglia-monitor-core-2.5.6-dist/gmetad/gmetad.h 2003-05-27 18:09:56.000000000 -0400 +++ ganglia-monitor-core-2.5.6/gmetad/gmetad.h 2004-06-29 15:05:00.000000000 -0400 @@ -96,6 +96,7 @@ g_inet_addr **sources; long double timestamp; /* added by swagner */ int dead; + int last_good_index; } data_source_list_t;
diff -uNr ganglia-2.6.0-20040609-dist/gmetad/conf.c ganglia-2.6.0-20040609/gmetad/conf.c --- ganglia-2.6.0-20040609-dist/gmetad/conf.c 2004-06-09 21:04:37.000000000 -0400 +++ ganglia-2.6.0-20040609/gmetad/conf.c 2004-06-29 13:25:57.000000000 -0400 @@ -153,6 +153,7 @@ err_quit("Unable to malloc ports array"); dslist->num_sources = 0; + dslist->last_good_index = -1; for ( ; i< cmd->arg_count; i++) { diff -uNr ganglia-2.6.0-20040609-dist/gmetad/data_thread.c ganglia-2.6.0-20040609/gmetad/data_thread.c --- ganglia-2.6.0-20040609-dist/gmetad/data_thread.c 2004-05-26 16:15:22.000000000 -0400 +++ ganglia-2.6.0-20040609/gmetad/data_thread.c 2004-06-29 14:54:23.000000000 -0400 @@ -59,13 +59,24 @@ /* Find the first viable source in list */ sock = -1; - for(i=0; i < d->num_sources; i++) + /* If we successfully read from a good data source last time then try the same host again first. */ + if(d->last_good_index >= 0) + sock = tcp_connect( d->names[d->last_good_index], d->ports[d->last_good_index]); + + /* If there was no good connection last time or the above connect failed then try each host in the list. 
*/ + if(sock < 0) { - sock = tcp_connect( d->names[i], d->ports[i]); - if(sock >= 0) - break; /* success */ + for(i=0; i < d->num_sources; i++) + { + sock = tcp_connect( d->names[i], d->ports[i]); + if(sock >= 0) + { + d->last_good_index = i; + break; /* success */ + } + } } - + if(sock < 0) { err_msg("data_thread() got no answer from any [%s] datasource", d->name); @@ -167,6 +178,10 @@ ganglia_gzclose(gz); gz= NULL; } + + /* Don't remember this host if there was a problem */ + if(d->dead) + d->last_good_index = -1; gettimeofday(&end, NULL); diff -uNr ganglia-2.6.0-20040609-dist/gmetad/gmetad.h ganglia-2.6.0-20040609/gmetad/gmetad.h --- ganglia-2.6.0-20040609-dist/gmetad/gmetad.h 2004-06-09 18:05:17.000000000 -0400 +++ ganglia-2.6.0-20040609/gmetad/gmetad.h 2004-06-29 13:10:49.000000000 -0400 @@ -102,6 +102,7 @@ long double timestamp; /* added by swagner */ int dead; int last_heard_from; + int last_good_index; } data_source_list_t;