We have had problems with gmetad where down nodes or other network errors cause connection timeouts when gmetad tries to get data from a source. When the first node, or the first few nodes, listed in a data_source are unreachable, the result is large gaps in the rrd plots for the affected cluster.
I thought it would be nice if gmetad could remember which host it last connected to successfully and try that same host first the next time it attempts to read that data source. Attached is a patch which does exactly this. I have tested it here and it appears to be working well. I attached 2 versions of the same patch, one for the current stable version [2.5.6], and another for Matt's latest 2.6.0 development version. ~Jason -- /------------------------------------------------------------------\ | Jason A. Smith Email: [EMAIL PROTECTED] | | Atlas Computing Facility, Bldg. 510M Phone: (631)344-4226 | | Brookhaven National Lab, P.O. Box 5000 Fax: (631)344-7616 | | Upton, NY 11973-5000 | \------------------------------------------------------------------/
diff -uNr ganglia-monitor-core-2.5.6-dist/gmetad/conf.c ganglia-monitor-core-2.5.6/gmetad/conf.c --- ganglia-monitor-core-2.5.6-dist/gmetad/conf.c 2003-05-27 18:09:55.000000000 -0400 +++ ganglia-monitor-core-2.5.6/gmetad/conf.c 2004-06-29 15:03:03.000000000 -0400 @@ -106,6 +106,7 @@ err_quit("Unable to malloc sources array"); dslist->num_sources = 0; + dslist->last_good_index = -1; for ( ; i< cmd->arg_count; i++) { diff -uNr ganglia-monitor-core-2.5.6-dist/gmetad/data_thread.c ganglia-monitor-core-2.5.6/gmetad/data_thread.c --- ganglia-monitor-core-2.5.6-dist/gmetad/data_thread.c 2003-09-11 16:58:46.000000000 -0400 +++ ganglia-monitor-core-2.5.6/gmetad/data_thread.c 2004-06-29 15:55:55.000000000 -0400 @@ -54,13 +54,25 @@ for (;;) { gettimeofday(&start, NULL); - for(i=0; i < d->num_sources; i++) - { - /* Find first viable source in list. */ - sock = g_tcp_socket_new ( d->sources[i] ); - if( sock ) - break; - } + + /* If we successfully read from a good data source last time then try the same host again first. */ + if(d->last_good_index >= 0) + sock = g_tcp_socket_new ( d->sources[d->last_good_index] ); + + /* If there was no good connection last time or the above connect failed then try each host in the list. */ + if(!sock) + { + for(i=0; i < d->num_sources; i++) + { + /* Find first viable source in list. */ + sock = g_tcp_socket_new ( d->sources[i] ); + if( sock ) + { + d->last_good_index = i; + break; + } + } + } if(!sock) { @@ -157,6 +169,10 @@ take_a_break: g_tcp_socket_delete(sock); + /* Don't remember this host if there was a problem */ + if(d->dead) + d->last_good_index = -1; + gettimeofday(&end, NULL); /* Sleep somewhere between (step +/- 5sec.) 
*/ sleep_time = (d->step - 5) + (10 * (rand()/(float)RAND_MAX)) - (end.tv_sec - start.tv_sec); diff -uNr ganglia-monitor-core-2.5.6-dist/gmetad/gmetad.h ganglia-monitor-core-2.5.6/gmetad/gmetad.h --- ganglia-monitor-core-2.5.6-dist/gmetad/gmetad.h 2003-05-27 18:09:56.000000000 -0400 +++ ganglia-monitor-core-2.5.6/gmetad/gmetad.h 2004-06-29 15:05:00.000000000 -0400 @@ -96,6 +96,7 @@ g_inet_addr **sources; long double timestamp; /* added by swagner */ int dead; + int last_good_index; } data_source_list_t;
diff -uNr ganglia-2.6.0-20040609-dist/gmetad/conf.c ganglia-2.6.0-20040609/gmetad/conf.c --- ganglia-2.6.0-20040609-dist/gmetad/conf.c 2004-06-09 21:04:37.000000000 -0400 +++ ganglia-2.6.0-20040609/gmetad/conf.c 2004-06-29 13:25:57.000000000 -0400 @@ -153,6 +153,7 @@ err_quit("Unable to malloc ports array"); dslist->num_sources = 0; + dslist->last_good_index = -1; for ( ; i< cmd->arg_count; i++) { diff -uNr ganglia-2.6.0-20040609-dist/gmetad/data_thread.c ganglia-2.6.0-20040609/gmetad/data_thread.c --- ganglia-2.6.0-20040609-dist/gmetad/data_thread.c 2004-05-26 16:15:22.000000000 -0400 +++ ganglia-2.6.0-20040609/gmetad/data_thread.c 2004-06-29 14:54:23.000000000 -0400 @@ -59,13 +59,24 @@ /* Find the first viable source in list */ sock = -1; - for(i=0; i < d->num_sources; i++) + /* If we successfully read from a good data source last time then try the same host again first. */ + if(d->last_good_index >= 0) + sock = tcp_connect( d->names[d->last_good_index], d->ports[d->last_good_index]); + + /* If there was no good connection last time or the above connect failed then try each host in the list. 
*/ + if(sock < 0) { - sock = tcp_connect( d->names[i], d->ports[i]); - if(sock >= 0) - break; /* success */ + for(i=0; i < d->num_sources; i++) + { + sock = tcp_connect( d->names[i], d->ports[i]); + if(sock >= 0) + { + d->last_good_index = i; + break; /* success */ + } + } } - + if(sock < 0) { err_msg("data_thread() got no answer from any [%s] datasource", d->name); @@ -167,6 +178,10 @@ ganglia_gzclose(gz); gz= NULL; } + + /* Don't remember this host if there was a problem */ + if(d->dead) + d->last_good_index = -1; gettimeofday(&end, NULL); diff -uNr ganglia-2.6.0-20040609-dist/gmetad/gmetad.h ganglia-2.6.0-20040609/gmetad/gmetad.h --- ganglia-2.6.0-20040609-dist/gmetad/gmetad.h 2004-06-09 18:05:17.000000000 -0400 +++ ganglia-2.6.0-20040609/gmetad/gmetad.h 2004-06-29 13:10:49.000000000 -0400 @@ -102,6 +102,7 @@ long double timestamp; /* added by swagner */ int dead; int last_heard_from; + int last_good_index; } data_source_list_t;