I have been using my patch successfully for the past several months
without any problems. Can this be included in the upcoming 2.5.8
release of ganglia? I haven't heard any comments on this patch yet. I
tested it and it can still be applied to the latest 2.5.8 sources from
cvs with a few minor line offsets:
# patch -p1 <../ganglia-2.5.6-gmetad-remember-good-host.patch
patching file gmetad/conf.c
Hunk #1 succeeded at 104 (offset -2 lines).
patching file gmetad/data_thread.c
patching file gmetad/gmetad.h
Hunk #1 succeeded at 104 with fuzz 2 (offset 8 lines).
~Jason
--
/------------------------------------------------------------------\
| Jason A. Smith Email: [EMAIL PROTECTED] |
| Atlas Computing Facility, Bldg. 510M Phone: (631)344-4226 |
| Brookhaven National Lab, P.O. Box 5000 Fax: (631)344-7616 |
| Upton, NY 11973-5000 |
\------------------------------------------------------------------/
--- Begin Message ---
I found a bug in my patch that affects only the current stable version [2.5.6];
it doesn't affect the 2.6.0 development version of the patch. I have
attached a new patch which fixes the bug. Has anyone else tried this?
Any chance it will be included in future versions? I haven't heard any
comments yet.
~Jason
On Tue, 2004-06-29 at 16:22, Jason A. Smith wrote:
> We have had problems with gmetad where things like nodes being down or other
> network errors have caused connection timeouts when gmetad is trying to
> get data from a source. When the first node, or the first few nodes, listed
> in your data_source are unreachable, the result is large gaps in the
> rrd plots for the affected cluster.
>
> I thought it would be nice if gmetad could remember which host it
> connected to successfully the last time and try the same host again
> first, the next time it attempts to read that data source.
>
> Attached is a patch which does exactly this. I have tested it here and
> it appears to be working well. I attached 2 versions of the same patch,
> one for the current stable version [2.5.6], and another for Matt's
> latest 2.6.0 development version.
>
> ~Jason
--
/------------------------------------------------------------------\
| Jason A. Smith Email: [EMAIL PROTECTED] |
| Atlas Computing Facility, Bldg. 510M Phone: (631)344-4226 |
| Brookhaven National Lab, P.O. Box 5000 Fax: (631)344-7616 |
| Upton, NY 11973-5000 |
\------------------------------------------------------------------/
diff -uNr ganglia-monitor-core-2.5.6-dist/gmetad/conf.c ganglia-monitor-core-2.5.6/gmetad/conf.c
--- ganglia-monitor-core-2.5.6-dist/gmetad/conf.c 2003-05-27 18:09:55.000000000 -0400
+++ ganglia-monitor-core-2.5.6/gmetad/conf.c 2004-06-29 15:03:03.000000000 -0400
@@ -106,6 +106,7 @@
err_quit("Unable to malloc sources array");
dslist->num_sources = 0;
+ dslist->last_good_index = -1;
for ( ; i< cmd->arg_count; i++)
{
diff -uNr ganglia-monitor-core-2.5.6-dist/gmetad/data_thread.c ganglia-monitor-core-2.5.6/gmetad/data_thread.c
--- ganglia-monitor-core-2.5.6-dist/gmetad/data_thread.c 2003-09-11 16:58:46.000000000 -0400
+++ ganglia-monitor-core-2.5.6/gmetad/data_thread.c 2004-07-28 17:07:44.000000000 -0400
@@ -54,13 +54,26 @@
for (;;)
{
gettimeofday(&start, NULL);
- for(i=0; i < d->num_sources; i++)
- {
- /* Find first viable source in list. */
- sock = g_tcp_socket_new ( d->sources[i] );
- if( sock )
- break;
- }
+ sock = NULL;
+
+ /* If we successfully read from a good data source last time then try the same host again first. */
+ if(d->last_good_index >= 0)
+ sock = g_tcp_socket_new ( d->sources[d->last_good_index] );
+
+ /* If there was no good connection last time or the above connect failed then try each host in the list. */
+ if(!sock)
+ {
+ for(i=0; i < d->num_sources; i++)
+ {
+ /* Find first viable source in list. */
+ sock = g_tcp_socket_new ( d->sources[i] );
+ if( sock )
+ {
+ d->last_good_index = i;
+ break;
+ }
+ }
+ }
if(!sock)
{
@@ -80,14 +93,14 @@
if( rval < 0 )
{
/* Error */
- err_msg("poll() error in data_thread");
+ err_msg("poll() error in data_thread for [%s] data source after %d bytes read", d->name, read_index);
d->dead = 1;
goto take_a_break;
}
else if (rval == 0)
{
/* No revents during timeout period */
- err_msg("poll() timeout");
+ err_msg("poll() timeout for [%s] data source after %d bytes read", d->name, read_index);
d->dead = 1;
goto take_a_break;
}
@@ -120,19 +133,19 @@
}
if( struct_poll.revents & POLLHUP )
{
- err_msg("The remote machine closed connection");
+ err_msg("The remote machine closed connection for [%s] data source after %d bytes read", d->name, read_index);
d->dead = 1;
goto take_a_break;
}
if( struct_poll.revents & POLLERR )
{
- err_msg("POLLERR!");
+ err_msg("POLLERR! for [%s] data source after %d bytes read", d->name, read_index);
d->dead = 1;
goto take_a_break;
}
if( struct_poll.revents & POLLNVAL )
{
- err_msg("POLLNVAL!");
+ err_msg("POLLNVAL! for [%s] data source after %d bytes read", d->name, read_index);
d->dead = 1;
goto take_a_break;
}
@@ -157,6 +170,10 @@
take_a_break:
g_tcp_socket_delete(sock);
+ /* Don't remember this host if there was a problem */
+ if(d->dead)
+ d->last_good_index = -1;
+
gettimeofday(&end, NULL);
/* Sleep somewhere between (step +/- 5sec.) */
sleep_time = (d->step - 5) + (10 * (rand()/(float)RAND_MAX)) - (end.tv_sec - start.tv_sec);
diff -uNr ganglia-monitor-core-2.5.6-dist/gmetad/gmetad.h ganglia-monitor-core-2.5.6/gmetad/gmetad.h
--- ganglia-monitor-core-2.5.6-dist/gmetad/gmetad.h 2003-05-27 18:09:56.000000000 -0400
+++ ganglia-monitor-core-2.5.6/gmetad/gmetad.h 2004-06-29 15:05:00.000000000 -0400
@@ -96,6 +96,7 @@
g_inet_addr **sources;
long double timestamp; /* added by swagner */
int dead;
+ int last_good_index;
}
data_source_list_t;
diff -uNr ganglia-2.6.0-20040609-dist/gmetad/conf.c ganglia-2.6.0-20040609/gmetad/conf.c
--- ganglia-2.6.0-20040609-dist/gmetad/conf.c 2004-06-09 21:04:37.000000000 -0400
+++ ganglia-2.6.0-20040609/gmetad/conf.c 2004-06-29 13:25:57.000000000 -0400
@@ -153,6 +153,7 @@
err_quit("Unable to malloc ports array");
dslist->num_sources = 0;
+ dslist->last_good_index = -1;
for ( ; i< cmd->arg_count; i++)
{
diff -uNr ganglia-2.6.0-20040609-dist/gmetad/data_thread.c ganglia-2.6.0-20040609/gmetad/data_thread.c
--- ganglia-2.6.0-20040609-dist/gmetad/data_thread.c 2004-05-26 16:15:22.000000000 -0400
+++ ganglia-2.6.0-20040609/gmetad/data_thread.c 2004-06-29 14:54:23.000000000 -0400
@@ -59,13 +59,24 @@
/* Find the first viable source in list */
sock = -1;
- for(i=0; i < d->num_sources; i++)
+ /* If we successfully read from a good data source last time then try the same host again first. */
+ if(d->last_good_index >= 0)
+ sock = tcp_connect( d->names[d->last_good_index], d->ports[d->last_good_index]);
+
+ /* If there was no good connection last time or the above connect failed then try each host in the list. */
+ if(sock < 0)
{
- sock = tcp_connect( d->names[i], d->ports[i]);
- if(sock >= 0)
- break; /* success */
+ for(i=0; i < d->num_sources; i++)
+ {
+ sock = tcp_connect( d->names[i], d->ports[i]);
+ if(sock >= 0)
+ {
+ d->last_good_index = i;
+ break; /* success */
+ }
+ }
}
-
+
if(sock < 0)
{
err_msg("data_thread() got no answer from any [%s] datasource", d->name);
@@ -167,6 +178,10 @@
ganglia_gzclose(gz);
gz= NULL;
}
+
+ /* Don't remember this host if there was a problem */
+ if(d->dead)
+ d->last_good_index = -1;
gettimeofday(&end, NULL);
diff -uNr ganglia-2.6.0-20040609-dist/gmetad/gmetad.h ganglia-2.6.0-20040609/gmetad/gmetad.h
--- ganglia-2.6.0-20040609-dist/gmetad/gmetad.h 2004-06-09 18:05:17.000000000 -0400
+++ ganglia-2.6.0-20040609/gmetad/gmetad.h 2004-06-29 13:10:49.000000000 -0400
@@ -102,6 +102,7 @@
long double timestamp; /* added by swagner */
int dead;
int last_heard_from;
+ int last_good_index;
}
data_source_list_t;
--- End Message ---