How about extending ganglia to collect ps information?
Suppose we add to the XML something like:

   <!ELEMENT PROCESS EMPTY>
   <!ATTLIST PROCESS NAME    CDATA #REQUIRED
                     USER    CDATA #REQUIRED
                     PID     CDATA #REQUIRED
                     CPU     CDATA #REQUIRED
                     MEM     CDATA #REQUIRED
                     SZ      CDATA #REQUIRED
                     RSS     CDATA #REQUIRED
                     STATUS  CDATA #REQUIRED
                     ..... whatever else looks useful
                     >

And the per-node output would look like:


<HOST NAME="compute-0-2" IP="10.255.255.252" REPORTED="1013270664">
<METRIC NAME="mem_free" VAL="475380" TYPE="uint32" UNITS="KBs" SOURCE="gmond"/>

[....]

<METRIC NAME="os_release" VAL="2.4.9-13smp" TYPE="string" UNITS="" 
SOURCE="gmond"/>

<PROCESS NAME="mozilla-bin" USER="asaph" PID="13845" CPU="12.4" MEM="22.3"
SZ="62008" RSS="55352" STATUS="S" />

[...]

<PROCESS NAME="/bin/csh" USER="asaph" PID="13840" CPU="0.0" MEM="0.3"
SZ="3872" RSS="2259" STATUS="S" />
</HOST>


We could then easily implement a cluster-wide ps utility.

On the negative side, this style of implementation would
tend to return stale information. You wouldn't want to broadcast
this information more than once every few seconds, so anybody
using the feature would always be seeing the state of the processes
as they were a few seconds ago.

On the plus side, this gives us a bound on the bandwidth consumed
by the cluster-wide ps function. We know that no matter how
many people retrieve the cluster-wide ps information, we will
not consume more than N*process_list_size/sample_interval of bandwidth
(N nodes, each broadcasting its process list once per sample interval).

Moreover, since applications running on clusters tend to be
long-lived, perhaps using somewhat stale information is no
big deal.

Thoughts?

                Asaph



On Tue, Apr 09, 2002 at 12:53:29PM -0700, matt massie wrote:
> asaph-
> 
> this is a much better way of collecting the metrics on linux.  i like that 
> your method eliminates 3 threads and all the mutex locking.  i'll 
> try out the code and likely include it in the next release.
> 
> -matt
> 
> Today, Asaph Zemach wrote forth saying...
> 
> > Here is a drop-in replacement to linux.c that does not
> > use the extra threads and gets rid of the now-unneeded 
> > locking. It seems to work. I think it's a little cleaner
> > and more maintainable (e.g. no forgotten locking) for the future.
> > 
> > Decide if you want to keep it.
> > 
> >             Asaph
> > 
> > 
> > ------------------------------------------------------------------
> > #include <time.h>
> > #include "ganglia.h"
> > #include "metric_typedefs.h"
> > 
> > /*
> > #include "set_metric_val.h"
> > */
> > 
> > #define OSNAME "Linux"
> > #define OSNAME_LEN strlen(OSNAME)
> > 
> > /* Never changes */
> > char proc_cpuinfo[BUFFSIZE];
> > char proc_sys_kernel_osrelease[BUFFSIZE];
> > 
> > typedef struct {
> >   int last_read;
> >   int thresh;
> >   char *name;
> >   char buffer[BUFFSIZE];
> > } timely_file;
> > 
> > timely_file proc_stat    = { 0, 15, "/proc/stat" };
> > timely_file proc_loadavg = { 0, 15, "/proc/loadavg" };
> > timely_file proc_meminfo = { 0, 30, "/proc/meminfo" };
> > 
> > char *update_file(timely_file *tf)
> > {
> >   int now,rval;
> >   now = time(0);
> >   if(now - tf->last_read > tf->thresh) {
> >     rval = slurpfile(tf->name, tf->buffer, BUFFSIZE);
> >     if(rval == SYNAPSE_FAILURE) {
> >       err_msg("update_file() got an error from slurpfile() reading %s",
> >           tf->name);
> >     }
> >     else tf->last_read = now;
> >   }
> >   return tf->buffer;
> > }
> > 
> > 
> > 
> > 
> > 
> > 
> > /*
> >  * This function is called only once by the gmond.  Use to 
> >  * initialize data structures, etc or just return SYNAPSE_SUCCESS;
> >  */
> > g_val_t
> > metric_init(void)
> > {
> >    g_val_t rval;
> > 
> >    rval.int32 = slurpfile("/proc/cpuinfo", proc_cpuinfo, BUFFSIZE);
> >    if ( rval.int32 == SYNAPSE_FAILURE )
> >       {
> >          err_msg("metric_init() got an error from slurpfile() 
> > /proc/cpuinfo");
> >          return rval;
> >       }  
> > 
> >    rval.int32 = slurpfile( "/proc/sys/kernel/osrelease", 
> >                        proc_sys_kernel_osrelease, BUFFSIZE);
> >    if ( rval.int32 == SYNAPSE_FAILURE )
> >       {
> >          err_msg("kernel_func() got an error from slurpfile()");
> >          return rval;
> >       }   
> > 
> >    /* Get rid of pesky \n in osrelease */
> >    proc_sys_kernel_osrelease[rval.int32-1] = '\0';
> > 
> >    rval.int32 = SYNAPSE_SUCCESS;
> >    return rval;
> > }
> > 
> > /*
> >  * 
> >  */
> > 
> > g_val_t
> > cpu_num_func ( void )
> > {
> >    FILE *f;
> >    static int cpu_num = 0;
> >    char line[80];
> >    g_val_t val;
> > 
> >    /* Only need to do this once */
> >    if (! cpu_num)
> >       {
> >          f = fopen("/proc/stat", "r");
> >          while (fscanf(f, "%s", line) != EOF)
> >             if (strncmp(line, "cpu", 3) == 0)
> >                cpu_num++;
> >          fclose(f);
> >       }
> >    val.uint16 = cpu_num - 1;
> >    return val;
> > }
> > 
> > g_val_t
> > cpu_speed_func ( void )
> > {
> >    char *p;
> >    static g_val_t val = {0};
> > 
> >    if (! val.uint32 )
> >       {
> >          p = proc_cpuinfo;  
> >          p = strstr( p, "cpu MHz" );
> >          p = strchr( p, ':' );
> >          p++;
> >          p = skip_whitespace(p);
> >          val.uint32 = (uint32_t)strtol( p, (char **)NULL , 10 );
> >       }
> >    return val;
> > }
> > 
> > g_val_t
> > mem_total_func ( void )
> > {
> >    char *p;
> >    g_val_t val;
> > 
> >    p = strstr( update_file(&proc_meminfo), "MemTotal:");
> >    p = skip_token(p);
> >    val.uint32 = strtol( p, (char **)NULL, 10 );
> > 
> >    return val;
> > }
> > 
> > g_val_t
> > swap_total_func ( void )
> > {
> >    char *p;
> >    g_val_t val;
> >  
> >    p = strstr( update_file(&proc_meminfo), "SwapTotal:" );
> >    p = skip_token(p);
> >    val.uint32 = strtol( p, (char **)NULL, 10 );  
> > 
> >    return val;
> > }
> > 
> > g_val_t
> > boottime_func ( void )
> > {
> >    char *p;
> >    g_val_t val;
> > 
> >    p = update_file(&proc_stat); 
> > 
> >    p = strstr ( p, "btime" );
> >    p = skip_token ( p );
> >    val.uint32 = strtod ( p, (char **)NULL );
> > 
> >    return val;
> > }
> > 
> > g_val_t
> > sys_clock_func ( void )
> > {
> >    g_val_t val;
> > 
> >    val.uint32 = time(NULL);
> >    return val;
> > }
> > 
> > g_val_t
> > machine_type_func ( void )
> > {
> >    g_val_t val;
> >  
> > #ifdef IA64
> >    snprintf(val.str, MAX_G_STRING_SIZE, "ia64");
> > #endif
> > #ifdef __i386__
> >    snprintf(val.str, MAX_G_STRING_SIZE, "x86");
> > #endif
> > #ifdef __alpha__
> >    snprintf(val.str, MAX_G_STRING_SIZE, "alpha");
> > #endif
> >    return val;
> > }
> > 
> > g_val_t
> > os_name_func ( void )
> > {
> >    g_val_t val;
> > 
> >    snprintf(val.str, MAX_G_STRING_SIZE, "Linux");
> >    return val;
> > }
> > 
> > g_val_t
> > os_release_func ( void )
> > {
> >    g_val_t val;
> > 
> >    snprintf(val.str, MAX_G_STRING_SIZE, "%s", proc_sys_kernel_osrelease);
> >    return val;
> > }
> > 
> > /*
> >  * A helper function to return the total number of cpu jiffies
> >  */
> > unsigned long
> > total_jiffies_func ( void )
> > {
> >    char *p;
> >    unsigned long user_jiffies, nice_jiffies, system_jiffies, idle_jiffies;
> > 
> >    p = update_file(&proc_stat);
> >    p = skip_token(p);
> >    p = skip_whitespace(p);
> >    user_jiffies = strtod( p, &p );
> >    p = skip_whitespace(p);
> >    nice_jiffies = strtod( p, &p ); 
> >    p = skip_whitespace(p);
> >    system_jiffies = strtod( p , &p ); 
> >    p = skip_whitespace(p);
> >    idle_jiffies = strtod( p , &p );
> >   
> >    return user_jiffies + nice_jiffies + system_jiffies + idle_jiffies; 
> > }   
> > 
> > g_val_t
> > cpu_user_func ( void )
> > {
> >    char *p;
> >    g_val_t val;
> >    static double last_user_jiffies,  user_jiffies, 
> >                  last_total_jiffies, total_jiffies, diff;
> > 
> >    p = update_file(&proc_stat);
> >  
> >    p = skip_token(p);
> >    user_jiffies  = strtod( p , (char **)NULL );
> >    total_jiffies = total_jiffies_func();
> > 
> >    diff = user_jiffies - last_user_jiffies; 
> > 
> >    if ( diff )
> >       val.f = (diff/(total_jiffies - last_total_jiffies))*100;
> >    else
> >       val.f = 0.0;
> >   
> >    last_user_jiffies  = user_jiffies;
> >    last_total_jiffies = total_jiffies; 
> >    return val;
> > }
> > 
> > g_val_t
> > cpu_nice_func ( void )
> > {
> >    char *p;
> >    g_val_t val;
> >    static double last_nice_jiffies,  nice_jiffies,
> >                  last_total_jiffies, total_jiffies, diff;
> >  
> >    p = update_file(&proc_stat);
> >  
> >    p = skip_token(p);
> >    p = skip_token(p);
> >    nice_jiffies  = strtod( p , (char **)NULL );
> >    total_jiffies = total_jiffies_func();
> > 
> >    diff = (nice_jiffies  - last_nice_jiffies);
> >  
> >    if ( diff )
> >       val.f = (diff/(total_jiffies - last_total_jiffies))*100;
> >    else
> >       val.f = 0.0;
> >  
> >    last_nice_jiffies  = nice_jiffies;
> >    last_total_jiffies = total_jiffies;
> >    return val;
> > }
> > 
> > g_val_t 
> > cpu_system_func ( void )
> > {
> >    char *p;
> >    g_val_t val;
> >    static double last_system_jiffies,  system_jiffies,
> >                  last_total_jiffies, total_jiffies, diff;
> >  
> >    p = update_file(&proc_stat);
> >  
> >    p = skip_token(p);
> >    p = skip_token(p);
> >    p = skip_token(p);
> >    system_jiffies = strtod( p , (char **)NULL );
> >    total_jiffies  = total_jiffies_func();
> > 
> >    diff = system_jiffies  - last_system_jiffies;
> >  
> >    if ( diff )
> >       val.f = (diff/(total_jiffies - last_total_jiffies))*100;
> >    else
> >       val.f = 0.0;
> >  
> >    last_system_jiffies  = system_jiffies;
> >    last_total_jiffies = total_jiffies;   
> >    return val;
> > }
> > 
> > g_val_t 
> > cpu_idle_func ( void )
> > {
> >    char *p;
> >    g_val_t val;
> >    static double last_idle_jiffies,  idle_jiffies,
> >                  last_total_jiffies, total_jiffies, diff;
> >  
> >    p = update_file(&proc_stat);
> >  
> >    p = skip_token(p);
> >    p = skip_token(p);
> >    p = skip_token(p);
> >    p = skip_token(p);
> >    idle_jiffies  = strtod( p , (char **)NULL );
> >    total_jiffies = total_jiffies_func();
> > 
> >    diff = idle_jiffies - last_idle_jiffies;
> >  
> >    if ( diff ) 
> >       val.f = (diff/(total_jiffies - last_total_jiffies))*100;
> >    else
> >       val.f = 0.0;
> >  
> >    last_idle_jiffies  = idle_jiffies;
> >    last_total_jiffies = total_jiffies;
> >    return val;
> > }
> > 
> > g_val_t 
> > cpu_aidle_func ( uint32_t i )
> > {
> >    char *p;
> >    g_val_t val;
> >    double idle_jiffies, total_jiffies;
> >    
> >    p = update_file(&proc_stat);
> > 
> >    p = skip_token(p);
> >    p = skip_token(p);
> >    p = skip_token(p);
> >    p = skip_token(p);
> >    idle_jiffies  = strtod( p , (char **)NULL );
> >    total_jiffies = total_jiffies_func();
> >    
> >    val.f = (idle_jiffies/total_jiffies)*100;
> >    return val;
> > }
> > 
> > g_val_t
> > load_one_func ( void )
> > {
> >    g_val_t val;
> > 
> >    val.f = strtod( update_file(&proc_loadavg), (char **)NULL);
> > 
> >    return val;
> > }
> > 
> > g_val_t
> > load_five_func ( void )
> > {
> >    char *p;
> >    g_val_t val;
> > 
> >    p = update_file(&proc_loadavg);
> >    p = skip_token(p);
> >    val.f = strtod( p, (char **)NULL);
> >  
> >    return val;
> > }
> > 
> > g_val_t
> > load_fifteen_func ( void )
> > {
> >    char *p;
> >    g_val_t val;
> > 
> >    p = update_file(&proc_loadavg);
> >  
> >    p = skip_token(p);
> >    p = skip_token(p);
> >    val.f = strtod( p, (char **)NULL);
> > 
> >    return val;
> > }
> > 
> > g_val_t
> > proc_run_func( void )
> > {
> >    char *p;
> >    g_val_t val;
> > 
> >    p = update_file(&proc_loadavg);
> >    p = skip_token(p);
> >    p = skip_token(p);
> >    p = skip_token(p);
> >    val.uint32 = strtol( p, (char **)NULL, 10 );
> > 
> >    val.uint32--;
> >    /* This shouldn't happen.. but it might */
> >    if (val.uint32 <0)
> >       val.uint32 = 0;
> > 
> >    return val;
> > }
> > 
> > g_val_t
> > proc_total_func ( void )
> > {
> >    char *p;
> >    g_val_t val;
> > 
> >    p = update_file(&proc_loadavg);
> >    p = skip_token(p);
> >    p = skip_token(p);
> >    p = skip_token(p); 
> >    p = skip_whitespace(p);
> >    while ( isdigit(*p) )
> >       p++;
> >    p++;  /* skip the slash-/ */ 
> >    val.uint32 = strtol( p, (char **)NULL, 10 ); 
> > 
> >    return val;
> > }
> > 
> > g_val_t
> > mem_free_func ( void )
> > {
> >    char *p;
> >    g_val_t val;
> > 
> >    p = strstr( update_file(&proc_meminfo), "MemFree:" );
> >    p = skip_token(p);
> >    val.uint32 = strtol( p, (char **)NULL, 10 );
> > 
> >    return val;
> > }
> > 
> > g_val_t
> > mem_shared_func ( void )
> > {
> >    char *p;
> >    g_val_t val;
> > 
> >    p = strstr( update_file(&proc_meminfo), "MemShared:" );
> >    p = skip_token(p);
> >    val.uint32 = strtol( p, (char **)NULL, 10 );
> > 
> >    return val;
> > }
> > 
> > g_val_t
> > mem_buffers_func ( void )
> > {
> >    char *p;
> >    g_val_t val;
> > 
> >    p = strstr( update_file(&proc_meminfo), "Buffers:" );
> >    p = skip_token(p);
> >    val.uint32 = strtol( p, (char **)NULL, 10 ); 
> > 
> >    return val;
> > }
> > 
> > g_val_t
> > mem_cached_func ( void )
> > {
> >    char *p;
> >    g_val_t val;
> > 
> >    p = strstr( update_file(&proc_meminfo), "Cached:");
> >    p = skip_token(p);
> >    val.uint32 = strtol( p, (char **)NULL, 10 );
> > 
> >    return val;
> > }
> > 
> > g_val_t
> > swap_free_func ( void )
> > {
> >    char *p;
> >    g_val_t val;
> > 
> >    p = strstr( update_file(&proc_meminfo), "SwapFree:" );
> >    p = skip_token(p);
> >    val.uint32 = strtol( p, (char **)NULL, 10 ); 
> > 
> >    return val;
> > }
> > 
> > 
> 
> 
> _______________________________________________
> Ganglia-general mailing list
> Ganglia-general@lists.sourceforge.net
> https://lists.sourceforge.net/lists/listinfo/ganglia-general

Reply via email to