Influenced by several sources including gexec_process.c, Steven's email
below, and top.c from procps 2.0.7, I have written a process-gathering
testbench that directly parses the /proc filesystem in linux.
Take a look (and try it, it's only one file). I make some perhaps
controversial decisions on how to report memory usage. When considering
them, keep in mind how fork() and copy-on-write work. Also, check the
/usr/src/linux/fs/proc/array.c to see how /proc/pid/statm is reported.
The processes are sorted by %CPU, and only the top 5 are reported by
default. Hopefully this test will form the base for process reporting in
gmond.
Fed.
/*
* A simple test bench for reporting the top 5 processes by CPU
* usage. Based on code from GNU's procps 2.0.7 package,
* Copyright (C) 1996 Charles L. Blake, 1998 Michael K. Johnson,
* and inherits the terms of the GNU Library General Public License.
*
* Compile with:
* gcc -I. -g ps.c -o myps
*
* Copyright 2002 Federico Sacerdoti <[EMAIL PROTECTED]>
* and SDSC.
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>    /* for strlen/strcpy/strstr (was missing) */
#include <pwd.h>
#include <sys/time.h>
#include <sys/types.h>
#include <dirent.h>
#include <sys/param.h> /* for HZ */
#include <asm/page.h> /* for PAGE_SHIFT */
#include <sys/stat.h>
#include <unistd.h>
/* useful macros */
#define bytetok(x) (((x) + 512) >> 10)
#define pagetok(x) ((x) << (PAGE_SHIFT - 10))
/* The number of processes to report. */
#define SHOWPROCS 5
/* One sampled process.  Memory sizes are read from /proc/<pid>/statm in
   pages and converted in place to KB by read_proc(). */
struct process {
char user[32]; /* owner's login name, or the numeric uid as text */
int pid;
char name[64]; /* command name from /proc/<pid>/stat, parens stripped */
float percent_cpu; /* doubles as scratch: jiffies used between samples */
unsigned long utime; /* user-mode jiffies, from /proc/<pid>/stat */
unsigned long stime; /* kernel-mode jiffies, from /proc/<pid>/stat */
unsigned long starttime; /* jiffies after boot when the process started */
float percent_mem; /* resident size as a fraction of total memory */
unsigned int text; /* Program text, bss memory size, in KB */
unsigned int data; /* Program heap+stack size, in KB */
unsigned int shared; /* Size of shared libs + IPC shm. */
unsigned int resident; /* The total number of pages resident in core mem. */
unsigned int vm; /* The total number of pages allocated by Virtual
Mem */
unsigned int major_faults; /* Page faults that required a read from disk
*/
};
/* This is junk of course, to be replaced soon */
/* Maps a uid to a user name.  Falls back to the numeric uid rendered as
   text when the uid has no passwd entry.  The returned pointer refers to
   static storage (either getpwuid()'s internal buffer or txt below): it
   is overwritten by the next call and must not be freed.  Not thread-safe. */
const char *uid_to_user(int uid)
{
    struct passwd *p;      /* no reason for this to be static */
    static char txt[64];

    p = getpwuid(uid);
    if (p)
        return p->pw_name;
    /* bounded, unlike the original sprintf() */
    snprintf(txt, sizeof(txt), "%d", uid);
    return txt;
}
/* Returns the interval between calls to this function, in seconds. */
static float
get_elapsed_time(void)
{
struct timeval t;
static struct timeval oldtime;
struct timezone timez;
float elapsed_time;
gettimeofday(&t, &timez);
elapsed_time = (t.tv_sec - oldtime.tv_sec)
+ (float) (t.tv_usec - oldtime.tv_usec) / 1000000.0;
oldtime.tv_sec = t.tv_sec;
oldtime.tv_usec = t.tv_usec;
return elapsed_time;
}
/* Returns the total system memory in KB, parsed from the "MemTotal:"
   line of /proc/meminfo, or 0 on any failure (file missing, line
   missing, or malformed value). */
static unsigned long
total_mem_kb(void)
{
    size_t len;
    FILE *meminfo;
    char buffer[2048], *p;
    unsigned long memtotal = 0;   /* BUG FIX: was read uninitialized when
                                     sscanf matched nothing */

    meminfo = fopen("/proc/meminfo", "r");
    if (!meminfo)
        return 0;
    len = fread(buffer, sizeof(char), sizeof(buffer) - 1, meminfo);
    buffer[len] = '\0';
    fclose(meminfo);

    p = strstr(buffer, "MemTotal:");
    if (!p)
        return 0;
    /* memtotal stays 0 if the value cannot be parsed */
    sscanf(p, "MemTotal: %lu ", &memtotal);
    return memtotal;
}
/* A more efficient, thread-safe pid->uid->username mapper. */
/* Maps a pid to the login name of its owner by stat()ing /proc/<pid>.
   On any error (process gone, no /proc entry) name is set to the empty
   string.  name must have room for at least 32 bytes, matching
   struct process.user.  NOTE: getpwuid() is NOT thread-safe, despite
   what the original comment claimed. */
static void
pid2name(int pid, char *name)
{
    struct stat sb;
    char buffer[32];
    struct passwd *pw;

    /* Ensure an empty string is returned on error. */
    *name = '\0';

    snprintf(buffer, sizeof(buffer), "/proc/%d", pid);
    if (stat(buffer, &sb) < 0)
        return;

    pw = getpwuid(sb.st_uid);
    if (!pw)
        snprintf(name, 32, "%d", (int) sb.st_uid);
    else
        /* BUG FIX: bounded copy; strcpy() could overflow the 32-byte
           user field for long user names. */
        snprintf(name, 32, "%s", pw->pw_name);

    /* getpwuid() returns a pointer to static storage; it must not be
       freed (this answers the original "Cannot free this?" question). */
}
/* qsort() comparator: orders processes by DESCENDING %CPU so the
   hottest process lands in procs[0]. */
int percent_cpu_sort (struct process *P, struct process *Q)
{
    if (P->percent_cpu < Q->percent_cpu) return 1;
    /* BUG FIX: the second test repeated '<', so it could never match and
       the -1 branch was unreachable; it must be '>'. */
    if (P->percent_cpu > Q->percent_cpu) return -1;
    return 0;
}
/*
 * Fills procs[index] with data for process `pid`, read from
 * /proc/<pid>/stat and /proc/<pid>/statm.
 *
 * percent_cpu is used as cross-call scratch: on entry it holds the
 * jiffy total from the previous sample, on exit the number of NEW
 * jiffies (utime+stime delta) used since then.  main() later converts
 * that delta into a real percentage.
 *
 * Memory fields are read in pages and stored converted to KB.
 */
static void read_proc(int pid, struct process *procs, int index)
{
    char filename[128];
    char line[512];
    FILE *stat, *statm;
    struct process *proc;
    unsigned long major_faults, vm, resident, shared, text;

    if (!procs) return;
    proc = &procs[index];

    snprintf(filename, sizeof(filename), "/proc/%d/stat", pid);
    stat = fopen(filename, "r");
    if (!stat) {
        printf("WARNING: process %d could not be found.", pid);
        return;
    }
    line[0] = '\0';
    fgets(line, sizeof(line), stat);
    fclose(stat);          /* BUG FIX: old code leaked this FILE* when the
                              read produced an empty line */
    if (!*line) return;

    /* Field layout per proc(5).  "%63s" bounds the name read; like the
       original, command names containing spaces are not handled. */
    major_faults = 0;
    sscanf(line, "%d (%63s %*c %*d %*d %*d %*d %*d " /* Start: pid, End: tgpid */
           "%*lu %*lu %*lu %lu %*lu "                /* Start: flags, End: cmajflt */
           "%lu %lu %*ld %*ld %*ld "                 /* Start: utime, End: priority */
           "%*ld %*lu %*ld %lu ",                    /* Start: nice, End: starttime */
           &proc->pid, proc->name,
           &major_faults,
           &proc->utime, &proc->stime,
           &proc->starttime);
    /* BUG FIX: scan %lu into an unsigned long and narrow, instead of
       passing an unsigned int* to a %lu conversion (undefined behavior). */
    proc->major_faults = (unsigned int) major_faults;

    /* Remove the trailing ')' from the name */
    if (*proc->name)
        proc->name[strlen(proc->name) - 1] = '\0';

    /* Use the percent_cpu field to hold the number of new jiffies this
       process used since last time. */
    proc->percent_cpu = proc->utime + proc->stime - proc->percent_cpu;

    /* Get virtual memory statistics */
    snprintf(filename, sizeof(filename), "/proc/%d/statm", pid);
    statm = fopen(filename, "r");
    if (!statm) {
        printf("WARNING: process %d could not be found.", pid);
        return;
    }
    line[0] = '\0';
    /* BUG FIX: the original read from `stat` here, which was already
       fclose()d above -- it must read from `statm`. */
    fgets(line, sizeof(line), statm);
    fclose(statm);
    if (!*line) return;

    /* The actual format is different than reported by the proc manpage.
       See /usr/src/linux/fs/proc/array.c.
       Size, Resident, Shared, Trs, Lrs, Drs, Dt */
    vm = resident = shared = text = 0;
    sscanf(line, "%lu %lu %lu %lu", &vm, &resident, &shared, &text);

    /* Data is defined as resident - shared. This may include text, and may
       not.  Read-write heap pages are never shared. The stack is never
       shared. */
    proc->data = pagetok(resident - shared);

    /* Convert pages to KB */
    proc->resident = pagetok(resident);
    proc->text = pagetok(text);
    proc->shared = pagetok(shared);
    proc->vm = pagetok(vm);
}
/* Data is defined as resident - shared. This may include text, and may not.
Read-write heap pages are never shared. The stack is never shared.
Fork() calls will increase the number of shared pages because of copy-on-write
semantics. A page is "shared" if its reference count > 1. This figure counts
more than
IPC shared memory pages.
*/
/*
 * Walks /proc twice with a sleep in between to measure each process's
 * CPU usage over a known interval, then prints the top SHOWPROCS
 * processes sorted by %CPU, with their memory statistics in KB.
 */
int
main(void)
{
    int pid, npids = 0;
    int i = 0;
    struct process *procs;   /* one slot per process found in the first pass */
    struct process *p;
    DIR *d;
    struct dirent *de;
    float delta;
    unsigned long totalmem;

    if (!(d = opendir("/proc"))) {
        perror("/proc");
        exit(1);
    }

    /* First pass to see how many processes we have running.  Numeric
       directory names in /proc are pids. */
    while ((de = readdir(d)) != NULL)
        if (atoi(de->d_name))
            npids++;

    procs = (struct process *) calloc(npids, sizeof(*procs));
    if (!procs) {                       /* BUG FIX: calloc was unchecked */
        fprintf(stderr, "out of memory\n");
        closedir(d);
        exit(1);
    }

    /* Second pass to populate our procs list. */
    rewinddir(d);
    get_elapsed_time();                 /* prime the interval timer */
    i = 0;
    while ((de = readdir(d)) != NULL)
        if ((pid = atoi(de->d_name)) != 0) {
            /* Ensure no extra pids have been created since we counted. */
            if (i >= npids) break;
            read_proc(pid, procs, i);
            i++;
        }

    /* Sleep a bit so we don't grab all the %cpu while measuring it.
       Also improves the quality of measurements by increasing delta. */
    usleep(800000);

    /* Third pass to calculate percent CPU. */
    rewinddir(d);
    delta = get_elapsed_time();
    i = 0;
    while ((de = readdir(d)) != NULL)
        if ((pid = atoi(de->d_name)) != 0) {
            /* Ensure no extra pids have been created since we counted. */
            if (i >= npids) break;
            read_proc(pid, procs, i);
            i++;
        }
    closedir(d);

    /* BUG FIX: hoisted out of the loop below -- the original re-opened
       and re-parsed /proc/meminfo once per process. */
    totalmem = total_mem_kb();
    printf("Total Main Memory (from /proc/meminfo): %lu KB\n", totalmem);
    printf("Time Delta for %%CPU=%.4fs\n", delta);

    /* Fill in the remaining process fields. */
    for (i = 0; i < npids; i++) {
        p = &procs[i];
        /* Remember we have stored the new ticks of this process in
           p->percent_cpu. */
        if (delta > 0)                  /* BUG FIX: guard divide-by-zero */
            p->percent_cpu = (p->percent_cpu * 100 / HZ) / delta;
        else
            p->percent_cpu = 0;
        if (p->percent_cpu > 99.9) p->percent_cpu = 99.9;
        p->percent_mem = totalmem ?     /* BUG FIX: guard divide-by-zero */
            100.00 * (p->resident / (float) totalmem) : 0;
        if (p->percent_mem > 99.9) p->percent_mem = 99.9;
        pid2name(p->pid, p->user);
    }

    /* Sort by %CPU, highest first. */
    qsort(procs, npids, sizeof(*procs),
          (int (*)(const void *, const void *)) percent_cpu_sort);

    printf("The top %d processes, sorted by %%CPU:\n", SHOWPROCS);
    for (i = 0; i < SHOWPROCS && i < npids; i++) {
        p = &procs[i];
        /* BUG FIX: the struct's memory fields are unsigned int, so print
           them with %u, not %d. */
        printf("Process %d, %s (%s): %%CPU=%.1f %%MEM=%.1f. Memory: "
               "text=%u, data=%u, shared=%u, VMtotal=%u (KB).\n",
               p->pid, p->name, p->user, p->percent_cpu, p->percent_mem,
               p->text, p->data, p->shared, p->vm);
    }

    free(procs);
    return 0;
}
On Tuesday, August 27, 2002, at 04:16 PM, Steven Wagner wrote:
I don't have code but I'm happy to dump core on the subject.
The one thing that various OSes all seem to have in common so far is
they all have /proc or a similar interface for each running process.
Although I haven't found a nice flat file sort of interface in the
(info-rich) Linux procfs, I can tell you what top does on all supported
platforms.
It walks /proc (or calls the equivalent system call, if /proc isn't
available). In Linux's case it would go through all numeric
directories in /proc and sum up each process's utime and stime jiffies,
then run through all the running processes again and divide each one by
the total to determine percentages.
I found the sample code in top 3.9beta5. I'll paste it at the end of
this message.
In any case, Solaris, Tru64 and (apparently) IRIX actually have this
information in the procinfo struct, which saves a couple walks through
that particular hash. So this implementation is Linux-specific (I'm
also leaving out the supporting functions although I think they're
mostly self-explanatory)... there's another function which actually
does the second walk but this should give you an idea.
Nice work on gmetad, BTW ... after 2.5.0 releases and it's deployed
throughout my Linux cluster I'll "return to the fold" and ditch my
hacked 0.1 version ... ;)
From $TOP_BUILD_ROOT/machine/m_linux.c:
/*
 * (Quoted reference code from top 3.9beta5, machine/m_linux.c.)
 *
 * Parses /proc/<pid>/stat into `proc`: command name, run state, combined
 * utime+stime jiffies, priority, nice, vsize and rss.
 *
 * NOTE(review): the path is built relative ("%d/stat"), so the caller
 * presumably chdir()s into /proc first -- confirm against the full top
 * source.  skip_ws/skip_token/proc_owner are helpers defined elsewhere
 * in that file.
 */
static void
read_one_proc_stat(pid_t pid, struct top_proc *proc)
{
char buffer[4096], *p;
/* grab the proc stat info in one go */
{
int fd, len;
sprintf(buffer, "%d/stat", pid);
fd = open(buffer, O_RDONLY);
/* NOTE(review): fd and len are unchecked; a vanished pid would make
   len negative and the buffer[len] store below underflow. */
len = read(fd, buffer, sizeof(buffer)-1);
close(fd);
buffer[len] = '\0';
}
proc->uid = proc_owner(pid);
/* parse out the status */
p = buffer;
p = strchr(p, '(')+1; /* skip pid */
{
/* strrchr finds the LAST ')', so command names containing ')' parse
   correctly; the copy is clamped to sizeof(proc->name)-1. */
char *q = strrchr(p, ')');
int len = q-p;
if (len >= sizeof(proc->name))
len = sizeof(proc->name)-1;
memcpy(proc->name, p, len);
proc->name[len] = 0;
p = q+1;
}
p = skip_ws(p);
/* single-letter state from the stat line, mapped to small ints */
switch (*p++)
{
case 'R': proc->state = 1; break;
case 'S': proc->state = 2; break;
case 'D': proc->state = 3; break;
case 'Z': proc->state = 4; break;
case 'T': proc->state = 5; break;
case 'W': proc->state = 6; break;
}
p = skip_token(p); /* skip ppid */
p = skip_token(p); /* skip pgrp */
p = skip_token(p); /* skip session */
p = skip_token(p); /* skip tty */
p = skip_token(p); /* skip tty pgrp */
p = skip_token(p); /* skip flags */
p = skip_token(p); /* skip min flt */
p = skip_token(p); /* skip cmin flt */
p = skip_token(p); /* skip maj flt */
p = skip_token(p); /* skip cmaj flt */
proc->time = strtoul(p, &p, 10); /* utime */
proc->time += strtoul(p, &p, 10); /* stime */
p = skip_token(p); /* skip cutime */
p = skip_token(p); /* skip cstime */
proc->pri = strtol(p, &p, 10); /* priority */
proc->nice = strtol(p, &p, 10); /* nice */
p = skip_token(p); /* skip timeout */
p = skip_token(p); /* skip it_real_val */
p = skip_token(p); /* skip start_time */
proc->size = bytetok(strtoul(p, &p, 10)); /* vsize */
proc->rss = pagetok(strtoul(p, &p, 10)); /* rss */
#if 0
/* for the record, here are the rest of the fields */
p = skip_token(p); /* skip rlim */
p = skip_token(p); /* skip start_code */
p = skip_token(p); /* skip end_code */
p = skip_token(p); /* skip start_stack */
p = skip_token(p); /* skip sp */
p = skip_token(p); /* skip pc */
p = skip_token(p); /* skip signal */
p = skip_token(p); /* skip sigblocked */
p = skip_token(p); /* skip sigignore */
p = skip_token(p); /* skip sigcatch */
p = skip_token(p); /* skip wchan */
#endif
}
Federico Sacerdoti wrote:
Hi,
I'm trying to write a test bench program that reports basic things
about the top five processes, based on percent cpu usage (in Linux).
Since it seems you are working on this problem, I thought I would ask
you for some help. You recently mentioned some vengeance was brought to
bear on the prickly problem of determining %CPU. That seems exactly
what it needs.
Do you have some code that would help shed some light on this matter?
Thanks,
Federico
On Wednesday, August 21, 2002, at 11:46 AM, Steven Wagner wrote:
Steven Wagner wrote:
matt massie wrote:
guys-
i just checked out our latest source on freebsd and solaris and it
wasn't happy. i was a little too naive about the way i stitched in
the
libdnet source. i've added the necessary files to make ganglia
happy
again on solaris, freebsd and likely other oses as well (since i
added the
complete intf and eth support files, autoconf tests etc).
i added the mtu_func to solaris.c (it was a simple cut and paste
from linux.c).
since i'm running the monitoring core on source forge's compile
farm .. i'm not able to test them as i'd like. let me know what
you guys find on FreeBSD, Solaris, et al.
Basically what it comes down to is that top (and, as far as I know,
the Linux kernel) does a weighted average between the last
calculated value and the current value, adjusting the weight
according to how much time it's been since the last update. My code
isn't doing this so it looks very boring.
[but don't hold the release up on my account]
Actually, top is sneakier than I thought. The code that I thought
was doing CPU utilization was the per-process cycle counter, not the
per-state cycle counter. The percentage calculation for CPU states
was in another (non-Solaris-specific) function, and uses Crazy Number
Magic[tm] to get the percentages it does.
Sheesh, I thought you'd just take the difference between your last
measured value and the current measured value from each state counter
and divide that by the sum of the difference from all the state
counters, multiply by a hundred, maybe shave it to a couple
significant figures and *bam* you've got the percentages for each
state...
Anyway, diff enclosed. It has some code that, as they say, "has no
direct application" at this time but it will be used for some
lightweight process work that I'm going to have to do in order to get
the proc_run metric.
And in the spirit of 'fattest disk partition,' how about 'busiest
process' stats? CPU/mem/owner + name + args? Just a thought...
-------------------------------------------------------
This sf.net email is sponsored by: OSDN - Tired of that same old
cell phone? Get a new here for FREE!
https://www.inphonic.com/r.asp?r=sourceforge1&refcode1=vs3390
_______________________________________________
Ganglia-developers mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/ganglia-developers
Federico
Rocks Cluster Group, Camp X-Ray, SDSC, San Diego
GPG Fingerprint: 3C5E 47E7 BDF8 C14E ED92 92BB BA86 B2E6 0390 8845
Federico
Rocks Cluster Group, Camp X-Ray, SDSC, San Diego
GPG Fingerprint: 3C5E 47E7 BDF8 C14E ED92 92BB BA86 B2E6 0390 8845