Attached is a patch to display getrusage output to EXPLAIN output.
This is the patch I mentioned previously in
http://archives.postgresql.org/pgsql-hackers/2010-02/msg00684.php and
it raises the same issues we were talking about then. Should the
resource usage stats displayed be per-iteration totals for the whole
query execution for that node. I can't see dividing by nloops making
things clearer but the discrepancy is becoming more noticeable. I
wonder if there's anything we can do to make things clearer.
Incidentally, this is a first cut at the patch written in a single
sitting. I need to go through it again to make sure I didn't do
anything silly. One thing I know is broken is Windows support. I
fleshed out our getrusage stub a bit but I can't test it at all, and
the INSTR_TIME_* macros actually are defined differently on windows so
I can't use them quite so simply on struct timeval. The simplest
option is to define the struct timeval versions always even if
INSTR_TIME_* doesn't use them and this code can use them directly.
Another would be to modify the struct rusage definition on Windows so
we use the native Windows time datatype -- which would be tempting
since it would avoid the loss of precision in "only" having
microsecond precision.
postgres=# explain (analyze, resource) select * from x;
QUERY PLAN
-------------------------------------------------------------------------------------------------------------
Seq Scan on x (cost=0.00..11540.00 rows=800000 width=4) (actual
time=69.851..1287.025 rows=800000 loops=1)
Resources: sys=240.000ms user=940.000ms read=27.7MB
Total runtime: 2431.237 ms
(3 rows)
[...flush buffers and drop caches between tests...]
postgres=# explain (analyze, verbose, buffers, resource) select * from x;
QUERY PLAN
--------------------------------------------------------------------------------------------------------------------
Seq Scan on public.x (cost=0.00..11540.00 rows=800000 width=4)
(actual time=28.739..1290.786 rows=800000 loops=1)
Output: generate_series
Resources: sys=280.000ms user=890.000ms read=27.7MB minflt=65
nvcsw=9 nivcsw=633
Buffers: shared read=3540
Total runtime: 2487.629 ms
(5 rows)
--
greg
*** a/src/backend/commands/explain.c
--- b/src/backend/commands/explain.c
***************
*** 105,110 **** static void ExplainJSONLineEnding(ExplainState *es);
--- 105,111 ----
static void ExplainYAMLLineStarting(ExplainState *es);
static void escape_json(StringInfo buf, const char *str);
static void escape_yaml(StringInfo buf, const char *str);
+ static double normalize_memory(double amount, char **unit, int *precision);
***************
*** 137,142 **** ExplainQuery(ExplainStmt *stmt, const char *queryString,
--- 138,145 ----
es.costs = defGetBoolean(opt);
else if (strcmp(opt->defname, "buffers") == 0)
es.buffers = defGetBoolean(opt);
+ else if (strcmp(opt->defname, "resource") == 0)
+ es.rusage = defGetBoolean(opt);
else if (strcmp(opt->defname, "format") == 0)
{
char *p = defGetString(opt);
***************
*** 363,368 **** ExplainOnePlan(PlannedStmt *plannedstmt, ExplainState *es,
--- 366,373 ----
instrument_option |= INSTRUMENT_TIMER;
if (es->buffers)
instrument_option |= INSTRUMENT_BUFFERS;
+ if (es->rusage)
+ instrument_option |= INSTRUMENT_RUSAGE;
/*
* Use a snapshot with an updated command ID to ensure this query sees
***************
*** 1086,1091 **** ExplainNode(PlanState *planstate, List *ancestors,
--- 1091,1187 ----
break;
}
+ /* Show resource usage from getrusage */
+ if (es->rusage && es->format == EXPLAIN_FORMAT_TEXT)
+ {
+ const struct rusage *usage = &planstate->instrument->rusage;
+
+ bool has_rusage = (!INSTR_TIME_IS_ZERO(usage->ru_stime) ||
+ !INSTR_TIME_IS_ZERO(usage->ru_utime) ||
+ usage->ru_inblock > 0 ||
+ usage->ru_oublock > 0);
+ bool has_verbose_rusage = (usage->ru_minflt > 0 ||
+ usage->ru_majflt > 0 ||
+ usage->ru_nswap > 0 ||
+ usage->ru_msgsnd > 0 ||
+ usage->ru_msgrcv > 0 ||
+ usage->ru_nsignals > 0 ||
+ usage->ru_nvcsw > 0 ||
+ usage->ru_nivcsw > 0);
+
+ if (has_rusage || (es->verbose && has_verbose_rusage))
+ {
+ appendStringInfoSpaces(es->str, es->indent *2);
+ appendStringInfoString(es->str, "Resources:");
+
+ if (!INSTR_TIME_IS_ZERO(usage->ru_stime))
+ {
+ double stime = INSTR_TIME_GET_DOUBLE(usage->ru_stime);
+ appendStringInfo(es->str, " sys=%.3fms", stime * 1000);
+ }
+
+ if (!INSTR_TIME_IS_ZERO(usage->ru_utime))
+ {
+ double utime = INSTR_TIME_GET_DOUBLE(usage->ru_utime);
+ appendStringInfo(es->str, " user=%.3fms", utime * 1000);
+ }
+
+ if (usage->ru_inblock > 0)
+ {
+ double inblock;
+ char *units;
+ int prec;
+ inblock = normalize_memory((double)usage->ru_inblock * 512, &units, &prec);
+ appendStringInfo(es->str, " read=%.*f%s", prec, inblock, units);
+ }
+ if (usage->ru_oublock > 0)
+ {
+ double oublock;
+ char *units;
+ int prec;
+ oublock = normalize_memory((double)usage->ru_oublock * 512, &units, &prec);
+ appendStringInfo(es->str, " written=%.*f%s", prec, oublock, units);
+ }
+ if (es->verbose)
+ {
+ if (usage->ru_minflt > 0)
+ appendStringInfo(es->str, " minflt=%ld", usage->ru_minflt);
+ if (usage->ru_majflt > 0)
+ appendStringInfo(es->str, " majflt=%ld", usage->ru_majflt);
+ if (usage->ru_nswap > 0)
+ appendStringInfo(es->str, " nswap=%ld", usage->ru_nswap);
+ if (usage->ru_msgsnd > 0)
+ appendStringInfo(es->str, " msgsnd=%ld", usage->ru_msgsnd);
+ if (usage->ru_msgrcv > 0)
+ appendStringInfo(es->str, " msgrcv=%ld", usage->ru_msgrcv);
+ if (usage->ru_nsignals > 0)
+ appendStringInfo(es->str, " nsignals=%ld", usage->ru_nsignals);
+ if (usage->ru_nvcsw > 0)
+ appendStringInfo(es->str, " nvcsw=%ld", usage->ru_nvcsw);
+ if (usage->ru_nivcsw > 0)
+ appendStringInfo(es->str, " nivcsw=%ld", usage->ru_nivcsw);
+ }
+ appendStringInfoChar(es->str, '\n');
+ }
+ }
+ else if (es->rusage)
+ {
+ const struct rusage *usage = &planstate->instrument->rusage;
+
+ ExplainPropertyFloat("User Time", INSTR_TIME_GET_DOUBLE(usage->ru_utime), 3, es);
+ ExplainPropertyFloat("System Time", INSTR_TIME_GET_DOUBLE(usage->ru_stime), 3, es);
+ ExplainPropertyLong("Minor Page Faults", usage->ru_minflt, es);
+ ExplainPropertyLong("Major Page Faults", usage->ru_majflt, es);
+ ExplainPropertyLong("Swaps", usage->ru_nswap, es);
+ ExplainPropertyLong("Blocks Written", usage->ru_inblock, es);
+ ExplainPropertyLong("Blocks Read", usage->ru_oublock, es);
+ ExplainPropertyLong("Messages Sent", usage->ru_msgsnd, es);
+ ExplainPropertyLong("Messages Received", usage->ru_msgrcv, es);
+ ExplainPropertyLong("Signals Received", usage->ru_nsignals, es);
+ ExplainPropertyLong("Voluntary Context Switches", usage->ru_nvcsw, es);
+ ExplainPropertyLong("Involuntary Context Switches", usage->ru_nivcsw, es);
+ }
+
/* Show buffer usage */
if (es->buffers)
{
***************
*** 2173,2175 **** escape_yaml(StringInfo buf, const char *str)
--- 2269,2304 ----
{
escape_json(buf, str);
}
+
+ /*
+ * For a quantity of bytes pick a reasonable display unit for it and
+ * return the quantity in that unit. Also return the unit name and a
+ * reasonable precision via the reference parameters.
+ */
+
+ static double normalize_memory(double amount, char **unit, int *precision)
+ {
+ static char *units[] = {"bytes", "kB", "MB", "GB", "TB", "PB"};
+ char **u = units, **last = units + (sizeof(units)/sizeof(*units)-1);
+
+ while (amount > 1024.0 && u < last)
+ {
+ amount /= 1024.0;
+ u += 1;
+ }
+
+ *unit = *u;
+
+ /* if it's bytes or kB then don't print decimals since that's less
+ * than blocksize, otherwise always print 3 significant digits */
+ if (u == units || u == units+1 )
+ *precision = 0;
+ else if (amount < 10)
+ *precision = 2;
+ else if (amount < 100)
+ *precision = 1;
+ else
+ *precision = 0;
+
+ return amount;
+ }
*** a/src/backend/executor/instrument.c
--- b/src/backend/executor/instrument.c
***************
*** 21,26 **** BufferUsage pgBufferUsage;
--- 21,29 ----
static void BufferUsageAccumDiff(BufferUsage *dst,
const BufferUsage *add, const BufferUsage *sub);
+ static void ResourceUsageAccumDiff(struct rusage *dst,
+ const struct rusage *add,
+ const struct rusage *sub);
/* Allocate new instrumentation structure(s) */
Instrumentation *
***************
*** 40,45 **** InstrAlloc(int n, int instrument_options)
--- 43,56 ----
instr[i].needs_bufusage = true;
}
+ if (instrument_options & INSTRUMENT_RUSAGE)
+ {
+ int i;
+
+ for (i = 0; i < n; i++)
+ instr[i].needs_rusage = true;
+ }
+
return instr;
}
***************
*** 55,60 **** InstrStartNode(Instrumentation *instr)
--- 66,73 ----
/* initialize buffer usage per plan node */
if (instr->needs_bufusage)
instr->bufusage_start = pgBufferUsage;
+ if (instr->needs_rusage)
+ getrusage(RUSAGE_SELF, &instr->rusage_start);
}
/* Exit from a plan node */
***************
*** 82,87 **** InstrStopNode(Instrumentation *instr, double nTuples)
--- 95,118 ----
BufferUsageAccumDiff(&instr->bufusage,
&pgBufferUsage, &instr->bufusage_start);
+ if (instr->needs_rusage)
+ {
+ struct rusage rusage;
+
+ if (INSTR_TIME_IS_ZERO(instr->rusage_start.ru_utime) ||
+ INSTR_TIME_IS_ZERO(instr->rusage_start.ru_stime))
+ {
+ elog(DEBUG2, "InstrStopNode called without rusage start");
+ }
+
+ getrusage(RUSAGE_SELF, &rusage);
+ ResourceUsageAccumDiff(&instr->rusage,
+ &rusage,
+ &instr->rusage_start);
+ INSTR_TIME_SET_ZERO(instr->rusage_start.ru_utime);
+ INSTR_TIME_SET_ZERO(instr->rusage_start.ru_stime);
+ }
+
/* Is this the first tuple of this cycle? */
if (!instr->running)
{
***************
*** 134,136 **** BufferUsageAccumDiff(BufferUsage *dst,
--- 165,186 ----
dst->temp_blks_read += add->temp_blks_read - sub->temp_blks_read;
dst->temp_blks_written += add->temp_blks_written - sub->temp_blks_written;
}
+
+ static void
+ ResourceUsageAccumDiff(struct rusage *dst,
+ const struct rusage *add,
+ const struct rusage *sub)
+ {
+ INSTR_TIME_ACCUM_DIFF(dst->ru_utime, add->ru_utime, sub->ru_utime);
+ INSTR_TIME_ACCUM_DIFF(dst->ru_stime, add->ru_stime, sub->ru_stime);
+ dst->ru_minflt += add->ru_minflt - sub->ru_minflt;
+ dst->ru_majflt += add->ru_majflt - sub->ru_majflt;
+ dst->ru_nswap += add->ru_nswap - sub->ru_nswap;
+ dst->ru_inblock += add->ru_inblock - sub->ru_inblock;
+ dst->ru_oublock += add->ru_oublock - sub->ru_oublock;
+ dst->ru_msgsnd += add->ru_msgsnd - sub->ru_msgsnd;
+ dst->ru_msgrcv += add->ru_msgrcv - sub->ru_msgrcv;
+ dst->ru_nsignals += add->ru_nsignals - sub->ru_nsignals;
+ dst->ru_nvcsw += add->ru_nvcsw - sub->ru_nvcsw;
+ dst->ru_nivcsw += add->ru_nivcsw - sub->ru_nivcsw;
+ }
*** a/src/include/commands/explain.h
--- b/src/include/commands/explain.h
***************
*** 31,36 **** typedef struct ExplainState
--- 31,37 ----
bool analyze; /* print actual times */
bool costs; /* print costs */
bool buffers; /* print buffer usage */
+ bool rusage; /* print resource usage (from getrusage) */
ExplainFormat format; /* output format */
/* other states */
PlannedStmt *pstmt; /* top of plan */
*** a/src/include/executor/instrument.h
--- b/src/include/executor/instrument.h
***************
*** 15,20 ****
--- 15,22 ----
#include "portability/instr_time.h"
+ #include <sys/time.h>
+ #include <sys/resource.h>
typedef struct BufferUsage
{
***************
*** 32,37 **** typedef enum InstrumentOption
--- 34,40 ----
{
INSTRUMENT_TIMER = 1 << 0, /* needs timer */
INSTRUMENT_BUFFERS = 1 << 1, /* needs buffer usage */
+ INSTRUMENT_RUSAGE = 1 << 2, /* needs resource usage (getrusage) */
INSTRUMENT_ALL = 0x7FFFFFFF
} InstrumentOption;
***************
*** 40,56 **** typedef struct Instrumentation
--- 43,62 ----
/* Info about current plan cycle: */
bool running; /* TRUE if we've completed first tuple */
bool needs_bufusage; /* TRUE if we need buffer usage */
+ bool needs_rusage; /* TRUE if we need resource usage */
instr_time starttime; /* Start time of current iteration of node */
instr_time counter; /* Accumulated runtime for this node */
double firsttuple; /* Time for first tuple of this cycle */
double tuplecount; /* Tuples emitted so far this cycle */
BufferUsage bufusage_start; /* Buffer usage at start */
+ struct rusage rusage_start; /* Resource usage at start (from getrusage) */
/* Accumulated statistics across all completed cycles: */
double startup; /* Total startup time (in seconds) */
double total; /* Total total time (in seconds) */
double ntuples; /* Total tuples produced */
double nloops; /* # of run cycles for this node */
BufferUsage bufusage; /* Total buffer usage */
+ struct rusage rusage; /* Total resource usage (from getrusage) */
} Instrumentation;
extern PGDLLIMPORT BufferUsage pgBufferUsage;
*** a/src/include/rusagestub.h
--- b/src/include/rusagestub.h
***************
*** 27,32 **** struct rusage
--- 27,46 ----
{
struct timeval ru_utime; /* user time used */
struct timeval ru_stime; /* system time used */
+ long ru_maxrss; /* maximum resident set size */
+ long ru_ixrss; /* integral shared memory size */
+ long ru_idrss; /* integral unshared data size */
+ long ru_isrss; /* integral unshared stack size */
+ long ru_minflt; /* page reclaims */
+ long ru_majflt; /* page faults */
+ long ru_nswap; /* swaps */
+ long ru_inblock; /* block input operations */
+ long ru_oublock; /* block output operations */
+ long ru_msgsnd; /* messages sent */
+ long ru_msgrcv; /* messages received */
+ long ru_nsignals; /* signals received */
+ long ru_nvcsw; /* voluntary context switches */
+ long ru_nivcsw; /* involuntary context switches */
};
extern int getrusage(int who, struct rusage * rusage);
*** a/src/port/getrusage.c
--- b/src/port/getrusage.c
***************
*** 40,45 **** getrusage(int who, struct rusage * rusage)
--- 40,46 ----
FILETIME kerneltime;
FILETIME usertime;
ULARGE_INTEGER li;
+ IOCOUNTERS iocounters;
if (who != RUSAGE_SELF)
{
***************
*** 61,66 **** getrusage(int who, struct rusage * rusage)
--- 62,73 ----
return -1;
}
+ if (GetProcessIoCounters(GetCurrentProcess(), &iocounters) == 0)
+ {
+ _dosmaperr(GetLastError());
+ return -1;
+ }
+
/* Convert FILETIMEs (0.1 us) to struct timeval */
memcpy(&li, &kerneltime, sizeof(FILETIME));
li.QuadPart /= 10L; /* Convert to microseconds */
***************
*** 71,76 **** getrusage(int who, struct rusage * rusage)
--- 78,87 ----
li.QuadPart /= 10L; /* Convert to microseconds */
rusage->ru_utime.tv_sec = li.QuadPart / 1000000L;
rusage->ru_utime.tv_usec = li.QuadPart % 1000000L;
+
+ rusage->ru_inblock = iocounters.ReadTransferCount/512;
+ rusage->ru_oublock = iocounters.WriteTransferCount/512;
+
#else /* all but WIN32 */
struct tms tms;
--
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers