Hi, On 2/26/23 5:03 PM, Melanie Plageman wrote:
Hi,As suggested in [1], the attached patch adds IO times to pg_stat_io;
Thanks for the patch! I started to have a look at it and figured out that a tiny rebase was needed (due to 728560db7d and b9f0e54bc9), so please find the rebase (aka V2) attached.
The timings will only be non-zero when track_io_timing is on
That could lead to incorrect interpretation if one wants to divide the timing per operations, say: - track_io_timing is set to on while there is already operations - or set to off while it was on (and the number of operations keeps growing) Might be worth to warn/highlight in the "track_io_timing" doc? + if (track_io_timing) + { + INSTR_TIME_SET_CURRENT(io_time); + INSTR_TIME_SUBTRACT(io_time, io_start); + pgstat_count_io_time(io_object, io_context, IOOP_EXTEND, io_time); + } + + pgstat_count_io_op(io_object, io_context, IOOP_EXTEND); vs @@ -1042,6 +1059,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, INSTR_TIME_SUBTRACT(io_time, io_start); pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time)); INSTR_TIME_ADD(pgBufferUsage.blk_read_time, io_time); + pgstat_count_io_time(io_object, io_context, IOOP_READ, io_time); } That leads to pgstat_count_io_time() to be called before pgstat_count_io_op() (for the IOOP_EXTEND case) and after pgstat_count_io_op() (for the IOOP_READ case). What about calling them in the same order and so that pgstat_count_io_time() is called before pgstat_count_io_op()? If so, the ordering would also need to be changed in: - FlushRelationBuffers() - register_dirty_segment()
There is one minor question (in the code as a TODO) which is whether or not it is worth cross-checking that IO counts and times are either both zero or neither zero in the validation function pgstat_bktype_io_stats_valid().
As pgstat_bktype_io_stats_valid() is called only in Assert(), I think that would be a good idea to also check that if counts are not Zero then times are not Zero. Regards, -- Bertrand Drouvot PostgreSQL Contributors Team RDS Open Source Databases Amazon Web Services: https://aws.amazon.com
diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index 6249bb50d0..2c62b0a437 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -3814,6 +3814,18 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i </entry> </row> + <row> + <entry role="catalog_table_entry"> + <para role="column_definition"> + <structfield>read_time</structfield> <type>double precision</type> + </para> + <para> + Time spent in read operations in milliseconds (if <xref + linkend="guc-track-io-timing"/> is enabled, otherwise zero) + </para> + </entry> + </row> + <row> <entry role="catalog_table_entry"> <para role="column_definition"> @@ -3826,6 +3838,18 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i </entry> </row> + <row> + <entry role="catalog_table_entry"> + <para role="column_definition"> + <structfield>write_time</structfield> <type>double precision</type> + </para> + <para> + Time spent in write operations in milliseconds (if <xref + linkend="guc-track-io-timing"/> is enabled, otherwise zero) + </para> + </entry> + </row> + <row> <entry role="catalog_table_entry"> <para role="column_definition"> @@ -3838,6 +3862,18 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i </entry> </row> + <row> + <entry role="catalog_table_entry"> + <para role="column_definition"> + <structfield>extend_time</structfield> <type>double precision</type> + </para> + <para> + Time spent in extend operations in milliseconds (if <xref + linkend="guc-track-io-timing"/> is enabled, otherwise zero) + </para> + </entry> + </row> + <row> <entry role="catalog_table_entry"> <para role="column_definition"> @@ -3902,6 +3938,18 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i </entry> </row> + <row> + <entry role="catalog_table_entry"> + <para role="column_definition"> + <structfield>fsync_time</structfield> <type>double precision</type> + </para> + <para> + Time spent in fsync operations in milliseconds (if <xref + linkend="guc-track-io-timing"/> is enabled, otherwise zero) + </para> + </entry> + </row> + <row> <entry role="catalog_table_entry"> <para role="column_definition"> diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 34ca0e739f..39391bc2fc 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -1123,12 +1123,16 @@ SELECT b.io_object, b.io_context, b.reads, + b.read_time, b.writes, + b.write_time, b.extends, + b.extend_time, b.op_bytes, b.evictions, b.reuses, b.fsyncs, + b.fsync_time, b.stats_reset FROM pg_stat_get_io() b; diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 0a05577b68..bbd2af9fae 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -1000,11 +1000,26 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, if (isExtend) { + instr_time io_start, + io_time; /* new buffers are zero-filled */ MemSet((char *) bufBlock, 0, BLCKSZ); + + if (track_io_timing) + INSTR_TIME_SET_CURRENT(io_start); + else + INSTR_TIME_SET_ZERO(io_start); + /* don't set checksum for all-zero page */ smgrextend(smgr, forkNum, blockNum, bufBlock, false); + if (track_io_timing) + { + INSTR_TIME_SET_CURRENT(io_time); + INSTR_TIME_SUBTRACT(io_time, io_start); + pgstat_count_io_time(io_object, io_context, IOOP_EXTEND, io_time); + } + pgstat_count_io_op(io_object, io_context, IOOP_EXTEND); /* @@ -1042,6 +1057,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, INSTR_TIME_SUBTRACT(io_time, io_start); pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time)); INSTR_TIME_ADD(pgBufferUsage.blk_read_time, io_time); + pgstat_count_io_time(io_object, io_context, IOOP_READ, io_time); } /* check for garbage data */ @@ -2989,6 +3005,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, INSTR_TIME_SUBTRACT(io_time, io_start); pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time)); INSTR_TIME_ADD(pgBufferUsage.blk_write_time, io_time); + pgstat_count_io_time(IOOBJECT_RELATION, io_context, IOOP_WRITE, io_time); } pgBufferUsage.shared_blks_written++; @@ -3594,6 +3611,9 @@ FlushRelationBuffers(Relation rel) if (RelationUsesLocalBuffers(rel)) { + instr_time io_start, + io_time; + for (i = 0; i < NLocBuffer; i++) { uint32 buf_state; @@ -3616,6 +3636,11 @@ FlushRelationBuffers(Relation rel) PageSetChecksumInplace(localpage, bufHdr->tag.blockNum); + if (track_io_timing) + INSTR_TIME_SET_CURRENT(io_start); + else + INSTR_TIME_SET_ZERO(io_start); + smgrwrite(RelationGetSmgr(rel), BufTagGetForkNum(&bufHdr->tag), bufHdr->tag.blockNum, @@ -3627,6 +3652,13 @@ FlushRelationBuffers(Relation rel) pgstat_count_io_op(IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL, IOOP_WRITE); + if (track_io_timing) + { + INSTR_TIME_SET_CURRENT(io_time); + INSTR_TIME_SUBTRACT(io_time, io_start); + pgstat_count_io_time(IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL, IOOP_WRITE, io_time); + } + /* Pop the error context stack */ error_context_stack = errcallback.previous; } diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index 5325ddb663..b1272840bd 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -220,6 +220,8 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, */ if (buf_state & BM_DIRTY) { + instr_time io_start, + io_time; SMgrRelation oreln; Page localpage = (char *) LocalBufHdrGetBlock(bufHdr); @@ -228,6 +230,11 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, PageSetChecksumInplace(localpage, bufHdr->tag.blockNum); + if (track_io_timing) + INSTR_TIME_SET_CURRENT(io_start); + else + INSTR_TIME_SET_ZERO(io_start); + /* And write... */ smgrwrite(oreln, BufTagGetForkNum(&bufHdr->tag), @@ -239,6 +246,13 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, buf_state &= ~BM_DIRTY; pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state); + if (track_io_timing) + { + INSTR_TIME_SET_CURRENT(io_time); + INSTR_TIME_SUBTRACT(io_time, io_start); + pgstat_count_io_time(IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL, IOOP_WRITE, io_time); + } + pgstat_count_io_op(IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL, IOOP_WRITE); pgBufferUsage.local_blks_written++; } diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 352958e1fe..1e2f4bc9a9 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -1030,6 +1030,14 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */ )) { + instr_time io_start, + io_time; + + if (track_io_timing) + INSTR_TIME_SET_CURRENT(io_start); + else + INSTR_TIME_SET_ZERO(io_start); + /* * We have no way of knowing if the current IOContext is * IOCONTEXT_NORMAL or IOCONTEXT_[BULKREAD, BULKWRITE, VACUUM] at this @@ -1051,6 +1059,14 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", FilePathName(seg->mdfd_vfd)))); + + if (track_io_timing) + { + INSTR_TIME_SET_CURRENT(io_time); + INSTR_TIME_SUBTRACT(io_time, io_start); + pgstat_count_io_time(IOOBJECT_RELATION, IOCONTEXT_NORMAL, IOOP_FSYNC, io_time); + } + } } @@ -1399,6 +1415,8 @@ int mdsyncfiletag(const FileTag *ftag, char *path) { SMgrRelation reln = smgropen(ftag->rlocator, InvalidBackendId); + instr_time io_start, + io_time; File file; bool need_to_close; int result, @@ -1425,10 +1443,22 @@ mdsyncfiletag(const FileTag *ftag, char *path) need_to_close = true; } + if (track_io_timing) + INSTR_TIME_SET_CURRENT(io_start); + else + INSTR_TIME_SET_ZERO(io_start); + /* Sync the file. */ result = FileSync(file, WAIT_EVENT_DATA_FILE_SYNC); save_errno = errno; + if (track_io_timing) + { + INSTR_TIME_SET_CURRENT(io_time); + INSTR_TIME_SUBTRACT(io_time, io_start); + pgstat_count_io_time(IOOBJECT_RELATION, IOCONTEXT_NORMAL, IOOP_FSYNC, io_time); + } + if (need_to_close) FileClose(file); diff --git a/src/backend/utils/activity/pgstat_io.c b/src/backend/utils/activity/pgstat_io.c index c4199d18c8..75e34be982 100644 --- a/src/backend/utils/activity/pgstat_io.c +++ b/src/backend/utils/activity/pgstat_io.c @@ -25,17 +25,20 @@ bool have_iostats = false; /* * Check that stats have not been counted for any combination of IOObject, - * IOContext, and IOOp which are not tracked for the passed-in BackendType. The - * passed-in PgStat_BktypeIO must contain stats from the BackendType specified - * by the second parameter. Caller is responsible for locking the passed-in - * PgStat_BktypeIO, if needed. + * IOContext, and IOOp which are not tracked for the passed-in BackendType. If + * the IOOp is not counted for this combination but IO time is otherwise + * tracked for this IOOp, check that IO time has not been counted for this + * combination. + * TODO: should we check that IO counts are not 0 if IO time is not zero? + * + * The passed-in PgStat_BktypeIO must contain stats from the BackendType + * specified by the second parameter. Caller is responsible for locking the + * passed-in PgStat_BktypeIO, if needed. */ bool pgstat_bktype_io_stats_valid(PgStat_BktypeIO *backend_io, BackendType bktype) { - bool bktype_tracked = pgstat_tracks_io_bktype(bktype); - for (int io_object = 0; io_object < IOOBJECT_NUM_TYPES; io_object++) { for (int io_context = 0; io_context < IOCONTEXT_NUM_TYPES; io_context++) @@ -47,14 +50,21 @@ pgstat_bktype_io_stats_valid(PgStat_BktypeIO *backend_io, */ for (int io_op = 0; io_op < IOOP_NUM_TYPES; io_op++) { - /* No stats, so nothing to validate */ - if (backend_io->data[io_object][io_context][io_op] == 0) + /* we do track it */ + if (pgstat_tracks_io_op(bktype, io_object, io_context, io_op)) continue; - /* There are stats and there shouldn't be */ - if (!bktype_tracked || - !pgstat_tracks_io_op(bktype, io_object, io_context, io_op)) + /* we don't track it, and it is not 0 */ + if (backend_io->counts[io_object][io_context][io_op] != 0) return false; + + /* we don't track this IOOp, so make sure its IO time is zero */ + if (pgstat_tracks_io_time(io_op) > -1) + { + if (!INSTR_TIME_IS_ZERO(backend_io->times[io_object][io_context][io_op])) + return false; + } + } } } @@ -70,7 +80,21 @@ pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op) Assert(io_op < IOOP_NUM_TYPES); Assert(pgstat_tracks_io_op(MyBackendType, io_object, io_context, io_op)); - PendingIOStats.data[io_object][io_context][io_op]++; + PendingIOStats.counts[io_object][io_context][io_op]++; + + have_iostats = true; +} + +void +pgstat_count_io_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time time) +{ + Assert(io_object < IOOBJECT_NUM_TYPES); + Assert(io_context < IOCONTEXT_NUM_TYPES); + Assert(io_op < IOOP_NUM_TYPES); + Assert(pgstat_tracks_io_op(MyBackendType, io_object, io_context, io_op)); + Assert(pgstat_tracks_io_time(io_op) != -1); + + INSTR_TIME_ADD(PendingIOStats.times[io_object][io_context][io_op], time); have_iostats = true; } @@ -114,8 +138,13 @@ pgstat_flush_io(bool nowait) for (int io_context = 0; io_context < IOCONTEXT_NUM_TYPES; io_context++) { for (int io_op = 0; io_op < IOOP_NUM_TYPES; io_op++) - bktype_shstats->data[io_object][io_context][io_op] += - PendingIOStats.data[io_object][io_context][io_op]; + { + bktype_shstats->counts[io_object][io_context][io_op] += + PendingIOStats.counts[io_object][io_context][io_op]; + + INSTR_TIME_ADD(bktype_shstats->times[io_object][io_context][io_op], + PendingIOStats.times[io_object][io_context][io_op]); + } } } @@ -384,3 +413,30 @@ pgstat_tracks_io_op(BackendType bktype, IOObject io_object, return true; } + +/* + * PgStat_BktypeIO->times contains IO times for IOOps. For simplicity this + * array has a spot for every IOOp. pgstat_tracks_io_time() is the source of + * truth for which IOOps have corresponding IO times. + */ +IOOp +pgstat_tracks_io_time(IOOp io_op) +{ + switch (io_op) + { + case IOOP_READ: + return IOOP_READ; + case IOOP_WRITE: + return IOOP_WRITE; + case IOOP_EXTEND: + return IOOP_EXTEND; + case IOOP_FSYNC: + return IOOP_FSYNC; + case IOOP_EVICT: + case IOOP_REUSE: + return -1; + } + + elog(ERROR, "unrecognized IOOp value: %d", io_op); + pg_unreachable(); +} diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c index b61a12382b..5731010ade 100644 --- a/src/backend/utils/adt/pgstatfuncs.c +++ b/src/backend/utils/adt/pgstatfuncs.c @@ -1255,12 +1255,16 @@ typedef enum io_stat_col IO_COL_IO_OBJECT, IO_COL_IO_CONTEXT, IO_COL_READS, + IO_COL_READ_TIME, IO_COL_WRITES, + IO_COL_WRITE_TIME, IO_COL_EXTENDS, + IO_COL_EXTEND_TIME, IO_COL_CONVERSION, IO_COL_EVICTIONS, IO_COL_REUSES, IO_COL_FSYNCS, + IO_COL_FSYNC_TIME, IO_COL_RESET_TIME, IO_NUM_COLUMNS, } io_stat_col; @@ -1292,6 +1296,21 @@ pgstat_get_io_op_index(IOOp io_op) pg_unreachable(); } +/* + * Get the number of the column containing IO times for the specified IOOp. If + * the specified IOOp is one for which IO time is not tracked, return -1. Note + * that this function assumes that IO time for an IOOp is displayed in the view + * in the column directly after the IOOp counts. + */ +static io_stat_col +pgstat_get_io_time_index(IOOp io_op) +{ + if (pgstat_tracks_io_time(io_op) == -1) + return -1; + + return pgstat_get_io_op_index(io_op) + 1; +} + Datum pg_stat_get_io(PG_FUNCTION_ARGS) { @@ -1359,20 +1378,31 @@ pg_stat_get_io(PG_FUNCTION_ARGS) for (int io_op = 0; io_op < IOOP_NUM_TYPES; io_op++) { - int col_idx = pgstat_get_io_op_index(io_op); + int i = pgstat_get_io_op_index(io_op); /* * Some combinations of BackendType and IOOp, of IOContext * and IOOp, and of IOObject and IOOp are not tracked. Set * these cells in the view NULL. */ - nulls[col_idx] = !pgstat_tracks_io_op(bktype, io_obj, io_context, io_op); + if (pgstat_tracks_io_op(bktype, io_obj, io_context, io_op)) + values[i] = Int64GetDatum(bktype_stats->counts[io_obj][io_context][io_op]); + else + nulls[i] = true; + } + + for (int io_op = 0; io_op < IOOP_NUM_TYPES; io_op++) + { + int i = pgstat_get_io_time_index(io_op); - if (nulls[col_idx]) + if (i == -1) continue; - values[col_idx] = - Int64GetDatum(bktype_stats->data[io_obj][io_context][io_op]); + if (!nulls[pgstat_get_io_op_index(io_op)]) + values[i] = + Float8GetDatum(INSTR_TIME_GET_MILLISEC(bktype_stats->times[io_obj][io_context][io_op])); + else + nulls[i] = true; } tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 505595620e..80c0627209 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -5721,9 +5721,9 @@ proname => 'pg_stat_get_io', provolatile => 'v', prorows => '30', proretset => 't', proparallel => 'r', prorettype => 'record', proargtypes => '', - proallargtypes => '{text,text,text,int8,int8,int8,int8,int8,int8,int8,timestamptz}', - proargmodes => '{o,o,o,o,o,o,o,o,o,o,o}', - proargnames => '{backend_type,io_object,io_context,reads,writes,extends,op_bytes,evictions,reuses,fsyncs,stats_reset}', + proallargtypes => '{text,text,text,int8,float8,int8,float8,int8,float8,int8,int8,int8,int8,float8,timestamptz}', + proargmodes => '{o,o,o,o,o,o,o,o,o,o,o,o,o,o,o}', + proargnames => '{backend_type,io_object,io_context,reads,read_time,writes,write_time,extends,extend_time,op_bytes,evictions,reuses,fsyncs,fsync_time,stats_reset}', prosrc => 'pg_stat_get_io' }, { oid => '1136', descr => 'statistics: information about WAL activity', diff --git a/src/include/pgstat.h b/src/include/pgstat.h index f43fac09ed..687aff4859 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -313,7 +313,8 @@ typedef enum IOOp typedef struct PgStat_BktypeIO { - PgStat_Counter data[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES]; + PgStat_Counter counts[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES]; + instr_time times[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES]; } PgStat_BktypeIO; typedef struct PgStat_IO @@ -507,6 +508,7 @@ extern PgStat_CheckpointerStats *pgstat_fetch_stat_checkpointer(void); extern bool pgstat_bktype_io_stats_valid(PgStat_BktypeIO *context_ops, BackendType bktype); extern void pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op); +extern void pgstat_count_io_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time time); extern PgStat_IO *pgstat_fetch_stat_io(void); extern const char *pgstat_get_io_context_name(IOContext io_context); extern const char *pgstat_get_io_object_name(IOObject io_object); @@ -516,6 +518,7 @@ extern bool pgstat_tracks_io_object(BackendType bktype, IOObject io_object, IOContext io_context); extern bool pgstat_tracks_io_op(BackendType bktype, IOObject io_object, IOContext io_context, IOOp io_op); +extern IOOp pgstat_tracks_io_time(IOOp io_op); /* diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index e953d1f515..5434851314 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -1880,14 +1880,18 @@ pg_stat_io| SELECT backend_type, io_object, io_context, reads, + read_time, writes, + write_time, extends, + extend_time, op_bytes, evictions, reuses, fsyncs, + fsync_time, stats_reset - FROM pg_stat_get_io() b(backend_type, io_object, io_context, reads, writes, extends, op_bytes, evictions, reuses, fsyncs, stats_reset); + FROM pg_stat_get_io() b(backend_type, io_object, io_context, reads, read_time, writes, write_time, extends, extend_time, op_bytes, evictions, reuses, fsyncs, fsync_time, stats_reset); pg_stat_progress_analyze| SELECT s.pid, s.datid, d.datname,