Re: [HACKERS] Instrument checkpoint sync calls

Greg Smith Tue, 30 Nov 2010 08:38:43 -0800

Jeff Janes wrote:

For the individual file sync times emitted under debug1, it would be
very handy if the file being synced was identified, for example
"relation base/16384/16523". Rather than being numbered sequentially
within a given checkpoint.

I was numbering them sequentially so that it's straightforward to graphthe sync times in an external analysis tool, but the relation data ishelpful too. New patch reflecting all upthread suggestions isattached. The output looks like this now at DEBUG1:


LOG:  checkpoint starting: xlog

DEBUG: checkpoint sync: number=1 file=base/16424/11645time=11589.549000 msec

DEBUG:  checkpoint sync: number=2 file=base/16424/16438 time=16.148000 msec
DEBUG:  checkpoint sync: number=3 file=base/16424/16437 time=53.530000 msec
DEBUG:  checkpoint sync: number=4 file=base/16424/16447 time=10.214000 msec
DEBUG:  checkpoint sync: number=5 file=base/16424/11607 time=1.499000 msec

DEBUG: checkpoint sync: number=6 file=base/16424/16425_fsmtime=2.921000 msec

DEBUG:  checkpoint sync: number=7 file=base/16424/16437.1 time=4.237000 msec

DEBUG: checkpoint sync: number=8 file=base/16424/16428_fsmtime=1.654000 msec

DEBUG:  checkpoint sync: number=9 file=base/16424/16442 time=7.920000 msec

DEBUG: checkpoint sync: number=10 file=base/16424/16428_vmtime=2.613000 msec

DEBUG:  checkpoint sync: number=11 file=base/16424/11618 time=1.468000 msec

DEBUG: checkpoint sync: number=12 file=base/16424/16437_fsmtime=2.638000 msec

DEBUG:  checkpoint sync: number=13 file=base/16424/16428 time=2.883000 msec
DEBUG:  checkpoint sync: number=14 file=base/16424/16425 time=3.369000 msec

DEBUG: checkpoint sync: number=15 file=base/16424/16437_vmtime=8.686000 msecDEBUG: checkpoint sync: number=16 file=base/16424/16425_vmtime=5.984000 msecLOG: checkpoint complete: wrote 2074 buffers (50.6%); 0 transaction logfile(s) added, 0 removed, 3 recycled; write=0.617 s, sync=11.715 s,total=22.167 s; sync files=16, longest=11.589 s, average=0.724 s

I kept the units for the DEBUG level ones in msec because that's abetter scale for the common really short syncs during good behavior.But the summary info in seconds now appears at the end of the existing"checkpoint complete" message, so only one line to parse for thoselooking to analyze the gross checkpoint data. That looks to work wellenough for finding situations like the big ext3 spikes. You can easilysee one in this example by the fact that "longest=11.589 s" is almostthe entirety of "sync=11.715 s". That's the really key thing there'scurrently no visibility into, that's made obvious with this patch.

This might be ready for some proper review now. I know there's at leastone blatant bug still in here I haven't found yet, related to how theaverages are computed. I saw this once:

LOG: checkpoint complete: wrote 0 buffers (0.0%); 0 transaction logfile(s) added, 0 removed, 1 recycled; write=0.000 s, sync=0.000 s,total=0.001 s; sync files=0, longest=0.000 s,average=-9223372036854775808.-2147483 s


After an immediate checkpoint, so at least one path not quite right yet.

--
Greg Smith   2ndQuadrant US    g...@2ndquadrant.com   Baltimore, MD

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index ede6ceb..6f6eb3b 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -7063,10 +7063,15 @@ LogCheckpointEnd(bool restartpoint)
 {
 	long		write_secs,
 				sync_secs,
-				total_secs;
+				total_secs,
+				longest_secs,
+				average_secs;
 	int			write_usecs,
 				sync_usecs,
-				total_usecs;
+				total_usecs,
+				longest_usecs,
+				average_usecs;
+	double		average_sync_time;
 
 	CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
 
@@ -7082,18 +7087,35 @@ LogCheckpointEnd(bool restartpoint)
 						CheckpointStats.ckpt_sync_end_t,
 						&sync_secs, &sync_usecs);
 
+	/*
+	 * Timing values returned from CheckpointStats are in milliseconds.
+	 * Convert to the second plus microsecond form that TimestampDifference
+	 * returns for homogeneous printing.
+	 */
+	longest_secs = (long) (CheckpointStats.ckpt_longest_sync / 1000);
+	longest_usecs = 1000 * (CheckpointStats.ckpt_longest_sync - longest_secs * 1000);
+
+	average_sync_time = CheckpointStats.ckpt_longest_sync / CheckpointStats.ckpt_sync_rels;
+	average_secs = (long) (average_sync_time / 1000);
+	average_usecs = 1000 * (average_sync_time - average_secs * 1000);
+
 	if (restartpoint)
 		elog(LOG, "restartpoint complete: wrote %d buffers (%.1f%%); "
-			 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
+			 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
+			 "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s",
 			 CheckpointStats.ckpt_bufs_written,
 			 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
 			 write_secs, write_usecs / 1000,
 			 sync_secs, sync_usecs / 1000,
-			 total_secs, total_usecs / 1000);
+			 total_secs, total_usecs / 1000,
+			 CheckpointStats.ckpt_sync_rels,
+			 longest_secs, longest_usecs / 1000,
+			 average_secs, average_usecs / 1000);
 	else
 		elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
 			 "%d transaction log file(s) added, %d removed, %d recycled; "
-			 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
+			 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
+			 "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s",
 			 CheckpointStats.ckpt_bufs_written,
 			 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
 			 CheckpointStats.ckpt_segs_added,
@@ -7101,7 +7123,10 @@ LogCheckpointEnd(bool restartpoint)
 			 CheckpointStats.ckpt_segs_recycled,
 			 write_secs, write_usecs / 1000,
 			 sync_secs, sync_usecs / 1000,
-			 total_secs, total_usecs / 1000);
+			 total_secs, total_usecs / 1000,
+			 CheckpointStats.ckpt_sync_rels,
+			 longest_secs, longest_usecs / 1000,
+			 average_secs, average_usecs / 1000);
 }
 
 /*
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 604db21..cadd938 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -20,6 +20,7 @@
 
 #include "catalog/catalog.h"
 #include "miscadmin.h"
+#include "portability/instr_time.h"
 #include "postmaster/bgwriter.h"
 #include "storage/fd.h"
 #include "storage/bufmgr.h"
@@ -927,6 +928,13 @@ mdsync(void)
 	PendingOperationEntry *entry;
 	int			absorb_counter;
 
+	/* Statistics on sync times */
+	int processed = 0;
+	instr_time sync_start, sync_end, sync_diff; 
+	double elapsed;
+	double longest = 0;
+	double total_elapsed = 0;
+
 	/*
 	 * This is only called during checkpoints, and checkpoints should only
 	 * occur in processes that have created a pendingOpsTable.
@@ -1069,9 +1077,29 @@ mdsync(void)
 				seg = _mdfd_getseg(reln, entry->tag.forknum,
 							  entry->tag.segno * ((BlockNumber) RELSEG_SIZE),
 								   false, EXTENSION_RETURN_NULL);
+
+				if (log_checkpoints)
+					INSTR_TIME_SET_CURRENT(sync_start);
+
 				if (seg != NULL &&
 					FileSync(seg->mdfd_vfd) >= 0)
+				{
+					if (log_checkpoints)
+					{
+						INSTR_TIME_SET_CURRENT(sync_end);
+						sync_diff = sync_end;
+						INSTR_TIME_SUBTRACT(sync_diff, sync_start);
+						elapsed = (double) INSTR_TIME_GET_MICROSEC(sync_diff) / 1000.0;
+						if (elapsed > longest)
+							longest = elapsed;
+						total_elapsed += elapsed;
+						processed++;
+						elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%f msec", 
+							processed, FilePathName(seg->mdfd_vfd), elapsed);
+					}
+
 					break;		/* success; break out of retry loop */
+					}
 
 				/*
 				 * XXX is there any point in allowing more than one retry?
@@ -1113,6 +1141,11 @@ mdsync(void)
 			elog(ERROR, "pendingOpsTable corrupted");
 	}							/* end loop over hashtable entries */
 
+	/* Return sync performance metrics for report at checkpoint end */
+	CheckpointStats.ckpt_sync_rels = processed;
+	CheckpointStats.ckpt_longest_sync = longest;
+	CheckpointStats.ckpt_agg_sync_time = total_elapsed;
+
 	/* Flag successful completion of mdsync */
 	mdsync_in_progress = false;
 }
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index fa7ae2a..ec4d9db 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -257,6 +257,13 @@ typedef struct CheckpointStatsData
 	int			ckpt_segs_added;	/* # of new xlog segments created */
 	int			ckpt_segs_removed;		/* # of xlog segments deleted */
 	int			ckpt_segs_recycled;		/* # of xlog segments recycled */
+
+	int 		ckpt_sync_rels;		/* # of relations synced */
+    double 		ckpt_longest_sync;	/* Longest sync for one relation */
+    double 		ckpt_agg_sync_time;	/* The sum of all the individual sync
+									   times, which is not necessarily the
+									   same as the total elapsed time for
+									   the entire sync phase. */
 } CheckpointStatsData;
 
 extern CheckpointStatsData CheckpointStats;

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Re: [HACKERS] Instrument checkpoint sync calls

Reply via email to