Jeff Janes wrote:
For the individual file sync times emitted under debug1, it would be
very handy if the file being synced was identified, for example
"relation base/16384/16523". Rather than being numbered sequentially
within a given checkpoint.

I was numbering them sequentially so that it's straightforward to graph the sync times in an external analysis tool, but the relation data is helpful too. New patch reflecting all upthread suggestions is attached. The output looks like this now at DEBUG1:

LOG:  checkpoint starting: xlog
DEBUG: checkpoint sync: number=1 file=base/16424/11645 time=11589.549000 msec
DEBUG:  checkpoint sync: number=2 file=base/16424/16438 time=16.148000 msec
DEBUG:  checkpoint sync: number=3 file=base/16424/16437 time=53.530000 msec
DEBUG:  checkpoint sync: number=4 file=base/16424/16447 time=10.214000 msec
DEBUG:  checkpoint sync: number=5 file=base/16424/11607 time=1.499000 msec
DEBUG: checkpoint sync: number=6 file=base/16424/16425_fsm time=2.921000 msec
DEBUG:  checkpoint sync: number=7 file=base/16424/16437.1 time=4.237000 msec
DEBUG: checkpoint sync: number=8 file=base/16424/16428_fsm time=1.654000 msec
DEBUG:  checkpoint sync: number=9 file=base/16424/16442 time=7.920000 msec
DEBUG: checkpoint sync: number=10 file=base/16424/16428_vm time=2.613000 msec
DEBUG:  checkpoint sync: number=11 file=base/16424/11618 time=1.468000 msec
DEBUG: checkpoint sync: number=12 file=base/16424/16437_fsm time=2.638000 msec
DEBUG:  checkpoint sync: number=13 file=base/16424/16428 time=2.883000 msec
DEBUG:  checkpoint sync: number=14 file=base/16424/16425 time=3.369000 msec
DEBUG: checkpoint sync: number=15 file=base/16424/16437_vm time=8.686000 msec DEBUG: checkpoint sync: number=16 file=base/16424/16425_vm time=5.984000 msec LOG: checkpoint complete: wrote 2074 buffers (50.6%); 0 transaction log file(s) added, 0 removed, 3 recycled; write=0.617 s, sync=11.715 s, total=22.167 s; sync files=16, longest=11.589 s, average=0.724 s

I kept the units for the DEBUG level ones in msec because that's a better scale for the common really short syncs during good behavior. But the summary info in seconds now appears at the end of the existing "checkpoint complete" message, so only one line to parse for those looking to analyze the gross checkpoint data. That looks to work well enough for finding situations like the big ext3 spikes. You can easily see one in this example by the fact that "longest=11.589 s" is almost the entirety of "sync=11.715 s". That's the really key thing there's currently no visibility into, that's made obvious with this patch.

This might be ready for some proper review now. I know there's at least one blatant bug still in here I haven't found yet, related to how the averages are computed. I saw this once:

LOG: checkpoint complete: wrote 0 buffers (0.0%); 0 transaction log file(s) added, 0 removed, 1 recycled; write=0.000 s, sync=0.000 s, total=0.001 s; sync files=0, longest=0.000 s, average=-9223372036854775808.-2147483 s

After an immediate checkpoint, so at least one path not quite right yet.

--
Greg Smith   2ndQuadrant US    g...@2ndquadrant.com   Baltimore, MD


diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index ede6ceb..6f6eb3b 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -7063,10 +7063,15 @@ LogCheckpointEnd(bool restartpoint)
 {
 	long		write_secs,
 				sync_secs,
-				total_secs;
+				total_secs,
+				longest_secs,
+				average_secs;
 	int			write_usecs,
 				sync_usecs,
-				total_usecs;
+				total_usecs,
+				longest_usecs,
+				average_usecs;
+	double		average_sync_time;
 
 	CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
 
@@ -7082,18 +7087,35 @@ LogCheckpointEnd(bool restartpoint)
 						CheckpointStats.ckpt_sync_end_t,
 						&sync_secs, &sync_usecs);
 
+	/*
+	 * Timing values returned from CheckpointStats are in milliseconds.
+	 * Convert to the second plus microsecond form that TimestampDifference
+	 * returns for homogeneous printing.
+	 */
+	longest_secs = (long) (CheckpointStats.ckpt_longest_sync / 1000);
+	longest_usecs = 1000 * (CheckpointStats.ckpt_longest_sync - longest_secs * 1000);
+
+	average_sync_time = CheckpointStats.ckpt_longest_sync / CheckpointStats.ckpt_sync_rels;
+	average_secs = (long) (average_sync_time / 1000);
+	average_usecs = 1000 * (average_sync_time - average_secs * 1000);
+
 	if (restartpoint)
 		elog(LOG, "restartpoint complete: wrote %d buffers (%.1f%%); "
-			 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
+			 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
+			 "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s",
 			 CheckpointStats.ckpt_bufs_written,
 			 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
 			 write_secs, write_usecs / 1000,
 			 sync_secs, sync_usecs / 1000,
-			 total_secs, total_usecs / 1000);
+			 total_secs, total_usecs / 1000,
+			 CheckpointStats.ckpt_sync_rels,
+			 longest_secs, longest_usecs / 1000,
+			 average_secs, average_usecs / 1000);
 	else
 		elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
 			 "%d transaction log file(s) added, %d removed, %d recycled; "
-			 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
+			 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
+			 "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s",
 			 CheckpointStats.ckpt_bufs_written,
 			 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
 			 CheckpointStats.ckpt_segs_added,
@@ -7101,7 +7123,10 @@ LogCheckpointEnd(bool restartpoint)
 			 CheckpointStats.ckpt_segs_recycled,
 			 write_secs, write_usecs / 1000,
 			 sync_secs, sync_usecs / 1000,
-			 total_secs, total_usecs / 1000);
+			 total_secs, total_usecs / 1000,
+			 CheckpointStats.ckpt_sync_rels,
+			 longest_secs, longest_usecs / 1000,
+			 average_secs, average_usecs / 1000);
 }
 
 /*
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 604db21..cadd938 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -20,6 +20,7 @@
 
 #include "catalog/catalog.h"
 #include "miscadmin.h"
+#include "portability/instr_time.h"
 #include "postmaster/bgwriter.h"
 #include "storage/fd.h"
 #include "storage/bufmgr.h"
@@ -927,6 +928,13 @@ mdsync(void)
 	PendingOperationEntry *entry;
 	int			absorb_counter;
 
+	/* Statistics on sync times */
+	int processed = 0;
+	instr_time sync_start, sync_end, sync_diff; 
+	double elapsed;
+	double longest = 0;
+	double total_elapsed = 0;
+
 	/*
 	 * This is only called during checkpoints, and checkpoints should only
 	 * occur in processes that have created a pendingOpsTable.
@@ -1069,9 +1077,29 @@ mdsync(void)
 				seg = _mdfd_getseg(reln, entry->tag.forknum,
 							  entry->tag.segno * ((BlockNumber) RELSEG_SIZE),
 								   false, EXTENSION_RETURN_NULL);
+
+				if (log_checkpoints)
+					INSTR_TIME_SET_CURRENT(sync_start);
+
 				if (seg != NULL &&
 					FileSync(seg->mdfd_vfd) >= 0)
+				{
+					if (log_checkpoints)
+					{
+						INSTR_TIME_SET_CURRENT(sync_end);
+						sync_diff = sync_end;
+						INSTR_TIME_SUBTRACT(sync_diff, sync_start);
+						elapsed = (double) INSTR_TIME_GET_MICROSEC(sync_diff) / 1000.0;
+						if (elapsed > longest)
+							longest = elapsed;
+						total_elapsed += elapsed;
+						processed++;
+						elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%f msec", 
+							processed, FilePathName(seg->mdfd_vfd), elapsed);
+					}
+
 					break;		/* success; break out of retry loop */
+					}
 
 				/*
 				 * XXX is there any point in allowing more than one retry?
@@ -1113,6 +1141,11 @@ mdsync(void)
 			elog(ERROR, "pendingOpsTable corrupted");
 	}							/* end loop over hashtable entries */
 
+	/* Return sync performance metrics for report at checkpoint end */
+	CheckpointStats.ckpt_sync_rels = processed;
+	CheckpointStats.ckpt_longest_sync = longest;
+	CheckpointStats.ckpt_agg_sync_time = total_elapsed;
+
 	/* Flag successful completion of mdsync */
 	mdsync_in_progress = false;
 }
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index fa7ae2a..ec4d9db 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -257,6 +257,13 @@ typedef struct CheckpointStatsData
 	int			ckpt_segs_added;	/* # of new xlog segments created */
 	int			ckpt_segs_removed;		/* # of xlog segments deleted */
 	int			ckpt_segs_recycled;		/* # of xlog segments recycled */
+
+	int 		ckpt_sync_rels;		/* # of relations synced */
+    double 		ckpt_longest_sync;	/* Longest sync for one relation */
+    double 		ckpt_agg_sync_time;	/* The sum of all the individual sync
+									   times, which is not necessarily the
+									   same as the total elapsed time for
+									   the entire sync phase. */
 } CheckpointStatsData;
 
 extern CheckpointStatsData CheckpointStats;
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to