Re: [HACKERS] Instrument checkpoint sync calls

Greg Smith Sun, 05 Dec 2010 13:24:11 -0800

Jeff Janes wrote:

I've attached a tiny patch to apply over yours, to deal with this and
with the case where no files are synced.

Thanks for that. That obvious error eluded me because in most of thepatch update testing I was doing (on ext3), the longest sync was alwaysabout the same length as the total sync time.

Attached patch (in correct diff form this time!) collects up allchanges. That includes elimination of a potential race condition ifsomeone changes log_checkpoints while a long sync phase is executing. Idon't know whether that can happen, and it obviously won't give accuratestats going back to the beginning of the checkpoint in that case, butit tries to defend aginst producing complete garbage if that valuechanges out from under it.

This is the first version of this patch I feel fairly good about; noopen concerns left on my side. Jeff, since you're now the de-factocredited reviewer of this one by virtue of suggesting bug fixes, couldyou take this update out for a spin too?

Combining this instrumentation patch with the backend sync one already
committed, the net result under log_min_messages=debug1is somewhat
undesirable in that I can now see the individual sync times for the
syncs done by the checkpoint writer, but I do not get times for the
syncs done by the backend (I only get informed of their existence).

On a properly working system, backend syncs shouldn't be happening. Soif you see them, I think the question shouldn't be "how long are theytaking?", it's "how do I get rid of them?" From that perspective,knowing of their existence is sufficient to suggest the necessary tuningchanges, such as dropping bgwriter_delay.

When you get into a situation where they are showing up, a whole lot ofthem can happen in a very brief period; enough that I'd actually beconcerned about the added timing overhead, something I normally don'tcare about very much. That's the main reason I didn't botherinstrumenting those too--the situations where they happen are onesalready running low on resources.

Big writes for things that can only be written out at checkpoint time,on the other hand, are unavoidable in the current design. Providingdetail on them is going to be relevant unless there's a majorrefactoring of how those happen.


--
Greg Smith   2ndQuadrant US    [email protected]   Baltimore, MD
PostgreSQL Training, Services and Support        www.2ndQuadrant.us

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index ae9ed8f..6adc139 100644
*** a/src/backend/access/transam/xlog.c
--- b/src/backend/access/transam/xlog.c
*************** LogCheckpointEnd(bool restartpoint)
*** 6955,6964 ****
  {
  	long		write_secs,
  				sync_secs,
! 				total_secs;
  	int			write_usecs,
  				sync_usecs,
! 				total_usecs;
  
  	CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
  
--- 6955,6969 ----
  {
  	long		write_secs,
  				sync_secs,
! 				total_secs,
! 				longest_secs,
! 				average_secs;
  	int			write_usecs,
  				sync_usecs,
! 				total_usecs,
! 				longest_usecs,
! 				average_usecs;
! 	double		average_sync_time;
  
  	CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
  
*************** LogCheckpointEnd(bool restartpoint)
*** 6974,6991 ****
  						CheckpointStats.ckpt_sync_end_t,
  						&sync_secs, &sync_usecs);
  
  	if (restartpoint)
  		elog(LOG, "restartpoint complete: wrote %d buffers (%.1f%%); "
! 			 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
  			 CheckpointStats.ckpt_bufs_written,
  			 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
  			 write_secs, write_usecs / 1000,
  			 sync_secs, sync_usecs / 1000,
! 			 total_secs, total_usecs / 1000);
  	else
  		elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
  			 "%d transaction log file(s) added, %d removed, %d recycled; "
! 			 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
  			 CheckpointStats.ckpt_bufs_written,
  			 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
  			 CheckpointStats.ckpt_segs_added,
--- 6979,7017 ----
  						CheckpointStats.ckpt_sync_end_t,
  						&sync_secs, &sync_usecs);
  
+ 	/*
+ 	 * Timing values returned from CheckpointStats are in milliseconds.
+ 	 * Convert to the second plus microsecond form that TimestampDifference
+ 	 * returns for homogeneous printing.
+ 	 */
+ 	longest_secs = (long) (CheckpointStats.ckpt_longest_sync / 1000);
+ 	longest_usecs = 1000 * (CheckpointStats.ckpt_longest_sync -
+ 		longest_secs * 1000);
+ 
+ 	average_sync_time = 0;
+ 	if (CheckpointStats.ckpt_sync_rels > 0) 
+ 		average_sync_time = CheckpointStats.ckpt_agg_sync_time /
+ 			CheckpointStats.ckpt_sync_rels;
+ 	average_secs = (long) (average_sync_time / 1000);
+ 	average_usecs = 1000 * (average_sync_time - average_secs * 1000);
+ 
  	if (restartpoint)
  		elog(LOG, "restartpoint complete: wrote %d buffers (%.1f%%); "
! 			 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
! 			 "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s",
  			 CheckpointStats.ckpt_bufs_written,
  			 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
  			 write_secs, write_usecs / 1000,
  			 sync_secs, sync_usecs / 1000,
! 			 total_secs, total_usecs / 1000,
! 			 CheckpointStats.ckpt_sync_rels,
! 			 longest_secs, longest_usecs / 1000,
! 			 average_secs, average_usecs / 1000);
  	else
  		elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
  			 "%d transaction log file(s) added, %d removed, %d recycled; "
! 			 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
! 			 "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s",
  			 CheckpointStats.ckpt_bufs_written,
  			 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
  			 CheckpointStats.ckpt_segs_added,
*************** LogCheckpointEnd(bool restartpoint)
*** 6993,6999 ****
  			 CheckpointStats.ckpt_segs_recycled,
  			 write_secs, write_usecs / 1000,
  			 sync_secs, sync_usecs / 1000,
! 			 total_secs, total_usecs / 1000);
  }
  
  /*
--- 7019,7028 ----
  			 CheckpointStats.ckpt_segs_recycled,
  			 write_secs, write_usecs / 1000,
  			 sync_secs, sync_usecs / 1000,
! 			 total_secs, total_usecs / 1000,
! 			 CheckpointStats.ckpt_sync_rels,
! 			 longest_secs, longest_usecs / 1000,
! 			 average_secs, average_usecs / 1000);
  }
  
  /*
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 604db21..1f602a7 100644
*** a/src/backend/storage/smgr/md.c
--- b/src/backend/storage/smgr/md.c
***************
*** 20,25 ****
--- 20,26 ----
  
  #include "catalog/catalog.h"
  #include "miscadmin.h"
+ #include "portability/instr_time.h"
  #include "postmaster/bgwriter.h"
  #include "storage/fd.h"
  #include "storage/bufmgr.h"
*************** mdsync(void)
*** 927,932 ****
--- 928,940 ----
  	PendingOperationEntry *entry;
  	int			absorb_counter;
  
+ 	/* Statistics on sync times */
+ 	int processed = 0;
+ 	instr_time sync_start, sync_end, sync_diff; 
+ 	double elapsed;
+ 	double longest = 0;
+ 	double total_elapsed = 0;
+ 
  	/*
  	 * This is only called during checkpoints, and checkpoints should only
  	 * occur in processes that have created a pendingOpsTable.
*************** mdsync(void)
*** 1069,1077 ****
--- 1077,1107 ----
  				seg = _mdfd_getseg(reln, entry->tag.forknum,
  							  entry->tag.segno * ((BlockNumber) RELSEG_SIZE),
  								   false, EXTENSION_RETURN_NULL);
+ 
+ 				if (log_checkpoints)
+ 					INSTR_TIME_SET_CURRENT(sync_start);
+ 				else
+ 					INSTR_TIME_SET_ZERO(sync_start);
+ 
  				if (seg != NULL &&
  					FileSync(seg->mdfd_vfd) >= 0)
+ 				{
+ 					if (log_checkpoints && (! INSTR_TIME_IS_ZERO(sync_start)))
+ 					{
+ 						INSTR_TIME_SET_CURRENT(sync_end);
+ 						sync_diff = sync_end;
+ 						INSTR_TIME_SUBTRACT(sync_diff, sync_start);
+ 						elapsed = (double) INSTR_TIME_GET_MICROSEC(sync_diff) / 1000.0;
+ 						if (elapsed > longest)
+ 							longest = elapsed;
+ 						total_elapsed += elapsed;
+ 						processed++;
+ 						elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%f msec", 
+ 							processed, FilePathName(seg->mdfd_vfd), elapsed);
+ 					}
+ 
  					break;		/* success; break out of retry loop */
+ 				}
  
  				/*
  				 * XXX is there any point in allowing more than one retry?
*************** mdsync(void)
*** 1113,1118 ****
--- 1143,1153 ----
  			elog(ERROR, "pendingOpsTable corrupted");
  	}							/* end loop over hashtable entries */
  
+ 	/* Return sync performance metrics for report at checkpoint end */
+ 	CheckpointStats.ckpt_sync_rels = processed;
+ 	CheckpointStats.ckpt_longest_sync = longest;
+ 	CheckpointStats.ckpt_agg_sync_time = total_elapsed;
+ 
  	/* Flag successful completion of mdsync */
  	mdsync_in_progress = false;
  }
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index fa7ae2a..d95f843 100644
*** a/src/include/access/xlog.h
--- b/src/include/access/xlog.h
*************** typedef struct CheckpointStatsData
*** 257,262 ****
--- 257,269 ----
  	int			ckpt_segs_added;	/* # of new xlog segments created */
  	int			ckpt_segs_removed;		/* # of xlog segments deleted */
  	int			ckpt_segs_recycled;		/* # of xlog segments recycled */
+ 
+ 	int			ckpt_sync_rels;		/* # of relations synced */
+ 	double		ckpt_longest_sync;	/* Longest sync for one relation */
+ 	double		ckpt_agg_sync_time;	/* The sum of all the individual sync
+ 									   times, which is not necessarily the
+ 									   same as the total elapsed time for
+ 									   the entire sync phase. */
  } CheckpointStatsData;
  
  extern CheckpointStatsData CheckpointStats;

-- 
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Re: [HACKERS] Instrument checkpoint sync calls

Reply via email to