On Mon, 2006-10-30 at 12:05 -0500, Tom Lane wrote:
> Alvaro Herrera <[EMAIL PROTECTED]> writes:
> > Ugh.  Is there another solution to this?  Say, sync the buffer so that
> > the hint bits are written to disk?
> 
> Yeah.  The original design for all this is explained by the notes for
> TruncateCLOG:
> 
>  * When this is called, we know that the database logically contains no
>  * reference to transaction IDs older than oldestXact.  However, we must
>  * not truncate the CLOG until we have performed a checkpoint, to ensure
>  * that no such references remain on disk either; else a crash just after
>  * the truncation might leave us with a problem.
> 
> The pre-8.2 coding is actually perfectly safe within a single database,
> because TruncateCLOG is only called at the end of a database-wide
> vacuum, and so the checkpoint is guaranteed to have flushed valid hint
> bits for all tuples to disk.  There is a risk in other databases though.
> I think that in the 8.2 structure the equivalent notion must be that
> VACUUM has to flush and fsync a table before it can advance the table's
> relminxid.

Ouch! We did discuss that also. Flushing the buffer cache is nasty with
very large caches, so this makes autovacuum much less friendly - and the
flush could take a seriously long time if you enforce the vacuum delay
costings.

ISTM we need to flush only if the clog would be truncated when we
update relminxid. Otherwise it is safe to update even if we crash,
since the clog will not have been truncated.

> That still leaves us with the problem of hint bits not being updated
> during WAL replay.  I think the best solution for this is for WAL replay
> to force relvacuumxid to equal relminxid (btw, these field names seem
> poorly chosen, and the comment in catalogs.sgml isn't self-explanatory...)
> rather than adopting the value shown in the WAL record.  This probably
> is best done by abandoning the generic "overwrite tuple" WAL record type
> in favor of something specific to minxid updates.  The effect would then
> be that a PITR slave would not truncate its clog beyond the freeze
> horizon until it had performed a vacuum of its own.

Sounds good. I think we do still need the TruncateCLOG patch, though, to
ensure that the truncation itself is WAL-logged and replayed? I'm posting
that now to -patches as a prototype.

-- 
  Simon Riggs             
  EnterpriseDB   http://www.enterprisedb.com

Index: src/backend/access/transam/clog.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/access/transam/clog.c,v
retrieving revision 1.40
diff -c -r1.40 clog.c
*** src/backend/access/transam/clog.c	4 Oct 2006 00:29:49 -0000	1.40
--- src/backend/access/transam/clog.c	30 Oct 2006 14:32:14 -0000
***************
*** 68,74 ****
  
  static int	ZeroCLOGPage(int pageno, bool writeXlog);
  static bool CLOGPagePrecedes(int page1, int page2);
! static void WriteZeroPageXlogRec(int pageno);
  
  
  /*
--- 68,74 ----
  
  static int	ZeroCLOGPage(int pageno, bool writeXlog);
  static bool CLOGPagePrecedes(int page1, int page2);
! static void WriteClogXlogRec(int pageno, int rectype);
  
  
  /*
***************
*** 198,204 ****
  	slotno = SimpleLruZeroPage(ClogCtl, pageno);
  
  	if (writeXlog)
! 		WriteZeroPageXlogRec(pageno);
  
  	return slotno;
  }
--- 198,204 ----
  	slotno = SimpleLruZeroPage(ClogCtl, pageno);
  
  	if (writeXlog)
! 		WriteClogXlogRec(pageno, CLOG_ZEROPAGE);
  
  	return slotno;
  }
***************
*** 338,343 ****
--- 338,345 ----
  	/* Perform a CHECKPOINT */
  	RequestCheckpoint(true, false);
  
+ 	WriteClogXlogRec(cutoffPage, CLOG_TRUNCATE);
+ 
  	/* Now we can remove the old CLOG segment(s) */
  	SimpleLruTruncate(ClogCtl, cutoffPage);
  }
***************
*** 375,389 ****
   * (Besides which, this is normally done just before entering a transaction.)
   */
  static void
! WriteZeroPageXlogRec(int pageno)
  {
  	XLogRecData rdata;
  
  	rdata.data = (char *) (&pageno);
  	rdata.len = sizeof(int);
  	rdata.buffer = InvalidBuffer;
  	rdata.next = NULL;
! 	(void) XLogInsert(RM_CLOG_ID, CLOG_ZEROPAGE | XLOG_NO_TRAN, &rdata);
  }
  
  /*
--- 377,393 ----
   * (Besides which, this is normally done just before entering a transaction.)
   */
  static void
! WriteClogXlogRec(int pageno, int rectype)
  {
  	XLogRecData rdata;
  
+     Assert(rectype == CLOG_ZEROPAGE || rectype == CLOG_TRUNCATE);
+ 
  	rdata.data = (char *) (&pageno);
  	rdata.len = sizeof(int);
  	rdata.buffer = InvalidBuffer;
  	rdata.next = NULL;
! 	(void) XLogInsert(RM_CLOG_ID, rectype | XLOG_NO_TRAN, &rdata);
  }
  
  /*
***************
*** 409,414 ****
--- 413,432 ----
  
  		LWLockRelease(CLogControlLock);
  	}
+     else if (info == CLOG_TRUNCATE)
+     {
+ 		int			pageno;
+ 
+ 		memcpy(&pageno, XLogRecGetData(record), sizeof(int));
+ 
+ 		LWLockAcquire(CLogControlLock, LW_EXCLUSIVE);
+ 
+     	SimpleLruTruncate(ClogCtl, pageno);
+ 
+ 		LWLockRelease(CLogControlLock);
+     }
+     else
+ 		elog(PANIC, "clog_redo: unknown op code %u", info);
  }
  
  void
***************
*** 423,428 ****
  		memcpy(&pageno, rec, sizeof(int));
  		appendStringInfo(buf, "zeropage: %d", pageno);
  	}
! 	else
  		appendStringInfo(buf, "UNKNOWN");
  }
--- 441,453 ----
  		memcpy(&pageno, rec, sizeof(int));
  		appendStringInfo(buf, "zeropage: %d", pageno);
  	}
!     else if (info == CLOG_TRUNCATE)
!     {
! 		int			pageno;
! 
! 		memcpy(&pageno, rec, sizeof(int));
! 		appendStringInfo(buf, "truncate prior to: %d", pageno);
!     }
!     else
  		appendStringInfo(buf, "UNKNOWN");
  }
Index: src/include/access/clog.h
===================================================================
RCS file: /projects/cvsroot/pgsql/src/include/access/clog.h,v
retrieving revision 1.17
diff -c -r1.17 clog.h
*** src/include/access/clog.h	24 Mar 2006 04:32:13 -0000	1.17
--- src/include/access/clog.h	30 Oct 2006 14:32:35 -0000
***************
*** 46,51 ****
--- 46,52 ----
  
  /* XLOG stuff */
  #define CLOG_ZEROPAGE		0x00
+ #define CLOG_TRUNCATE       0x01
  
  extern void clog_redo(XLogRecPtr lsn, XLogRecord *record);
  extern void clog_desc(StringInfo buf, uint8 xl_info, char *rec);
---------------------------(end of broadcast)---------------------------
TIP 1: if posting/reading through Usenet, please send an appropriate
       subscribe-nomail command to [EMAIL PROTECTED] so that your
       message can get through to the mailing list cleanly

Reply via email to