On 04/04/2014 02:40 AM, Andres Freund wrote:
On 2014-04-03 19:33:12 -0400, Tom Lane wrote:
Andres Freund <and...@2ndquadrant.com> writes:
On 2014-04-03 19:08:27 -0400, Tom Lane wrote:
A somewhat more relevant concern is where are we going to keep the state
data involved in all this.  Since this code is, by definition, going to be
called in critical sections, any solution involving internal pallocs is
right out.

We actually already allocate memory in XLogInsert() :(, although only in
the first XLogInsert() a backend does...

Ouch.  I wonder if we should put an Assert(not-in-critical-section)
into MemoryContextAlloc.

Good idea!

XLogInsert() is using malloc() directly, so that wouldn't detect this
case...

It's not a bad idea tho. I wonder how far the regression tests
get...

Not even through initdb. GetVirtualXIDsDelayingChkpt() is to blame.

Hmm, the checkpointer process seems to ignore the rule, and just hopes for the best. The allocation in GetVirtualXIDsDelayingChkpt() was probably just an oversight, and that call could be moved outside the critical section, but we also have this in AbsortFsyncRequests():

        /*
         * We have to PANIC if we fail to absorb all the pending requests (eg,
         * because our hashtable runs out of memory).  This is because the 
system
         * cannot run safely if we are unable to fsync what we have been told to
         * fsync.  Fortunately, the hashtable is so small that the problem is
         * quite unlikely to arise in practice.
         */
        START_CRIT_SECTION();

Perhaps we could fix that by just calling fsync() directly if the allocation fails, like we do if ForwardFsyncRequest() fails.


But if we give the checkpointer process a free pass, running the regression tests with an assertion in AllocSetAlloc catches five genuine bugs:

1. _bt_newroot
2. XLogFileInit
3. spgPageIndexMultiDelete
4. PageRepairFragmentation
5. PageIndexMultiDelete

I'll commit the attached patch to fix those shortly.

- Heikki
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index d2ca8d9..922412e 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -1995,8 +1995,10 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
 	BTPageOpaque lopaque;
 	ItemId		itemid;
 	IndexTuple	item;
-	Size		itemsz;
-	IndexTuple	new_item;
+	IndexTuple	left_item;
+	Size		left_item_sz;
+	IndexTuple	right_item;
+	Size		right_item_sz;
 	Buffer		metabuf;
 	Page		metapg;
 	BTMetaPageData *metad;
@@ -2016,6 +2018,26 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
 	metapg = BufferGetPage(metabuf);
 	metad = BTPageGetMeta(metapg);
 
+	/*
+	 * Create downlink item for left page (old root).  Since this will be the
+	 * first item in a non-leaf page, it implicitly has minus-infinity key
+	 * value, so we need not store any actual key in it.
+	 */
+	left_item_sz = sizeof(IndexTupleData);
+	left_item = (IndexTuple) palloc(left_item_sz);
+	left_item->t_info = left_item_sz;
+	ItemPointerSet(&(left_item->t_tid), lbkno, P_HIKEY);
+
+	/*
+	 * Create downlink item for right page.  The key for it is obtained from
+	 * the "high key" position in the left page.
+	 */
+	itemid = PageGetItemId(lpage, P_HIKEY);
+	right_item_sz = ItemIdGetLength(itemid);
+	item = (IndexTuple) PageGetItem(lpage, itemid);
+	right_item = CopyIndexTuple(item);
+	ItemPointerSet(&(right_item->t_tid), rbkno, P_HIKEY);
+
 	/* NO EREPORT(ERROR) from here till newroot op is logged */
 	START_CRIT_SECTION();
 
@@ -2034,16 +2056,6 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
 	metad->btm_fastlevel = rootopaque->btpo.level;
 
 	/*
-	 * Create downlink item for left page (old root).  Since this will be the
-	 * first item in a non-leaf page, it implicitly has minus-infinity key
-	 * value, so we need not store any actual key in it.
-	 */
-	itemsz = sizeof(IndexTupleData);
-	new_item = (IndexTuple) palloc(itemsz);
-	new_item->t_info = itemsz;
-	ItemPointerSet(&(new_item->t_tid), lbkno, P_HIKEY);
-
-	/*
 	 * Insert the left page pointer into the new root page.  The root page is
 	 * the rightmost page on its level so there is no "high key" in it; the
 	 * two items will go into positions P_HIKEY and P_FIRSTKEY.
@@ -2051,32 +2063,20 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
 	 * Note: we *must* insert the two items in item-number order, for the
 	 * benefit of _bt_restore_page().
 	 */
-	if (PageAddItem(rootpage, (Item) new_item, itemsz, P_HIKEY,
+	if (PageAddItem(rootpage, (Item) left_item, left_item_sz, P_HIKEY,
 					false, false) == InvalidOffsetNumber)
 		elog(PANIC, "failed to add leftkey to new root page"
 			 " while splitting block %u of index \"%s\"",
 			 BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
-	pfree(new_item);
-
-	/*
-	 * Create downlink item for right page.  The key for it is obtained from
-	 * the "high key" position in the left page.
-	 */
-	itemid = PageGetItemId(lpage, P_HIKEY);
-	itemsz = ItemIdGetLength(itemid);
-	item = (IndexTuple) PageGetItem(lpage, itemid);
-	new_item = CopyIndexTuple(item);
-	ItemPointerSet(&(new_item->t_tid), rbkno, P_HIKEY);
 
 	/*
 	 * insert the right page pointer into the new root page.
 	 */
-	if (PageAddItem(rootpage, (Item) new_item, itemsz, P_FIRSTKEY,
+	if (PageAddItem(rootpage, (Item) right_item, right_item_sz, P_FIRSTKEY,
 					false, false) == InvalidOffsetNumber)
 		elog(PANIC, "failed to add rightkey to new root page"
 			 " while splitting block %u of index \"%s\"",
 			 BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
-	pfree(new_item);
 
 	/* Clear the incomplete-split flag in the left child */
 	Assert(P_INCOMPLETE_SPLIT(lopaque));
@@ -2129,6 +2129,9 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
 	/* done with metapage */
 	_bt_relbuf(rel, metabuf);
 
+	pfree(left_item);
+	pfree(right_item);
+
 	return rootbuf;
 }
 
diff --git a/src/backend/access/spgist/spgdoinsert.c b/src/backend/access/spgist/spgdoinsert.c
index 1f5d976..48f32cd 100644
--- a/src/backend/access/spgist/spgdoinsert.c
+++ b/src/backend/access/spgist/spgdoinsert.c
@@ -122,7 +122,8 @@ cmpOffsetNumbers(const void *a, const void *b)
  *
  * NB: this is used during WAL replay, so beware of trying to make it too
  * smart.  In particular, it shouldn't use "state" except for calling
- * spgFormDeadTuple().
+ * spgFormDeadTuple().  This is also used in a critical section, so no
+ * pallocs either!
  */
 void
 spgPageIndexMultiDelete(SpGistState *state, Page page,
@@ -131,7 +132,7 @@ spgPageIndexMultiDelete(SpGistState *state, Page page,
 						BlockNumber blkno, OffsetNumber offnum)
 {
 	OffsetNumber firstItem;
-	OffsetNumber *sortednos;
+	OffsetNumber sortednos[MaxIndexTuplesPerPage];
 	SpGistDeadTuple tuple = NULL;
 	int			i;
 
@@ -145,7 +146,6 @@ spgPageIndexMultiDelete(SpGistState *state, Page page,
 	 * replacement tuples.)  However, we must not scribble on the caller's
 	 * array, so we have to make a copy.
 	 */
-	sortednos = (OffsetNumber *) palloc(sizeof(OffsetNumber) * nitems);
 	memcpy(sortednos, itemnos, sizeof(OffsetNumber) * nitems);
 	if (nitems > 1)
 		qsort(sortednos, nitems, sizeof(OffsetNumber), cmpOffsetNumbers);
@@ -173,8 +173,6 @@ spgPageIndexMultiDelete(SpGistState *state, Page page,
 		else if (tupstate == SPGIST_PLACEHOLDER)
 			SpGistPageGetOpaque(page)->nPlaceholder++;
 	}
-
-	pfree(sortednos);
 }
 
 /*
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index f9d6609..565bbd5 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -859,9 +859,8 @@ XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
 
 	if (rechdr == NULL)
 	{
-		rechdr = malloc(SizeOfXLogRecord);
-		if (rechdr == NULL)
-			elog(ERROR, "out of memory");
+		static char rechdrbuf[SizeOfXLogRecord + MAXIMUM_ALIGNOF];
+		rechdr = (XLogRecord *) MAXALIGN(&rechdrbuf);
 		MemSet(rechdr, 0, SizeOfXLogRecord);
 	}
 
@@ -3080,6 +3079,7 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
 {
 	char		path[MAXPGPATH];
 	char		tmppath[MAXPGPATH];
+	char		zbuffer_raw[BLCKSZ + MAXIMUM_ALIGNOF];
 	char	   *zbuffer;
 	XLogSegNo	installed_segno;
 	int			max_advance;
@@ -3119,14 +3119,11 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
 	unlink(tmppath);
 
 	/*
-	 * Allocate a buffer full of zeros. This is done before opening the file
-	 * so that we don't leak the file descriptor if palloc fails.
-	 *
-	 * Note: palloc zbuffer, instead of just using a local char array, to
-	 * ensure it is reasonably well-aligned; this may save a few cycles
-	 * transferring data to the kernel.
+	 * Prepare a buffer full of zeros. Ensure the buffer is aligned; this
+	 * may save a few cycles transferring data to the kernel.
 	 */
-	zbuffer = (char *) palloc0(XLOG_BLCKSZ);
+	zbuffer = (char *) MAXALIGN(zbuffer_raw);
+	memset(zbuffer, 0, BLCKSZ);
 
 	/* do not use get_sync_bit() here --- want to fsync only at end of fill */
 	fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
@@ -3167,7 +3164,6 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
 					 errmsg("could not write to file \"%s\": %m", tmppath)));
 		}
 	}
-	pfree(zbuffer);
 
 	if (pg_fsync(fd) != 0)
 	{
diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c
index ccc92da..7729fca 100644
--- a/src/backend/storage/page/bufpage.c
+++ b/src/backend/storage/page/bufpage.c
@@ -15,6 +15,7 @@
 #include "postgres.h"
 
 #include "access/htup_details.h"
+#include "access/itup.h"
 #include "access/xlog.h"
 #include "storage/checksum.h"
 #include "utils/memdebug.h"
@@ -433,8 +434,6 @@ PageRepairFragmentation(Page page)
 	Offset		pd_lower = ((PageHeader) page)->pd_lower;
 	Offset		pd_upper = ((PageHeader) page)->pd_upper;
 	Offset		pd_special = ((PageHeader) page)->pd_special;
-	itemIdSort	itemidbase,
-				itemidptr;
 	ItemId		lp;
 	int			nline,
 				nstorage,
@@ -484,10 +483,11 @@ PageRepairFragmentation(Page page)
 		((PageHeader) page)->pd_upper = pd_special;
 	}
 	else
-	{							/* nstorage != 0 */
+	{
 		/* Need to compact the page the hard way */
-		itemidbase = (itemIdSort) palloc(sizeof(itemIdSortData) * nstorage);
-		itemidptr = itemidbase;
+		itemIdSortData itemidbase[MaxHeapTuplesPerPage];
+		itemIdSort	itemidptr = itemidbase;
+
 		totallen = 0;
 		for (i = 0; i < nline; i++)
 		{
@@ -532,8 +532,6 @@ PageRepairFragmentation(Page page)
 		}
 
 		((PageHeader) page)->pd_upper = upper;
-
-		pfree(itemidbase);
 	}
 
 	/* Set hint bit for PageAddItem */
@@ -782,8 +780,8 @@ PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems)
 	Offset		pd_lower = phdr->pd_lower;
 	Offset		pd_upper = phdr->pd_upper;
 	Offset		pd_special = phdr->pd_special;
-	itemIdSort	itemidbase,
-				itemidptr;
+	itemIdSortData	itemidbase[MaxIndexTuplesPerPage];
+	itemIdSort	itemidptr;
 	ItemId		lp;
 	int			nline,
 				nused;
@@ -795,6 +793,8 @@ PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems)
 	int			nextitm;
 	OffsetNumber offnum;
 
+	Assert(nitems < MaxIndexTuplesPerPage);
+
 	/*
 	 * If there aren't very many items to delete, then retail
 	 * PageIndexTupleDelete is the best way.  Delete the items in reverse
@@ -829,7 +829,6 @@ PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems)
 	 * still validity-checking.
 	 */
 	nline = PageGetMaxOffsetNumber(page);
-	itemidbase = (itemIdSort) palloc(sizeof(itemIdSortData) * nline);
 	itemidptr = itemidbase;
 	totallen = 0;
 	nused = 0;
@@ -895,8 +894,6 @@ PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems)
 
 	phdr->pd_lower = SizeOfPageHeaderData + nused * sizeof(ItemIdData);
 	phdr->pd_upper = upper;
-
-	pfree(itemidbase);
 }
 
 
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to