On Wed, Sep 7, 2011 at 8:28 AM, Andy Colson <[email protected]> wrote:
> On 08/22/2011 01:22 AM, Pavan Deolasee wrote:
>>
>
> Hi Pavan, I tried to apply your patch to git master (as of just now) and it
> failed. I assume that's what I should be checking out, right?
>
Yeah, seems like it bit-rotted. Please try the attached patch. I also
fixed a typo and added some more comments as per suggestion by Jim.
Thanks,
Pavan
--
Pavan Deolasee
EnterpriseDB http://www.enterprisedb.com
diff --git a/contrib/pageinspect/heapfuncs.c b/contrib/pageinspect/heapfuncs.c
index fa50655..2c1ab2c 100644
--- a/contrib/pageinspect/heapfuncs.c
+++ b/contrib/pageinspect/heapfuncs.c
@@ -150,6 +150,7 @@ heap_page_items(PG_FUNCTION_ARGS)
* many other ways, but at least we won't crash.
*/
if (ItemIdHasStorage(id) &&
+ !ItemIdIsDead(id) &&
lp_len >= sizeof(HeapTupleHeader) &&
lp_offset == MAXALIGN(lp_offset) &&
lp_offset + lp_len <= raw_page_size)
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 06db65d..cf65c05 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -3984,7 +3984,8 @@ log_heap_clean(Relation reln, Buffer buffer,
OffsetNumber *redirected, int nredirected,
OffsetNumber *nowdead, int ndead,
OffsetNumber *nowunused, int nunused,
- TransactionId latestRemovedXid)
+ TransactionId latestRemovedXid,
+ uint32 vacgen)
{
xl_heap_clean xlrec;
uint8 info;
@@ -3999,6 +4000,7 @@ log_heap_clean(Relation reln, Buffer buffer,
xlrec.latestRemovedXid = latestRemovedXid;
xlrec.nredirected = nredirected;
xlrec.ndead = ndead;
+ xlrec.vacgen = vacgen;
rdata[0].data = (char *) &xlrec;
rdata[0].len = SizeOfHeapClean;
@@ -4300,6 +4302,7 @@ heap_xlog_clean(XLogRecPtr lsn, XLogRecord *record)
int ndead;
int nunused;
Size freespace;
+ uint32 vacgen;
/*
* We're about to remove tuples. In Hot Standby mode, ensure that there's
@@ -4332,6 +4335,7 @@ heap_xlog_clean(XLogRecPtr lsn, XLogRecord *record)
nredirected = xlrec->nredirected;
ndead = xlrec->ndead;
+ vacgen = xlrec->vacgen;
end = (OffsetNumber *) ((char *) xlrec + record->xl_len);
redirected = (OffsetNumber *) ((char *) xlrec + SizeOfHeapClean);
nowdead = redirected + (nredirected * 2);
@@ -4343,7 +4347,8 @@ heap_xlog_clean(XLogRecPtr lsn, XLogRecord *record)
heap_page_prune_execute(buffer,
redirected, nredirected,
nowdead, ndead,
- nowunused, nunused);
+ nowunused, nunused,
+ vacgen);
freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */
diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c
index 61f2ce4..ee64758 100644
--- a/src/backend/access/heap/pruneheap.c
+++ b/src/backend/access/heap/pruneheap.c
@@ -29,9 +29,12 @@ typedef struct
TransactionId new_prune_xid; /* new prune hint value for page */
TransactionId latestRemovedXid; /* latest xid to be removed by this
* prune */
+ int already_dead; /* number of already dead line pointers */
+
int nredirected; /* numbers of entries in arrays below */
int ndead;
int nunused;
+
/* arrays that accumulate indexes of items to be changed */
OffsetNumber redirected[MaxHeapTuplesPerPage * 2];
OffsetNumber nowdead[MaxHeapTuplesPerPage];
@@ -123,8 +126,8 @@ heap_page_prune_opt(Relation relation, Buffer buffer, TransactionId OldestXmin)
TransactionId ignore = InvalidTransactionId; /* return value not
* needed */
- /* OK to prune */
- (void) heap_page_prune(relation, buffer, OldestXmin, true, &ignore);
+ /* OK to prune - pass invalid vacuum generation number */
+ (void) heap_page_prune(relation, buffer, OldestXmin, true, &ignore, 0);
}
/* And release buffer lock */
@@ -151,13 +154,15 @@ heap_page_prune_opt(Relation relation, Buffer buffer, TransactionId OldestXmin)
*/
int
heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin,
- bool report_stats, TransactionId *latestRemovedXid)
+ bool report_stats, TransactionId *latestRemovedXid,
+ uint32 current_vacgen)
{
int ndeleted = 0;
Page page = BufferGetPage(buffer);
OffsetNumber offnum,
maxoff;
PruneState prstate;
+ uint32 last_finished_vacgen = RelationGetLastVacGen(relation);
/*
* Our strategy is to scan the page and make lists of items to change,
@@ -173,6 +178,7 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin,
prstate.new_prune_xid = InvalidTransactionId;
prstate.latestRemovedXid = InvalidTransactionId;
prstate.nredirected = prstate.ndead = prstate.nunused = 0;
+ prstate.already_dead = 0;
memset(prstate.marked, 0, sizeof(prstate.marked));
/* Scan the page */
@@ -189,8 +195,26 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin,
/* Nothing to do if slot is empty or already dead */
itemid = PageGetItemId(page, offnum);
- if (!ItemIdIsUsed(itemid) || ItemIdIsDead(itemid))
+ if (!ItemIdIsUsed(itemid))
continue;
+
+ /*
+ * If the slot is dead-vacuumed and we know that the index pointers
+ * have already been vacuumed by the last index vacuum, just mark them
+ * unused so that they are removed when we defrag the page
+ */
+ if (ItemIdIsDeadVacuumed(itemid))
+ {
+ if (ItemIdGetVacGen(itemid) == last_finished_vacgen)
+ heap_prune_record_unused(&prstate, offnum);
+ continue;
+ }
+ else if (ItemIdIsDead(itemid))
+ {
+ heap_prune_record_dead(&prstate, offnum);
+ prstate.already_dead++;
+ continue;
+ }
/* Process this item or chain of items */
ndeleted += heap_prune_chain(relation, buffer, offnum,
@@ -211,7 +235,8 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin,
heap_page_prune_execute(buffer,
prstate.redirected, prstate.nredirected,
prstate.nowdead, prstate.ndead,
- prstate.nowunused, prstate.nunused);
+ prstate.nowunused, prstate.nunused,
+ current_vacgen);
/*
* Update the page's pd_prune_xid field to either zero, or the lowest
@@ -239,7 +264,8 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin,
prstate.redirected, prstate.nredirected,
prstate.nowdead, prstate.ndead,
prstate.nowunused, prstate.nunused,
- prstate.latestRemovedXid);
+ prstate.latestRemovedXid,
+ current_vacgen);
PageSetLSN(BufferGetPage(buffer), recptr);
PageSetTLI(BufferGetPage(buffer), ThisTimeLineID);
@@ -271,9 +297,12 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin,
* If requested, report the number of tuples reclaimed to pgstats. This is
* ndeleted minus ndead, because we don't want to count a now-DEAD root
* item as a deletion for this purpose.
+ *
+ * Adjust already_dead since they are counted as ndead and we really don't
+ * want to include them here
*/
- if (report_stats && ndeleted > prstate.ndead)
- pgstat_update_heap_dead_tuples(relation, ndeleted - prstate.ndead);
+ if (report_stats && ndeleted > (prstate.ndead - prstate.already_dead))
+ pgstat_update_heap_dead_tuples(relation, ndeleted - (prstate.ndead - prstate.already_dead));
*latestRemovedXid = prstate.latestRemovedXid;
@@ -643,7 +672,8 @@ void
heap_page_prune_execute(Buffer buffer,
OffsetNumber *redirected, int nredirected,
OffsetNumber *nowdead, int ndead,
- OffsetNumber *nowunused, int nunused)
+ OffsetNumber *nowunused, int nunused,
+ uint32 vacgen)
{
Page page = (Page) BufferGetPage(buffer);
OffsetNumber *offnum;
@@ -667,7 +697,17 @@ heap_page_prune_execute(Buffer buffer,
OffsetNumber off = *offnum++;
ItemId lp = PageGetItemId(page, off);
- ItemIdSetDead(lp);
+ /*
+ * If we are called from a vacuum (vacgen > 0), mark the line pointers
+ * as dead-vacuumed and also store the current vacuum generation number
+ * in the line pointer. OTOH if we are called from a normal HOT-prune
+ * routine, mark the line pointers as DEAD since the index pointers to
+ * them will not be removed just yet.
+ */
+ if (vacgen)
+ ItemIdSetDeadVacuumed(lp, vacgen);
+ else
+ ItemIdSetDead(lp);
}
/* Update all now-unused line pointers */
diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c
index 2aaf775..d640680 100644
--- a/src/backend/catalog/heap.c
+++ b/src/backend/catalog/heap.c
@@ -786,6 +786,8 @@ InsertPgClassTuple(Relation pg_class_desc,
values[Anum_pg_class_relhastriggers - 1] = BoolGetDatum(rd_rel->relhastriggers);
values[Anum_pg_class_relhassubclass - 1] = BoolGetDatum(rd_rel->relhassubclass);
values[Anum_pg_class_relfrozenxid - 1] = TransactionIdGetDatum(rd_rel->relfrozenxid);
+ values[Anum_pg_class_relnextvacgen - 1] = Int32GetDatum(rd_rel->relnextvacgen);
+ values[Anum_pg_class_rellastvacgen - 1] = Int32GetDatum(rd_rel->rellastvacgen);
if (relacl != (Datum) 0)
values[Anum_pg_class_relacl - 1] = relacl;
else
@@ -880,6 +882,9 @@ AddNewRelationTuple(Relation pg_class_desc,
new_rel_reltup->relfrozenxid = InvalidTransactionId;
}
+ new_rel_reltup->relnextvacgen = 1;
+ new_rel_reltup->rellastvacgen = 0;
+
new_rel_reltup->relowner = relowner;
new_rel_reltup->reltype = new_type_oid;
new_rel_reltup->reloftype = reloftype;
diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index 6b0a4e7..c074524 100644
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -1073,11 +1073,20 @@ acquire_sample_rows(Relation onerel, HeapTuple *rows, int targrows,
* pointers should be counted as dead, because we need vacuum to
* run to get rid of them. Note that this rule agrees with the
* way that heap_page_prune() counts things.
+ *
+ * XXX We don't count dead line pointers if we know that they can be
+ * removed by a HOT cleanup.
*/
if (!ItemIdIsNormal(itemid))
{
- if (ItemIdIsDead(itemid))
- deadrows += 1;
+ if (ItemIdIsDeadVacuumed(itemid))
+ {
+ if (ItemIdGetVacGen(itemid) != RelationGetLastVacGen(onerel))
+ deadrows += 1;
+ }
+ else if (ItemIdIsDead(itemid))
+ deadrows++;
+
continue;
}
diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c
index 7fe787e..d3c92c9 100644
--- a/src/backend/commands/vacuum.c
+++ b/src/backend/commands/vacuum.c
@@ -645,6 +645,88 @@ vac_update_relstats(Relation relation,
heap_close(rd, RowExclusiveLock);
}
+/*
+ * Grab the next vacuum generation number to be used to stamp the dead-vacuumed
+ * line pointers and also increment the generation number.
+ */
+uint32
+vac_update_nextvacgen(Relation relation)
+{
+ Oid relid = RelationGetRelid(relation);
+ Relation rd;
+ HeapTuple ctup;
+ Form_pg_class pgcform;
+ uint32 nextvacgen;
+
+ rd = heap_open(RelationRelationId, RowExclusiveLock);
+
+ /* Fetch a copy of the tuple to scribble on */
+ ctup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(relid));
+ if (!HeapTupleIsValid(ctup))
+ elog(ERROR, "pg_class entry for relid %u vanished during vacuuming",
+ relid);
+ pgcform = (Form_pg_class) GETSTRUCT(ctup);
+
+ /* Remember the next vacuum generation number before incrementing it */
+ nextvacgen = pgcform->relnextvacgen;
+
+ /*
+ * Increment while taking care of wrap-around (without using zero)
+ *
+ * Note: We don't worry about the wrap-around issues here since it would
+ * take 1 billion vacuums on the same relation for the vacuum generation
+ * to wrap-around. That would take ages to happen and even if it happens,
+ * the chances that we might have dead-vacuumed line pointers still
+ * stamped with the old (failed) vacuum are infinitely small since some
+ * other vacuum cycle would have taken care of them.
+ */
+ pgcform->relnextvacgen = pgcform->relnextvacgen + 1;
+ if (pgcform->relnextvacgen == 0x40000000)
+ pgcform->relnextvacgen = 1;
+
+ heap_inplace_update(rd, ctup);
+
+ heap_close(rd, RowExclusiveLock);
+
+ /*
+ * Increase command counter since we want to see the updated row when we
+ * again come back to set the rellastvacgen when the vacuum completes and
+ * we don't want to forget what we just did above
+ */
+ CommandCounterIncrement();
+
+ return nextvacgen;
+}
+
+/*
+ * Update the generation number of the last successful index vacuum.
+ */
+void
+vac_update_lastvacgen(Relation relation, uint32 vacgen)
+{
+ Oid relid = RelationGetRelid(relation);
+ Relation rd;
+ HeapTuple ctup;
+ Form_pg_class pgcform;
+
+ rd = heap_open(RelationRelationId, RowExclusiveLock);
+
+ /* Fetch a copy of the tuple to scribble on */
+ ctup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(relid));
+ if (!HeapTupleIsValid(ctup))
+ elog(ERROR, "pg_class entry for relid %u vanished during vacuuming",
+ relid);
+ pgcform = (Form_pg_class) GETSTRUCT(ctup);
+
+ /* Store the 30 LSB to match with what we store in the line pointers */
+ pgcform->rellastvacgen = (vacgen & 0x3fffffff);
+
+ heap_inplace_update(rd, ctup);
+
+ heap_close(rd, RowExclusiveLock);
+
+ CommandCounterIncrement();
+}
/*
* vac_update_datfrozenxid() -- update pg_database.datfrozenxid for our DB
diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c
index a2420a8..74558df 100644
--- a/src/backend/commands/vacuumlazy.c
+++ b/src/backend/commands/vacuumlazy.c
@@ -100,6 +100,7 @@ typedef struct LVRelStats
ItemPointer dead_tuples; /* array of ItemPointerData */
int num_index_scans;
TransactionId latestRemovedXid;
+ uint32 lastvacgen;
} LVRelStats;
@@ -115,15 +116,12 @@ static BufferAccessStrategy vac_strategy;
/* non-export function prototypes */
static void lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
Relation *Irel, int nindexes, bool scan_all);
-static void lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats);
static void lazy_vacuum_index(Relation indrel,
IndexBulkDeleteResult **stats,
LVRelStats *vacrelstats);
static void lazy_cleanup_index(Relation indrel,
IndexBulkDeleteResult *stats,
LVRelStats *vacrelstats);
-static int lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
- int tupindex, LVRelStats *vacrelstats);
static void lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats);
static BlockNumber count_nondeletable_pages(Relation onerel,
LVRelStats *vacrelstats);
@@ -211,6 +209,10 @@ lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt,
/* Vacuum the Free Space Map */
FreeSpaceMapVacuum(onerel);
+ /* Since vacuum ran to completion, remember the vacuum generation number */
+ if (vacrelstats->lastvacgen != 0)
+ vac_update_lastvacgen(onerel, vacrelstats->lastvacgen);
+
/*
* Update statistics in pg_class.
*
@@ -312,6 +314,41 @@ vacuum_log_cleanup_info(Relation rel, LVRelStats *vacrelstats)
*
* If there are no indexes then we just vacuum each dirty page as we
* process it, since there's no point in gathering many tuples.
+ *
+ * Starting 9.2, we removed the second heap pass of vacuum and instead
+ * leave the dead line pointers in the heap to be removed by the next
+ * vacuum cycle or a HOT-prune operation. We can do this without much
+ * performance penalty because almost all the dead space is reclaimed in
+ * the first pass itself (except that which is taken by the dead line
+ * pointers and there is no guarantee that will be freed by the second
+ * pass anyways). But this gives us two significant benefits:
+ *
+ * 1. We don't have to scan the heap again. Even though visibility map
+ * lets us scan only the necessary pages, in many cases this would still
+ * be a large part of the relation
+ *
+ * 2. We don't have to write the heap pages (and associated WAL) twice.
+ * Since vacuum uses ring-buffers for heap scan, this would actually mean
+ * disk IO unless the relation is very small.
+ *
+ * The way we do this is by tracking the last successful vacuum by its
+ * generation number in the pg_class row. When a dead line pointer is
+ * collected by a vacuum, we store the generation number of the vacuum in
+ * the line pointer itself (lp_off/lp_len is not used for DEAD heap line
+ * pointer and that gives us 30-bits of unused space to store the
+ * information). Later on, either as part of the HOT-prune or the next
+ * vacuum on the table, we check if the vacuum generation number stored in
+ * a dead-vacuumed line pointer is the same as the last successful vacuum on
+ * the table and remove those dead-vacuumed line pointers. We are sure at
+ * that point that the index pointers to those dead-vacuumed line pointers
+ * must have been already removed.
+ *
+ * If the vacuum operation that generated the dead-vacuumed line pointer
+ * aborts in the middle, the subsequent vacuum will again scan these line
+ * pointers and stamp them with its generation number. Finally, when the
+ * vacuum finishes successfully and this information is recorded in the
+ * pg_class row, the dead-vacuumed line pointers are cleaned up from the
+ * heap.
*/
static void
lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
@@ -333,6 +370,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
Buffer vmbuffer = InvalidBuffer;
BlockNumber next_not_all_visible_block;
bool skipping_all_visible_blocks;
+ int current_vacgen;
pg_rusage_init(&ru0);
@@ -345,6 +383,23 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
empty_pages = vacuumed_pages = 0;
num_tuples = tups_vacuumed = nkeep = nunused = 0;
+ /*
+ * Before starting the vacuum, grab the next vacuum generation number for
+ * this relation. Whenever a block is scanned and dead line pointers are
+ * collected, we store the vacuum generation number in the line pointer
+ * offset (since lp_off is not useful for dead heap line pointers).
+ *
+ * We also update the relnextvacgen to guard against the case when this
+ * vacuum aborts after scanning few pages. If we don't increment the
+ * relnextvacgen now, the next vacuum may use the same generation number
+ * and if it skips the pages scanned by this vacuum (though not possible
+ * currently because the way visibility map is handled), we might get into
+ * a situation where the index pointers of some dead-vacuumed line pointers
+ * are not yet removed, but the vacuum generation number stored in those
+ * line pointers is same as the last successful vacuum on the table.
+ */
+ current_vacgen = vac_update_nextvacgen(onerel);
+
indstats = (IndexBulkDeleteResult **)
palloc0(nindexes * sizeof(IndexBulkDeleteResult *));
@@ -458,8 +513,6 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
lazy_vacuum_index(Irel[i],
&indstats[i],
vacrelstats);
- /* Remove tuples from heap */
- lazy_vacuum_heap(onerel, vacrelstats);
/*
* Forget the now-vacuumed tuples, and press on, but be careful
@@ -555,7 +608,8 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
* We count tuples removed by the pruning step as removed by VACUUM.
*/
tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin, false,
- &vacrelstats->latestRemovedXid);
+ &vacrelstats->latestRemovedXid,
+ current_vacgen);
/*
* Now scan the page to collect vacuumable items and check for tuples
@@ -739,24 +793,13 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
}
}
+ vacuumed_pages++;
+
/*
- * If there are no indexes then we can vacuum the page right now
- * instead of doing a second scan.
+ * If there are no indexes, we don't need to remember the dead tuples
*/
- if (nindexes == 0 &&
- vacrelstats->num_dead_tuples > 0)
- {
- /* Remove tuples from heap */
- lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats);
-
- /*
- * Forget the now-vacuumed tuples, and press on, but be careful
- * not to reset latestRemovedXid since we want that value to be
- * valid.
- */
+ if (nindexes == 0)
vacrelstats->num_dead_tuples = 0;
- vacuumed_pages++;
- }
freespace = PageGetHeapFreeSpace(page);
@@ -815,14 +858,9 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
vacrelstats->nonempty_pages = blkno + 1;
/*
- * If we remembered any tuples for deletion, then the page will be
- * visited again by lazy_vacuum_heap, which will compute and record
- * its post-compaction free space. If not, then we're done with this
- * page, so remember its free space as-is. (This path will always be
- * taken if there are no indexes.)
+ * Record the free space on the page.
*/
- if (vacrelstats->num_dead_tuples == prev_dead_count)
- RecordPageWithFreeSpace(onerel, blkno, freespace);
+ RecordPageWithFreeSpace(onerel, blkno, freespace);
}
/* save stats for use later */
@@ -847,8 +885,6 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
lazy_vacuum_index(Irel[i],
&indstats[i],
vacrelstats);
- /* Remove tuples from heap */
- lazy_vacuum_heap(onerel, vacrelstats);
vacrelstats->num_index_scans++;
}
@@ -859,11 +895,14 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
vmbuffer = InvalidBuffer;
}
+ /* Remember the current vacuum generation */
+ vacrelstats->lastvacgen = current_vacgen;
+
/* Do post-vacuum cleanup and statistics update for each index */
for (i = 0; i < nindexes; i++)
lazy_cleanup_index(Irel[i], indstats[i], vacrelstats);
- /* If no indexes, make log report that lazy_vacuum_heap would've made */
+ /* Report vacuum stats */
if (vacuumed_pages)
ereport(elevel,
(errmsg("\"%s\": removed %.0f row versions in %u pages",
@@ -885,118 +924,6 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
pg_rusage_show(&ru0))));
}
-
-/*
- * lazy_vacuum_heap() -- second pass over the heap
- *
- * This routine marks dead tuples as unused and compacts out free
- * space on their pages. Pages not having dead tuples recorded from
- * lazy_scan_heap are not visited at all.
- *
- * Note: the reason for doing this as a second pass is we cannot remove
- * the tuples until we've removed their index entries, and we want to
- * process index entry removal in batches as large as possible.
- */
-static void
-lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats)
-{
- int tupindex;
- int npages;
- PGRUsage ru0;
-
- pg_rusage_init(&ru0);
- npages = 0;
-
- tupindex = 0;
- while (tupindex < vacrelstats->num_dead_tuples)
- {
- BlockNumber tblk;
- Buffer buf;
- Page page;
- Size freespace;
-
- vacuum_delay_point();
-
- tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);
- buf = ReadBufferExtended(onerel, MAIN_FORKNUM, tblk, RBM_NORMAL,
- vac_strategy);
- LockBufferForCleanup(buf);
- tupindex = lazy_vacuum_page(onerel, tblk, buf, tupindex, vacrelstats);
-
- /* Now that we've compacted the page, record its available space */
- page = BufferGetPage(buf);
- freespace = PageGetHeapFreeSpace(page);
-
- UnlockReleaseBuffer(buf);
- RecordPageWithFreeSpace(onerel, tblk, freespace);
- npages++;
- }
-
- ereport(elevel,
- (errmsg("\"%s\": removed %d row versions in %d pages",
- RelationGetRelationName(onerel),
- tupindex, npages),
- errdetail("%s.",
- pg_rusage_show(&ru0))));
-}
-
-/*
- * lazy_vacuum_page() -- free dead tuples on a page
- * and repair its fragmentation.
- *
- * Caller must hold pin and buffer cleanup lock on the buffer.
- *
- * tupindex is the index in vacrelstats->dead_tuples of the first dead
- * tuple for this page. We assume the rest follow sequentially.
- * The return value is the first tupindex after the tuples of this page.
- */
-static int
-lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
- int tupindex, LVRelStats *vacrelstats)
-{
- Page page = BufferGetPage(buffer);
- OffsetNumber unused[MaxOffsetNumber];
- int uncnt = 0;
-
- START_CRIT_SECTION();
-
- for (; tupindex < vacrelstats->num_dead_tuples; tupindex++)
- {
- BlockNumber tblk;
- OffsetNumber toff;
- ItemId itemid;
-
- tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);
- if (tblk != blkno)
- break; /* past end of tuples for this block */
- toff = ItemPointerGetOffsetNumber(&vacrelstats->dead_tuples[tupindex]);
- itemid = PageGetItemId(page, toff);
- ItemIdSetUnused(itemid);
- unused[uncnt++] = toff;
- }
-
- PageRepairFragmentation(page);
-
- MarkBufferDirty(buffer);
-
- /* XLOG stuff */
- if (RelationNeedsWAL(onerel))
- {
- XLogRecPtr recptr;
-
- recptr = log_heap_clean(onerel, buffer,
- NULL, 0, NULL, 0,
- unused, uncnt,
- vacrelstats->latestRemovedXid);
- PageSetLSN(page, recptr);
- PageSetTLI(page, ThisTimeLineID);
- }
-
- END_CRIT_SECTION();
-
- return tupindex;
-}
-
/*
* lazy_vacuum_index() -- vacuum one index relation.
*
@@ -1223,9 +1150,13 @@ count_nondeletable_pages(Relation onerel, LVRelStats *vacrelstats)
* Note: any non-unused item should be taken as a reason to keep
* this page. We formerly thought that DEAD tuples could be
* thrown away, but that's not so, because we'd not have cleaned
- * out their index entries.
+ * out their index entries. But we can throw away the dead-vacuumed
+ * tuples created by this vacuum since those index pointers must
+ * have been removed before we come here
*/
- if (ItemIdIsUsed(itemid))
+ if (ItemIdIsUsed(itemid) &&
+ !(ItemIdIsDeadVacuumed(itemid) &&
+ ItemIdGetVacGen(itemid) == vacrelstats->lastvacgen))
{
hastup = true;
break; /* can stop scanning */
diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c
index 018f9c1..07ec438 100644
--- a/src/backend/storage/page/bufpage.c
+++ b/src/backend/storage/page/bufpage.c
@@ -396,7 +396,7 @@ PageRepairFragmentation(Page page)
lp = PageGetItemId(page, i);
if (ItemIdIsUsed(lp))
{
- if (ItemIdHasStorage(lp))
+ if (!ItemIdIsDead(lp) && ItemIdHasStorage(lp))
nstorage++;
}
else
@@ -409,7 +409,13 @@ PageRepairFragmentation(Page page)
if (nstorage == 0)
{
- /* Page is completely empty, so just reset it quickly */
+ /*
+ * Page is completely empty, so just reset it quickly
+ *
+ * Note: We don't reset the pd_lower because the page may still have
+ * DEAD line pointers with index pointers pointing to them and its not
+ * safe to remove them before the index pointers are first removed
+ */
((PageHeader) page)->pd_upper = pd_special;
}
else
@@ -421,7 +427,7 @@ PageRepairFragmentation(Page page)
for (i = 0; i < nline; i++)
{
lp = PageGetItemId(page, i + 1);
- if (ItemIdHasStorage(lp))
+ if (!ItemIdIsDead(lp) && ItemIdHasStorage(lp))
{
itemidptr->offsetindex = i;
itemidptr->itemoff = ItemIdGetOffset(lp);
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index 776ea5c..b1395ee 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -133,7 +133,8 @@ extern XLogRecPtr log_heap_clean(Relation reln, Buffer buffer,
OffsetNumber *redirected, int nredirected,
OffsetNumber *nowdead, int ndead,
OffsetNumber *nowunused, int nunused,
- TransactionId latestRemovedXid);
+ TransactionId latestRemovedXid,
+ uint32 vacgen);
extern XLogRecPtr log_heap_freeze(Relation reln, Buffer buffer,
TransactionId cutoff_xid,
OffsetNumber *offsets, int offcnt);
@@ -147,11 +148,13 @@ extern void heap_page_prune_opt(Relation relation, Buffer buffer,
TransactionId OldestXmin);
extern int heap_page_prune(Relation relation, Buffer buffer,
TransactionId OldestXmin,
- bool report_stats, TransactionId *latestRemovedXid);
+ bool report_stats, TransactionId *latestRemovedXid,
+ uint32 vacgen);
extern void heap_page_prune_execute(Buffer buffer,
OffsetNumber *redirected, int nredirected,
OffsetNumber *nowdead, int ndead,
- OffsetNumber *nowunused, int nunused);
+ OffsetNumber *nowunused, int nunused,
+ uint32 vacgen);
extern void heap_get_root_tuples(Page page, OffsetNumber *root_offsets);
/* in heap/syncscan.c */
diff --git a/src/include/access/htup.h b/src/include/access/htup.h
index c025835..4a8d842 100644
--- a/src/include/access/htup.h
+++ b/src/include/access/htup.h
@@ -691,10 +691,11 @@ typedef struct xl_heap_clean
TransactionId latestRemovedXid;
uint16 nredirected;
uint16 ndead;
+ uint32 vacgen;
/* OFFSET NUMBERS FOLLOW */
} xl_heap_clean;
-#define SizeOfHeapClean (offsetof(xl_heap_clean, ndead) + sizeof(uint16))
+#define SizeOfHeapClean (offsetof(xl_heap_clean, vacgen) + sizeof(uint32))
/*
* Cleanup_info is required in some cases during a lazy VACUUM.
diff --git a/src/include/catalog/pg_class.h b/src/include/catalog/pg_class.h
index e006180..8035cda 100644
--- a/src/include/catalog/pg_class.h
+++ b/src/include/catalog/pg_class.h
@@ -65,6 +65,8 @@ CATALOG(pg_class,1259) BKI_BOOTSTRAP BKI_ROWTYPE_OID(83) BKI_SCHEMA_MACRO
bool relhastriggers; /* has (or has had) any TRIGGERs */
bool relhassubclass; /* has (or has had) derived classes */
TransactionId relfrozenxid; /* all Xids < this are frozen in this rel */
+ int4 relnextvacgen; /* generation number of the next vacuum */
+ int4 rellastvacgen; /* generation number of last successful vacuum */
/*
* VARIABLE LENGTH FIELDS start here. These fields may be NULL, too.
@@ -78,7 +80,7 @@ CATALOG(pg_class,1259) BKI_BOOTSTRAP BKI_ROWTYPE_OID(83) BKI_SCHEMA_MACRO
/* Size of fixed part of pg_class tuples, not counting var-length fields */
#define CLASS_TUPLE_SIZE \
- (offsetof(FormData_pg_class,relfrozenxid) + sizeof(TransactionId))
+ (offsetof(FormData_pg_class, rellastvacgen) + sizeof(int4))
/* ----------------
* Form_pg_class corresponds to a pointer to a tuple with
@@ -92,7 +94,7 @@ typedef FormData_pg_class *Form_pg_class;
* ----------------
*/
-#define Natts_pg_class 26
+#define Natts_pg_class 28
#define Anum_pg_class_relname 1
#define Anum_pg_class_relnamespace 2
#define Anum_pg_class_reltype 3
@@ -117,8 +119,10 @@ typedef FormData_pg_class *Form_pg_class;
#define Anum_pg_class_relhastriggers 22
#define Anum_pg_class_relhassubclass 23
#define Anum_pg_class_relfrozenxid 24
-#define Anum_pg_class_relacl 25
-#define Anum_pg_class_reloptions 26
+#define Anum_pg_class_relnextvacgen 25
+#define Anum_pg_class_rellastvacgen 26
+#define Anum_pg_class_relacl 27
+#define Anum_pg_class_reloptions 28
/* ----------------
* initial contents of pg_class
@@ -130,13 +134,13 @@ typedef FormData_pg_class *Form_pg_class;
*/
/* Note: "3" in the relfrozenxid column stands for FirstNormalTransactionId */
-DATA(insert OID = 1247 ( pg_type PGNSP 71 0 PGUID 0 0 0 0 0 0 0 f f p r 29 0 t f f f f 3 _null_ _null_ ));
+DATA(insert OID = 1247 ( pg_type PGNSP 71 0 PGUID 0 0 0 0 0 0 0 f f p r 29 0 t f f f f 3 1 0 _null_ _null_ ));
DESCR("");
-DATA(insert OID = 1249 ( pg_attribute PGNSP 75 0 PGUID 0 0 0 0 0 0 0 f f p r 21 0 f f f f f 3 _null_ _null_ ));
+DATA(insert OID = 1249 ( pg_attribute PGNSP 75 0 PGUID 0 0 0 0 0 0 0 f f p r 21 0 f f f f f 3 1 0 _null_ _null_ ));
DESCR("");
-DATA(insert OID = 1255 ( pg_proc PGNSP 81 0 PGUID 0 0 0 0 0 0 0 f f p r 26 0 t f f f f 3 _null_ _null_ ));
+DATA(insert OID = 1255 ( pg_proc PGNSP 81 0 PGUID 0 0 0 0 0 0 0 f f p r 26 0 t f f f f 3 1 0 _null_ _null_ ));
DESCR("");
-DATA(insert OID = 1259 ( pg_class PGNSP 83 0 PGUID 0 0 0 0 0 0 0 f f p r 26 0 t f f f f 3 _null_ _null_ ));
+DATA(insert OID = 1259 ( pg_class PGNSP 83 0 PGUID 0 0 0 0 0 0 0 f f p r 28 0 t f f f f 3 1 0 _null_ _null_ ));
DESCR("");
diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h
index cfbe0c4..4c7480d 100644
--- a/src/include/commands/vacuum.h
+++ b/src/include/commands/vacuum.h
@@ -151,6 +151,8 @@ extern void vac_update_relstats(Relation relation,
double num_tuples,
bool hasindex,
TransactionId frozenxid);
+extern void vac_update_lastvacgen(Relation relation, uint32 vacgen);
+extern uint32 vac_update_nextvacgen(Relation relation);
extern void vacuum_set_xid_limits(int freeze_min_age, int freeze_table_age,
bool sharedRel,
TransactionId *oldestXmin,
diff --git a/src/include/storage/itemid.h b/src/include/storage/itemid.h
index 961d2c2..c0fbd69 100644
--- a/src/include/storage/itemid.h
+++ b/src/include/storage/itemid.h
@@ -19,7 +19,11 @@
*
* In some cases an item pointer is "in use" but does not have any associated
* storage on the page. By convention, lp_len == 0 in every item pointer
- * that does not have storage, independently of its lp_flags state.
+ * that does not have storage, independently of its lp_flags state. But
+ * lp_len != 0 does not imply that the line pointer has storage, not at least
+ * for heap tuples where we use lp_len (and lp_off) to store the vacuum
+ * generation number for dead-vacuumed tuples. In such cases, lp_flags must be
+ * set to LP_DEAD though.
*/
typedef struct ItemIdData
{
@@ -33,11 +37,16 @@ typedef ItemIdData *ItemId;
/*
* lp_flags has these possible states. An UNUSED line pointer is available
* for immediate re-use, the other states are not.
+ *
+ * A DEAD line pointer in heap does not have any storage associated with it.
+ * But a similar pointer in an index page may still have storage associated
+ * with it since we don't defrag index pages online.
*/
#define LP_UNUSED 0 /* unused (should always have lp_len=0) */
#define LP_NORMAL 1 /* used (should always have lp_len>0) */
#define LP_REDIRECT 2 /* HOT redirect (should have lp_len=0) */
-#define LP_DEAD 3 /* dead, may or may not have storage */
+#define LP_DEAD 3 /* dead or dead-vacuumed. Heap tuples don't have
+ storage, but index tuples may have */
/*
* Item offsets and lengths are represented by these types when
@@ -107,14 +116,26 @@ typedef uint16 ItemLength;
/*
* ItemIdIsDead
- * True iff item identifier is in state DEAD.
+ * True iff item identifier is in state DEAD or DEAD VACUUMED
*/
#define ItemIdIsDead(itemId) \
((itemId)->lp_flags == LP_DEAD)
/*
+ * ItemIdIsDeadVacuumed
+ * True iff item identifier is in state DEAD VACUUMED.
+ */
+#define ItemIdIsDeadVacuumed(itemId) \
+ (((itemId)->lp_flags == LP_DEAD) && \
+ (((itemId)->lp_off != 0) || \
+ ((itemId)->lp_len != 0)))
+
+/*
* ItemIdHasStorage
- * True iff item identifier has associated storage.
+ * True iff item identifier has associated storage. For DEAD line
+ * pointers, this applies only for index tuple since DEAD heap tuple
+ * never has storage associated with it. In fact, the lp_off/lp_len for
+ * DEAD heap line pointers are used to store the vacuum generation number
*/
#define ItemIdHasStorage(itemId) \
((itemId)->lp_len != 0)
@@ -168,6 +189,37 @@ typedef uint16 ItemLength;
)
/*
+ * ItemIdSetDeadVacuumed
+ * Set the item identifier to be DEAD VACUUMED, with no storage.
+ * Beware of multiple evaluations of itemId!
+ *
+ * Note: we save the generation number of the vacuum creating this dead-vacuumed
+ * line pointer. We reuse the lp_off/lp_len for this purpose since the
+ * dead-vacuumed line pointers only exist in the heap and lp_off/lp_len is not
+ * used for dead line pointers in the heap.
+ *
+ * Store the 30 LSB of the vacuum generation number in lp_off/lp_len
+ */
+#define ItemIdSetDeadVacuumed(itemId, vacgen) \
+( \
+ (itemId)->lp_flags = LP_DEAD, \
+ (itemId)->lp_off = (((vacgen) >> 15) & 0x7fff), \
+ (itemId)->lp_len = ((vacgen) & 0x7fff) \
+)
+
+/*
+ * Get the generation number of the vacuum that created this dead-vacuumed line
+ * pointer.
+ *
+ * Note: must be called only for the dead-vacuumed line pointers
+ */
+#define ItemIdGetVacGen(itemId) \
+( \
+ AssertMacro(ItemIdIsDeadVacuumed(itemId)), \
+ (((int32)((itemId)->lp_off) << 15) | (itemId)->lp_len) \
+)
+
+/*
* ItemIdMarkDead
* Set the item identifier to be DEAD, keeping its existing storage.
*
diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h
index 173dc16..d602b24 100644
--- a/src/include/utils/rel.h
+++ b/src/include/utils/rel.h
@@ -359,6 +359,13 @@ typedef struct StdRdOptions
((relation)->rd_rel->relpersistence == RELPERSISTENCE_TEMP)
/*
+ * RelationGetLastVacGen
+ * Get the generation number of the last successful vacuum on the relation
+ */
+#define RelationGetLastVacGen(relation) \
+ ((relation)->rd_rel->rellastvacgen)
+
+/*
* RELATION_IS_LOCAL
* If a rel is either temp or newly created in the current transaction,
* it can be assumed to be visible only to the current backend.
--
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers