On Mon, 12 Mar 2018 10:43:36 -0400
Tom Lane <[email protected]> wrote:
> Re-reading that thread, it seems like we should have applied Jeff's
> initial trivial patch[1] (to not hold across
> table_recheck_autovac) rather than waiting around for a super duper
> improvement to get agreed on. I'm a bit tempted to go do that;
> if nothing else, it seems simple enough to back-patch, unlike most
> of the rest of what was discussed.
This will help. In my testing it reduced the lock contention considerably. I
think a lot of users with lots of tables will benefit from it. However, it does
nothing about the bigger issue, which is that autovacuum thrashes the stats temp
files.
I have attached the patch we are currently using. It applies to 9.6.8; I also
have versions for the older 9.4 and 9.5 branches. It fails to apply to 10,
and presumably to head, but I can update it if there is any interest.
The patch has three main features:
- Impose an ordering on the autovacuum workers' worklist to avoid the need
  to recheck statistics just to skip already vacuumed tables (see the sketch
  after this list).
- Reduce the frequency of statistics refreshes
- Remove the AutovacuumScheduleLock
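
To make the ordering idea concrete, here is a small standalone sketch of the
claim protocol. It is not taken from the patch; the helper names and sample
oids are invented, and the real code walks av_runningWorkers under
AutovacuumLock rather than a plain array of claims.

#include <stdio.h>
#include <stdlib.h>

typedef unsigned int Oid;

static int
oid_cmp(const void *a, const void *b)
{
	Oid		x = *(const Oid *) a;
	Oid		y = *(const Oid *) b;

	return (x < y) ? -1 : (x > y) ? 1 : 0;
}

/*
 * Every worker sorts the same candidate list, finds the highest oid any
 * worker has already claimed, and takes the next oid above it.  Returns 0
 * (InvalidOid) when nothing is left to do.
 */
static Oid
claim_next(const Oid *worklist, int nitems, const Oid *claims, int nclaims)
{
	Oid		highest = 0;
	int		i;

	for (i = 0; i < nclaims; i++)
		if (claims[i] > highest)
			highest = claims[i];

	for (i = 0; i < nitems; i++)
		if (worklist[i] > highest)
			return worklist[i];

	return 0;
}

int
main(void)
{
	Oid		worklist[] = {16390, 16385, 16402, 16388};
	Oid		claims[] = {16385, 16388};	/* already taken by other workers */
	int		nitems = sizeof(worklist) / sizeof(worklist[0]);

	qsort(worklist, nitems, sizeof(Oid), oid_cmp);

	/* Prints "next table to claim: 16390" */
	printf("next table to claim: %u\n",
		   claim_next(worklist, nitems, claims, 2));
	return 0;
}

Because every worker derives the same order, "next oid above the highest
existing claim" is enough to keep workers off each other's tables without
rechecking statistics for every candidate.
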
The patch is aware of the shared relations fix. It is subject to the problem
Alvaro noted in the original discussion: if the last table to be
autovacuumed is large, new workers will exit instead of choosing an
earlier table. Not sure this is really a big problem in practice, but I agree
it is a corner case.
My patch does not do what I believe really needs doing:
Schedule autovacuum more intelligently.
Currently autovacuum collects all the tables in the physical order of pg_class
and visits them one by one. With many tables it can take too long to respond to
urgent vacuum needs, e.g. heavily updated tables or impending wraparound.
There is a patch in the current commit fest that allows prioritizing tables
manually. I don't like that idea much, but it does recognize that the current
scheme is not adequate for databases with large numbers of tables.
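
To illustrate what I mean, and purely as a hypothetical sketch (none of this
is in my patch or, as far as I know, in the commit fest patch; the fields and
weights are invented stand-ins for what autovacuum already derives from
pgstats and relfrozenxid), the worklist could be ordered by an urgency score
instead of by oid:

#include <stdio.h>
#include <stdlib.h>

typedef struct vac_candidate
{
	unsigned int relid;
	double		dead_frac;		/* n_dead_tup / reltuples */
	double		xid_age_frac;	/* age(relfrozenxid) / autovacuum_freeze_max_age */
} vac_candidate;

/* Higher score means more urgent; the weights are arbitrary. */
static double
vac_score(const vac_candidate *c)
{
	double		score = c->dead_frac;

	/* Let wraparound risk dominate as a table nears the freeze limit. */
	if (c->xid_age_frac > 0.5)
		score += 10.0 * c->xid_age_frac;
	return score;
}

/* qsort comparator putting the most urgent tables first. */
static int
vac_candidate_cmp(const void *a, const void *b)
{
	double		sa = vac_score((const vac_candidate *) a);
	double		sb = vac_score((const vac_candidate *) b);

	return (sa > sb) ? -1 : (sa < sb) ? 1 : 0;
}

int
main(void)
{
	vac_candidate cands[] = {
		{16401, 0.02, 0.95},	/* nearly at wraparound: most urgent */
		{16402, 0.40, 0.10},
		{16403, 0.05, 0.20}
	};

	qsort(cands, 3, sizeof(vac_candidate), vac_candidate_cmp);
	printf("most urgent: %u\n", cands[0].relid);	/* prints 16401 */
	return 0;
}

Note that any such ordering would have to be the same for every worker, and
the "claim the next item after the highest claim" trick in my patch depends on
oid order, so a priority order would need a different claim mechanism.
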
-dg
--
David Gould [email protected]
If simplicity worked, the world would be overrun with insects.
diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c
index 6720248675..3bb06a2337 100644
--- a/src/backend/postmaster/autovacuum.c
+++ b/src/backend/postmaster/autovacuum.c
@@ -196,6 +196,26 @@ typedef struct autovac_table
char *at_datname;
} autovac_table;
+/*
+ * This is used to collect the oids of tables that may need vacuuming.
+ * Once they are all collected, the list is sorted by oid and then stepped through.
+ * By establishing an order we reduce contention between workers for tables.
+ */
+typedef struct aw_workitem
+{
+ Oid relid;
+ bool sharedrel;
+} aw_workitem;
+
+/* Extendable array of items. */
+typedef struct aw_worklist
+{
+ aw_workitem *items;
+ int maxitems,
+ numitems;
+} aw_worklist;
+
+
/*-------------
* This struct holds information about a single worker's whereabouts. We keep
* an array of these in shared memory, sized according to
@@ -1069,6 +1089,76 @@ db_comparator(const void *a, const void *b)
return (((const avl_dbase *) a)->adl_score < ((const avl_dbase *) b)->adl_score) ? 1 : -1;
}
+
+/*
+ * Utilities to manage worklist of tables to vacuum.
+ */
+
+static void aw_worklist_init(aw_worklist *w)
+{
+ w->maxitems = 32; /* not really a magic number, we have to start somewhere */
+ w->numitems = 0;
+ w->items = palloc(w->maxitems * sizeof(aw_workitem));
+}
+
+static void aw_worklist_clear(aw_worklist *w)
+{
+ w->maxitems = 0;
+ w->numitems = 0;
+ pfree(w->items);
+ w->items = NULL;
+}
+
+/* Append an item to the worklist */
+static aw_workitem *aw_worklist_add(aw_worklist *w, Oid oid, bool isshared)
+{
+ aw_workitem *newitem;
+
+ if (w->numitems >= w->maxitems)
+ {
+ /* grow the array */
+ w->maxitems = w->maxitems * 1.5;
+ w->items = (aw_workitem *) repalloc(w->items,
+ w->maxitems * sizeof(aw_workitem));
+ }
+ newitem = w->items + w->numitems++;
+ newitem->relid = oid;
+ newitem->sharedrel = isshared;
+ return newitem;
+}
+
+/* Release extra memory that might have been allocated during growth */
+static void aw_worklist_trim(aw_worklist *w)
+{
+ if (w->maxitems > w->numitems)
+ {
+ w->maxitems = w->numitems;
+ w->items = (aw_workitem *) repalloc(w->items,
+ w->numitems * sizeof(aw_workitem));
+ }
+}
+
+/* qsort comparator for aw_workitem items */
+static int
+aw_workitem_cmp(const void *p1, const void *p2)
+{
+ const aw_workitem *v1 = ((const aw_workitem *) p1);
+ const aw_workitem *v2 = ((const aw_workitem *) p2);
+
+ if (v1->relid < v2->relid)
+ return -1;
+ if (v1->relid > v2->relid)
+ return 1;
+ return 0;
+}
+
+static void aw_worklist_sort(aw_worklist *w)
+{
+ if (w->numitems > 1)
+ qsort(w->items, w->numitems, sizeof(aw_workitem), aw_workitem_cmp);
+}
+
+
/*
* do_start_worker
*
@@ -1899,10 +1989,10 @@ do_autovacuum(void)
HeapTuple tuple;
HeapScanDesc relScan;
Form_pg_database dbForm;
- List *table_oids = NIL;
HASHCTL ctl;
HTAB *table_toast_map;
- ListCell *volatile cell;
+ aw_worklist table_oids;
+ int oid_idx;
PgStat_StatDBEntry *shared;
PgStat_StatDBEntry *dbentry;
BufferAccessStrategy bstrategy;
@@ -1912,6 +2002,7 @@ do_autovacuum(void)
bool did_vacuum = false;
bool found_concurrent_worker = false;
+
/*
* StartTransactionCommand and CommitTransactionCommand will automatically
* switch to other contexts. We need this one to keep the list of
@@ -1993,6 +2084,9 @@ do_autovacuum(void)
&ctl,
HASH_ELEM | HASH_BLOBS);
+ /* create array/list to track oids of all the tables that might need action */
+ aw_worklist_init(&table_oids);
+
/*
* Scan pg_class to determine which tables to vacuum.
*
@@ -2084,9 +2178,9 @@ do_autovacuum(void)
}
else
{
- /* relations that need work are added to table_oids */
+ /* relations that may need work are added to table_oids */
if (dovacuum || doanalyze)
- table_oids = lappend_oid(table_oids, relid);
+ (void) aw_worklist_add(&table_oids, relid, classForm->relisshared);
/*
* Remember the association for the second pass. Note: we must do
@@ -2170,7 +2264,8 @@ do_autovacuum(void)
/* ignore analyze for toast tables */
if (dovacuum)
- table_oids = lappend_oid(table_oids, relid);
+ aw_worklist_add(&table_oids, relid, classForm->relisshared);
+
}
heap_endscan(relScan);
@@ -2193,12 +2288,30 @@ do_autovacuum(void)
/*
* Perform operations on collected tables.
+ *
+ * 1. Sort the collected oids so that workers process oids in
+ * the same order.
+ * 2. While there are items not yet visited do:
+ * 3. Find the last (highest) item worked on by any worker.
+ * 4. Claim the next item after the last one worked on.
+ *
+ * The ordering prevents workers from racing or competing with other
+ * workers for an item to work on. Without it there is no way to
+ * prevent them from trying to vacuum the same table repeatedly.
*/
- foreach(cell, table_oids)
+
+ /* 1. Sort the oids to establish a consistent order for work items. */
+ aw_worklist_trim(&table_oids); /* release any extra allocated memory */
+ aw_worklist_sort(&table_oids);
+
+ /*
+ * 2. While there are items that have not been visited...
+ */
+ for (oid_idx = 0; oid_idx < table_oids.numitems; )
{
- Oid relid = lfirst_oid(cell);
autovac_table *tab;
- bool skipit;
+ Oid relid;
+ Oid highest_oid_claimed;
int stdVacuumCostDelay;
int stdVacuumCostLimit;
dlist_iter iter;
@@ -2222,71 +2335,61 @@ do_autovacuum(void)
}
/*
- * hold schedule lock from here until we're sure that this table still
- * needs vacuuming. We also need the AutovacuumLock to walk the
- * worker array, but we'll let go of that one quickly.
+ * We hold AutovacuumLock to walk the worker array and to
+ * update our WorkerInfo with the chosen candidate table.
*/
- LWLockAcquire(AutovacuumScheduleLock, LW_EXCLUSIVE);
- LWLockAcquire(AutovacuumLock, LW_SHARED);
+ LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
/*
- * Check whether the table is being vacuumed concurrently by another
- * worker.
+ * 3. Find the last (highest) oid claimed by any worker.
*/
- skipit = false;
+ highest_oid_claimed = InvalidOid;
dlist_foreach(iter, &AutoVacuumShmem->av_runningWorkers)
{
WorkerInfo worker = dlist_container(WorkerInfoData, wi_links, iter.cur);
- /* ignore myself */
- if (worker == MyWorkerInfo)
- continue;
+ if ((worker->wi_sharedrel || worker->wi_dboid == MyDatabaseId)
+ && worker->wi_tableoid > highest_oid_claimed)
+ {
+ highest_oid_claimed = worker->wi_tableoid;
+ found_concurrent_worker = true;
+ }
+ }
- /* ignore workers in other databases (unless table is shared) */
- if (!worker->wi_sharedrel && worker->wi_dboid != MyDatabaseId)
- continue;
+ /*
+ * 4a. Skip past the highest_oid_claimed to find the next oid.
+ */
+ for (relid = InvalidOid; oid_idx < table_oids.numitems; oid_idx++)
+ {
+ aw_workitem *item = &table_oids.items[oid_idx];
- if (worker->wi_tableoid == relid)
+ /* 4b. Claim the table by storing its oid in shared memory. */
+ if (item->relid > highest_oid_claimed)
{
- skipit = true;
- found_concurrent_worker = true;
+ relid = item->relid;
+ MyWorkerInfo->wi_tableoid = relid;
+ MyWorkerInfo->wi_sharedrel = item->sharedrel;
break;
}
}
+
LWLockRelease(AutovacuumLock);
- if (skipit)
- {
- LWLockRelease(AutovacuumScheduleLock);
+
+ /* If we reached the end of the work list there is nothing to do. */
+ if (relid == InvalidOid)
continue;
- }
/*
- * Check whether pgstat data still says we need to vacuum this table.
- * It could have changed if something else processed the table while
- * we weren't looking.
- *
- * Note: we have a special case in pgstat code to ensure that the
- * stats we read are as up-to-date as possible, to avoid the problem
- * that somebody just finished vacuuming this table. The window to
- * the race condition is not closed but it is very small.
+ * Re-check whether we still need to vacuum this table. We are working
+ * from a list of tables that could be quite stale by now. Possibly
+ * this table was dropped or manually vacuumed while we weren't looking.
*/
MemoryContextSwitchTo(AutovacMemCxt);
tab = table_recheck_autovac(relid, table_toast_map, pg_class_desc,
effective_multixact_freeze_max_age);
if (tab == NULL)
- {
/* someone else vacuumed the table, or it went away */
- LWLockRelease(AutovacuumScheduleLock);
continue;
- }
-
- /*
- * Ok, good to go. Store the table in shared memory before releasing
- * the lock so that other workers don't vacuum it concurrently.
- */
- MyWorkerInfo->wi_tableoid = relid;
- MyWorkerInfo->wi_sharedrel = tab->at_sharedrel;
- LWLockRelease(AutovacuumScheduleLock);
/*
* Remember the prevailing values of the vacuum cost GUCs. We have to
@@ -2407,6 +2510,8 @@ deleted:
VacuumCostLimit = stdVacuumCostLimit;
}
+ aw_worklist_clear(&table_oids); /* free work list memory */
+
/*
* We leak table_toast_map here (among other things), but since we're
* going away soon, it's not a problem.
@@ -2514,9 +2619,7 @@ table_recheck_autovac(Oid relid, HTAB *table_toast_map,
bool wraparound;
AutoVacOpts *avopts;
- /* use fresh stats */
- autovac_refresh_stats();
-
+ /* Fetch pgstats for re-checking. */
shared = pgstat_fetch_stat_dbentry(InvalidOid);
dbentry = pgstat_fetch_stat_dbentry(MyDatabaseId);
@@ -2995,28 +3098,19 @@ AutoVacuumShmemInit(void)
*
* Cause the next pgstats read operation to obtain fresh data, but throttle
* such refreshing in the autovacuum launcher. This is mostly to avoid
- * rereading the pgstats files too many times in quick succession when there
- * are many databases.
- *
- * Note: we avoid throttling in the autovac worker, as it would be
- * counterproductive in the recheck logic.
+ * rereading the pgstats files too many times in quick succession.
*/
static void
autovac_refresh_stats(void)
{
- if (IsAutoVacuumLauncherProcess())
- {
- static TimestampTz last_read = 0;
- TimestampTz current_time;
-
- current_time = GetCurrentTimestamp();
-
- if (!TimestampDifferenceExceeds(last_read, current_time,
- STATS_READ_DELAY))
- return;
+ static TimestampTz last_read = 0;
+ TimestampTz current_time;
- last_read = current_time;
- }
+ current_time = GetCurrentTimestamp();
+ if (!TimestampDifferenceExceeds(last_read, current_time,
+ STATS_READ_DELAY))
+ return;
+ last_read = current_time;
pgstat_clear_snapshot();
}
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 480d3b1876..54c4ca97a3 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -76,6 +76,9 @@
#define PGSTAT_STAT_INTERVAL 500 /* Minimum time between stats file
* updates; in milliseconds. */
+#define PGSTAT_AVAC_INTERVAL 5000 /* Minimum time between stats file
+ * updates for autovacuum workers */
+
#define PGSTAT_RETRY_DELAY 10 /* How long to wait between checks for
* a new file; in milliseconds. */
@@ -4759,10 +4762,9 @@ backend_read_statsfile(void)
/*
* We set the minimum acceptable timestamp to PGSTAT_STAT_INTERVAL
* msec before now. This indirectly ensures that the collector
- * needn't write the file more often than PGSTAT_STAT_INTERVAL. In
- * an autovacuum worker, however, we want a lower delay to avoid
- * using stale data, so we use PGSTAT_RETRY_DELAY (since the
- * number of workers is low, this shouldn't be a problem).
+ * needn't write the file more often than PGSTAT_STAT_INTERVAL.
+ * Autovacuum workers use a longer interval, PGSTAT_AVAC_INTERVAL, to
+ * avoid requesting excessively frequent rewrites of the stats.
*
* We don't recompute min_ts after sleeping, except in the
* unlikely case that cur_ts went backwards. So we might end up
@@ -4773,12 +4775,10 @@ backend_read_statsfile(void)
* actually accept.
*/
ref_ts = cur_ts;
- if (IsAutoVacuumWorkerProcess())
- min_ts = TimestampTzPlusMilliseconds(ref_ts,
- -PGSTAT_RETRY_DELAY);
- else
- min_ts = TimestampTzPlusMilliseconds(ref_ts,
- -PGSTAT_STAT_INTERVAL);
+ min_ts = TimestampTzPlusMilliseconds(ref_ts,
+ IsAutoVacuumWorkerProcess()
+ ? -PGSTAT_AVAC_INTERVAL
+ : -PGSTAT_STAT_INTERVAL);
}
/*
diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt
index f8996cd21a..63fb170f19 100644
--- a/src/backend/storage/lmgr/lwlocknames.txt
+++ b/src/backend/storage/lmgr/lwlocknames.txt
@@ -27,7 +27,7 @@ TablespaceCreateLock 19
BtreeVacuumLock 20
AddinShmemInitLock 21
AutovacuumLock 22
-AutovacuumScheduleLock 23
+# 23 is available; was formerly AutovacuumScheduleLock
SyncScanLock 24
RelationMappingLock 25
AsyncCtlLock 26