Hi, Have you benchmarked the toastrelidx removal stuff in any way? If not, that's fine, but if yes I'd be interested.
On 2013-03-04 22:33:53 +0900, Michael Paquier wrote: > --- a/src/backend/access/heap/tuptoaster.c > +++ b/src/backend/access/heap/tuptoaster.c > @@ -1238,7 +1238,7 @@ toast_save_datum(Relation rel, Datum value, > struct varlena * oldexternal, int options) > { > Relation toastrel; > - Relation toastidx; > + Relation *toastidxs; > HeapTuple toasttup; > TupleDesc toasttupDesc; > Datum t_values[3]; > @@ -1257,15 +1257,26 @@ toast_save_datum(Relation rel, Datum value, > char *data_p; > int32 data_todo; > Pointer dval = DatumGetPointer(value); > + ListCell *lc; > + int count = 0; I find count a confusing name for a loop iteration variable... i of orr, idxno, or ... > + int num_indexes; > > /* > * Open the toast relation and its index. We can use the index to check > * uniqueness of the OID we assign to the toasted item, even though it > has > - * additional columns besides OID. > + * additional columns besides OID. A toast table can have multiple > identical > + * indexes associated to it. > */ > toastrel = heap_open(rel->rd_rel->reltoastrelid, RowExclusiveLock); > toasttupDesc = toastrel->rd_att; > - toastidx = index_open(toastrel->rd_rel->reltoastidxid, > RowExclusiveLock); > + if (toastrel->rd_indexvalid == 0) > + RelationGetIndexList(toastrel); Hm, I think we should move this into a macro, this is cropping up at more and more places. > - index_insert(toastidx, t_values, t_isnull, > - &(toasttup->t_self), > - toastrel, > - toastidx->rd_index->indisunique ? > - UNIQUE_CHECK_YES : UNIQUE_CHECK_NO); > + for (count = 0; count < num_indexes; count++) > + index_insert(toastidxs[count], t_values, t_isnull, > + &(toasttup->t_self), > + toastrel, > + > toastidxs[count]->rd_index->indisunique ? > + UNIQUE_CHECK_YES : > UNIQUE_CHECK_NO); The indisunique check looks like a copy & pasto to me, albeit not yours... 
> > /* > * Create the TOAST pointer value that we'll return > @@ -1475,10 +1493,13 @@ toast_delete_datum(Relation rel, Datum value) > struct varlena *attr = (struct varlena *) DatumGetPointer(value); > struct varatt_external toast_pointer; > + /* > + * We actually use only the first index but taking a lock on all is > + * necessary. > + */ Hm, is it guaranteed that the first index is valid? > + foreach(lc, toastrel->rd_indexlist) > + toastidxs[count++] = index_open(lfirst_oid(lc), > RowExclusiveLock); > /* > - * If we're swapping two toast tables by content, do the same for their > - * indexes. > + * If we're swapping two toast tables by content, do the same for all of > + * their indexes. The swap can actually be safely done only if all the > indexes > + * have valid Oids. > */ What's an index without a valid oid? > if (swap_toast_by_content && > - relform1->reltoastidxid && relform2->reltoastidxid) > - swap_relation_files(relform1->reltoastidxid, > - relform2->reltoastidxid, > - target_is_pg_class, > - swap_toast_by_content, > - InvalidTransactionId, > - InvalidMultiXactId, > - mapped_tables); > + relform1->reltoastrelid && > + relform2->reltoastrelid) > + { > + Relation toastRel1, toastRel2; > + > + /* Open relations */ > + toastRel1 = heap_open(relform1->reltoastrelid, > RowExclusiveLock); > + toastRel2 = heap_open(relform2->reltoastrelid, > RowExclusiveLock); Shouldn't those be Access Exclusive Locks? > + /* Obtain index list if necessary */ > + if (toastRel1->rd_indexvalid == 0) > + RelationGetIndexList(toastRel1); > + if (toastRel2->rd_indexvalid == 0) > + RelationGetIndexList(toastRel2); > + > + /* Check if the swap is possible for all the toast indexes */ So there's no error being thrown if this turns out not to be possible? 
> + if (!list_member_oid(toastRel1->rd_indexlist, InvalidOid) && > + !list_member_oid(toastRel2->rd_indexlist, InvalidOid) && > + list_length(toastRel1->rd_indexlist) == > list_length(toastRel2->rd_indexlist)) > + { > + ListCell *lc1, *lc2; > + > + /* Now swap each couple */ > + lc2 = list_head(toastRel2->rd_indexlist); > + foreach(lc1, toastRel1->rd_indexlist) > + { > + Oid indexOid1 = lfirst_oid(lc1); > + Oid indexOid2 = lfirst_oid(lc2); > + swap_relation_files(indexOid1, > + > indexOid2, > + > target_is_pg_class, > + > swap_toast_by_content, > + > InvalidTransactionId, > + > InvalidMultiXactId, > + > mapped_tables); > + lc2 = lnext(lc2); > + } > + } > + > + heap_close(toastRel1, RowExclusiveLock); > + heap_close(toastRel2, RowExclusiveLock); > + } > /* rename the toast table ... */ > @@ -1528,11 +1563,23 @@ finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap, > RenameRelationInternal(newrel->rd_rel->reltoastrelid, > > NewToastName); > > - /* ... and its index too */ > - snprintf(NewToastName, NAMEDATALEN, "pg_toast_%u_index", > - OIDOldHeap); > - RenameRelationInternal(toastidx, > - > NewToastName); > + /* ... and its indexes too */ > + foreach(lc, toastrel->rd_indexlist) > + { > + /* > + * The first index keeps the former toast name > and the > + * following entries are thought as being > concurrent indexes. > + */ > + if (count == 0) > + snprintf(NewToastName, NAMEDATALEN, > "pg_toast_%u_index", > + OIDOldHeap); > + else > + snprintf(NewToastName, NAMEDATALEN, > "pg_toast_%u_index_cct%d", > + OIDOldHeap, count); > + RenameRelationInternal(lfirst_oid(lc), > + > NewToastName); > + count++; > + } Hm. It seems wrong that this layer needs to know about _cct. > /* > - * Calculate total on-disk size of a TOAST relation, including its index. > + * Calculate total on-disk size of a TOAST relation, including its indexes. > * Must not be applied to non-TOAST relations. > */ > static int64 > @@ -340,8 +340,8 @@ calculate_toast_table_size(Oid toastrelid) > { > ... 
> + /* Size is evaluated based on the first index available */ Uh. Why? Imo all indexes should be counted. > + foreach(lc, toastRel->rd_indexlist) > + { > + Relation toastIdxRel; > + toastIdxRel = relation_open(lfirst_oid(lc), > + > AccessShareLock); > + for (forkNum = 0; forkNum <= MAX_FORKNUM; forkNum++) > + size += calculate_relation_size(&(toastIdxRel->rd_node), > + > toastIdxRel->rd_backend, forkNum); > + > + relation_close(toastIdxRel, AccessShareLock); > + } > -#define CATALOG_VERSION_NO 201302181 > +#define CATALOG_VERSION_NO 20130219 Think you forgot a digit here ;) > /* > * This case is currently not supported, but there's no way to ask for > it > - * in the grammar anyway, so it can't happen. > + * in the grammar anyway, so it can't happen. This might be called > during a > + * conccurrent reindex operation, in this case sufficient locks are > already > + * taken on the related relations. > */ I'd rather change that to something like /* * This case is currently only supported during a concurrent index * rebuild, but there is no way to ask for it in the grammar otherwise * anyway. */ Or similar. > + > +/* > + * index_concurrent_create > + * > + * Create an index based on the given one that will be used for concurrent > + * operations. The index is inserted into catalogs and needs to be built > later > + * on. This is called during concurrent index processing. The heap relation > + * on which is based the index needs to be closed by the caller. > + */ > +Oid > +index_concurrent_create(Relation heapRelation, Oid indOid, char > *concurrentName) > +{ > ... > + /* > + * Determine if index is initdeferred, this depends on its dependent > + * constraint. 
> + */ > + if (OidIsValid(constraintOid)) > + { > + /* Look for the correct value */ > + HeapTuple constTuple; > + Form_pg_constraint constraint; > + > + constTuple = SearchSysCache1(CONSTROID, > + > ObjectIdGetDatum(constraintOid)); > + if (!HeapTupleIsValid(constTuple)) > + elog(ERROR, "cache lookup failed for constraint %u", > + constraintOid); > + constraint = (Form_pg_constraint) GETSTRUCT(constTuple); > + initdeferred = constraint->condeferred; > + > + ReleaseSysCache(constTuple); > + } Very, very nitpicky, but I find "constTuple" to be confusing, I thought at first it meant that the tuple shouldn't be modified or something. > + /* > + * Index is considered as a constraint if it is PRIMARY KEY or > EXCLUSION. > + */ > + isconstraint = indexRelation->rd_index->indisprimary || > + indexRelation->rd_index->indisexclusion; unique constraints aren't mattering here? > +/* > + * index_concurrent_swap > + * > + * Replace old index by old index in a concurrent context. For the time being > + * what is done here is switching the relation relfilenode of the indexes. If > + * extra operations are necessary during a concurrent swap, processing should > + * be added here. AccessExclusiveLock is taken on the index relations that > are > + * swapped until the end of the transaction where this function is called. > + */ > +void > +index_concurrent_swap(Oid newIndexOid, Oid oldIndexOid) > +{ > + Relation oldIndexRel, newIndexRel, pg_class; > + HeapTuple oldIndexTuple, newIndexTuple; > + Form_pg_class oldIndexForm, newIndexForm; > + Oid tmpnode; > + > + /* > + * Take an exclusive lock on the old and new index before swapping them. > + */ > + oldIndexRel = relation_open(oldIndexOid, AccessExclusiveLock); > + newIndexRel = relation_open(newIndexOid, AccessExclusiveLock); > + > + /* Now swap relfilenode of those indexes */ Any chance to reuse swap_relation_files here? Not sure whether it would be beneficial given that it is more generic and normally works on a relation level... 
We probably should remove the fsm of the index altogether after this? > + pg_class = heap_open(RelationRelationId, RowExclusiveLock); > + > + oldIndexTuple = SearchSysCacheCopy1(RELOID, > + > ObjectIdGetDatum(oldIndexOid)); > + if (!HeapTupleIsValid(oldIndexTuple)) > + elog(ERROR, "could not find tuple for relation %u", > oldIndexOid); > + newIndexTuple = SearchSysCacheCopy1(RELOID, > + > ObjectIdGetDatum(newIndexOid)); > + if (!HeapTupleIsValid(newIndexTuple)) > + elog(ERROR, "could not find tuple for relation %u", > newIndexOid); > + oldIndexForm = (Form_pg_class) GETSTRUCT(oldIndexTuple); > + newIndexForm = (Form_pg_class) GETSTRUCT(newIndexTuple); > + > + /* Here is where the actual swapping happens */ > + tmpnode = oldIndexForm->relfilenode; > + oldIndexForm->relfilenode = newIndexForm->relfilenode; > + newIndexForm->relfilenode = tmpnode; > + > + /* Then update the tuples for each relation */ > + simple_heap_update(pg_class, &oldIndexTuple->t_self, oldIndexTuple); > + simple_heap_update(pg_class, &newIndexTuple->t_self, newIndexTuple); > + CatalogUpdateIndexes(pg_class, oldIndexTuple); > + CatalogUpdateIndexes(pg_class, newIndexTuple); > + > + /* Close relations and clean up */ > + heap_close(pg_class, RowExclusiveLock); > + > + /* The lock taken previously is not released until the end of > transaction */ > + relation_close(oldIndexRel, NoLock); > + relation_close(newIndexRel, NoLock); It might be worthwhile adding a heap_freetuple here for (old, new)IndexTuple, just to spare the reader from thinking about whether it needs to be done. > +/* > + * index_concurrent_drop > + * > + * Drop a single index concurrently as the last step of an index concurrent > + * process Deletion is done through performDeletion or dependencies of the > + * index are not dropped. At this point all the indexes are already > considered > + * as invalid and dead so they can be dropped without using any concurrent > + * options. > + */ "or dependencies of the index would not get dropped"? 
> +void > +index_concurrent_drop(Oid indexOid) > +{ > + Oid constraintOid = > get_index_constraint(indexOid); > + ObjectAddress object; > + Form_pg_index indexForm; > + Relation pg_index; > + HeapTuple indexTuple; > + bool indislive; > + > + /* > + * Check that the index dropped here is not alive, it might be used by > + * other backends in this case. > + */ > + pg_index = heap_open(IndexRelationId, RowExclusiveLock); > + > + indexTuple = SearchSysCacheCopy1(INDEXRELID, > + > ObjectIdGetDatum(indexOid)); > + if (!HeapTupleIsValid(indexTuple)) > + elog(ERROR, "cache lookup failed for index %u", indexOid); > + indexForm = (Form_pg_index) GETSTRUCT(indexTuple); > + indislive = indexForm->indislive; > + > + /* Clean up */ > + heap_close(pg_index, RowExclusiveLock); > + > + /* Leave if index is still alive */ > + if (indislive) > + return; This seems like a confusing path? Why is it valid to get here with a valid index and why is it ok to silently ignore that case? > /* > + * ReindexRelationConcurrently > + * > + * Process REINDEX CONCURRENTLY for given relation Oid. The relation can be > + * either an index or a table. If a table is specified, each reindexing step > + * is done in parallel with all the table's indexes as well as its dependent > + * toast indexes. > + */ > +bool > +ReindexRelationConcurrently(Oid relationOid) > +{ > + List *concurrentIndexIds = NIL, > + *indexIds = NIL, > + *parentRelationIds = NIL, > + *lockTags = NIL, > + *relationLocks = NIL; > + ListCell *lc, *lc2; > + Snapshot snapshot; > + > + /* > + * Extract the list of indexes that are going to be rebuilt based on the > + * list of relation Oids given by caller. For each element in given > list, > + * If the relkind of given relation Oid is a table, all its valid > indexes > + * will be rebuilt, including its associated toast table indexes. If > + * relkind is an index, this index itself will be rebuilt. 
The locks > taken > + * parent relations and involved indexes are kept until this transaction > + * is committed to protect against schema changes that might occur until > + * the session lock is taken on each relation. > + */ > + switch (get_rel_relkind(relationOid)) > + { > + case RELKIND_RELATION: > + { > + /* > + * In the case of a relation, find all its > indexes > + * including toast indexes. > + */ > + Relation heapRelation = > heap_open(relationOid, > + > ShareUpdateExclusiveLock); > + > + /* Track this relation for session locks */ > + parentRelationIds = > lappend_oid(parentRelationIds, relationOid); > + > + /* Relation on which is based index cannot be > shared */ > + if (heapRelation->rd_rel->relisshared) > + ereport(ERROR, > + > (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), > + errmsg("concurrent > reindex is not supported for shared relations"))); > + > + /* Add all the valid indexes of relation to > list */ > + foreach(lc2, RelationGetIndexList(heapRelation)) Hm. This means we will not notice having about-to-be dropped indexes around. Which seems safe because locks will prevent that anyway... > + default: > + /* nothing to do */ > + break; Shouldn't we error out? > + foreach(lc, indexIds) > + { > + Relation indexRel; > + Oid indOid = lfirst_oid(lc); > + Oid concurrentOid = lfirst_oid(lc2); > + bool primary; > + > + /* Move to next concurrent item */ > + lc2 = lnext(lc2); forboth() > + /* > + * Phase 3 of REINDEX CONCURRENTLY > + * > + * During this phase the concurrent indexes catch up with the INSERT > that > + * might have occurred in the parent table and are marked as valid once > done. > + * > + * We once again wait until no transaction can have the table open with > + * the index marked as read-only for updates. Each index validation is > done > + * with a separate transaction to avoid opening transaction for an > + * unnecessary too long time. 
> + */ Maybe I am being dumb because I have the feeling I said differently in the past, but why do we not need a WaitForMultipleVirtualLocks() here? The comment seems to say we need to do so. > + /* > + * Perform a scan of each concurrent index with the heap, then insert > + * any missing index entries. > + */ > + foreach(lc, concurrentIndexIds) > + { > + Oid indOid = lfirst_oid(lc); > + Oid relOid; > + > + /* Open separate transaction to validate index */ > + StartTransactionCommand(); > + > + /* Get the parent relation Oid */ > + relOid = IndexGetRelation(indOid, false); > + > + /* > + * Take the reference snapshot that will be used for the > concurrent indexes > + * validation. > + */ > + snapshot = RegisterSnapshot(GetTransactionSnapshot()); > + PushActiveSnapshot(snapshot); > + > + /* Validate index, which might be a toast */ > + validate_index(relOid, indOid, snapshot); > + > + /* > + * This concurrent index is now valid as they contain all the > tuples > + * necessary. However, it might not have taken into account > deleted tuples > + * before the reference snapshot was taken, so we need to wait > for the > + * transactions that might have older snapshots than ours. > + */ > + WaitForOldSnapshots(snapshot); > + > + /* > + * Concurrent index can now be marked as valid -- update > pg_index > + * entries. > + */ > + index_set_state_flags(indOid, INDEX_CREATE_SET_VALID); > + > + /* > + * The pg_index update will cause backends to update its > entries for the > + * concurrent index but it is necessary to do the same thing > for cache. 
> + */ > + CacheInvalidateRelcacheByRelid(relOid); > + > + /* we can now do away with our active snapshot */ > + PopActiveSnapshot(); > + > + /* And we can remove the validating snapshot too */ > + UnregisterSnapshot(snapshot); > + > + /* Commit this transaction to make the concurrent index valid */ > + CommitTransactionCommand(); > + } > + /* > + * Phase 5 of REINDEX CONCURRENTLY > + * > + * The concurrent indexes now hold the old relfilenode of the other > indexes > + * transactions that might use them. Each operation is performed with a > + * separate transaction. > + */ > + > + /* Now mark the concurrent indexes as not ready */ > + foreach(lc, concurrentIndexIds) > + { > + Oid indOid = lfirst_oid(lc); > + Oid relOid; > + > + StartTransactionCommand(); > + relOid = IndexGetRelation(indOid, false); > + > + /* > + * Finish the index invalidation and set it as dead. It is not > + * necessary to wait for virtual locks on the parent relation > as it > + * is already sure that this session holds sufficient locks.s > + */ tiny typo (lock.s) > + /* > + * Phase 6 of REINDEX CONCURRENTLY > + * > + * Drop the concurrent indexes. This needs to be done through > + * performDeletion or related dependencies will not be dropped for the > old > + * indexes. The internal mechanism of DROP INDEX CONCURRENTLY is not > used > + * as here the indexes are already considered as dead and invalid, so > they > + * will not be used by other backends. > + */ > + foreach(lc, concurrentIndexIds) > + { > + Oid indexOid = lfirst_oid(lc); > + > + /* Start transaction to drop this index */ > + StartTransactionCommand(); > + > + /* Get fresh snapshot for next step */ > + PushActiveSnapshot(GetTransactionSnapshot()); > + > + /* > + * Open transaction if necessary, for the first index treated > its > + * transaction has been already opened previously. > + */ > + index_concurrent_drop(indexOid); > + > + /* > + * For the last index to be treated, do not commit transaction > yet. 
> + * This will be done once all the locks on indexes and parent > relations > + * are released. > + */ Hm. This doesn't seem to commit the last transaction at all right now? Not sure why UnlockRelationIdForSession needs to be run in a transaction anyway? > + if (indexOid != llast_oid(concurrentIndexIds)) > + { > + /* We can do away with our snapshot */ > + PopActiveSnapshot(); > + > + /* Commit this transaction to make the update visible. > */ > + CommitTransactionCommand(); > + } > + } > + > + /* > + * Last thing to do is release the session-level lock on the parent > table > + * and the indexes of table. > + */ > + foreach(lc, relationLocks) > + { > + LockRelId lockRel = * (LockRelId *) lfirst(lc); > + UnlockRelationIdForSession(&lockRel, ShareUpdateExclusiveLock); > + } > + > + return true; > +} > + > + > + /* > + * Check the case of a system index that might have been invalidated by > a > + * failed concurrent process and allow its drop. > + */ This is only possible for toast indexes right now, right? If so, the comment should mention that. > + if (IsSystemClass(classform) && > + relkind == RELKIND_INDEX) > + { > + HeapTuple locTuple; > + Form_pg_index indexform; > + bool indisvalid; > + > + locTuple = SearchSysCache1(INDEXRELID, > ObjectIdGetDatum(state->heapOid)); > + if (!HeapTupleIsValid(locTuple)) > + { > + ReleaseSysCache(tuple); > + return; > + } > + > + indexform = (Form_pg_index) GETSTRUCT(locTuple); > + indisvalid = indexform->indisvalid; > + ReleaseSysCache(locTuple); > + > + /* Leave if index entry is not valid */ > + if (!indisvalid) > + { > + ReleaseSysCache(tuple); > + return; > + } > + } > + Ok, that's what I have for now... Greetings, Andres Freund -- Andres Freund http://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Training & Services -- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers