Re: [HACKERS] advance local xmin more aggressively
On 01/15/2015 09:57 PM, Robert Haas wrote: On Thu, Jan 15, 2015 at 3:08 AM, Michael Paquier wrote: On Mon, Dec 22, 2014 at 7:31 PM, Heikki Linnakangas wrote: Here's an updated version, rebased over the pairing heap code that I just committed, and fixing those bugs. So, are we reaching an outcome for the match happening here? Well, I still like using the existing ResourceOwner pointers to find the snapshots more than introducing a separate data structure just for that. But I'm OK with Heikki committing his version and, if anybody notices the new code becoming a hotspot, we can revisit the issue. Ok, committed. - Heikki -- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers
Re: [HACKERS] advance local xmin more aggressively
On Thu, Jan 15, 2015 at 3:08 AM, Michael Paquier wrote: > On Mon, Dec 22, 2014 at 7:31 PM, Heikki Linnakangas > wrote: >> Here's an updated version, rebased over the pairing heap code that I just >> committed, and fixing those bugs. > So, are we reaching an outcome for the match happening here? Well, I still like using the existing ResourceOwner pointers to find the snapshots more than introducing a separate data structure just for that. But I'm OK with Heikki committing his version and, if anybody notices the new code becoming a hotspot, we can revisit the issue. -- Robert Haas EnterpriseDB: http://www.enterprisedb.com The Enterprise PostgreSQL Company -- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers
Re: [HACKERS] advance local xmin more aggressively
On Mon, Dec 22, 2014 at 7:31 PM, Heikki Linnakangas wrote: > Here's an updated version, rebased over the pairing heap code that I just > committed, and fixing those bugs. So, are we reaching an outcome for the match happening here? -- Michael -- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers
Re: [HACKERS] advance local xmin more aggressively
On 12/16/2014 10:41 PM, Jeff Janes wrote: On Wed, Dec 10, 2014 at 3:46 PM, Robert Haas wrote: On Wed, Dec 10, 2014 at 3:28 PM, Heikki Linnakangas wrote: Care to code it up? Here you are. That was quick. You need to add a semicolon to the end of line 20 in pairingheap.c. In addition to the semicolon, it doesn't build under cassert. There are some pairingheap_empty that need to be pairingheap_is_empty, and snapmgr.c needs an address of operator near line 355 and something is wrong in snapmgr.c near line 811. Here's an updated version, rebased over the pairing heap code that I just committed, and fixing those bugs. - Heikki commit 4f37313a5b173c2952aebc91c41c744dcc3cf2df Author: Heikki Linnakangas Date: Mon Dec 22 12:22:39 2014 +0200 Use pairing heap to keep registered snapshots in xmin-order. This allows us to advance the xmin in PGPROC more aggressively. diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c index d601efe..08d6d3d 100644 --- a/src/backend/utils/time/snapmgr.c +++ b/src/backend/utils/time/snapmgr.c @@ -46,6 +46,7 @@ #include "access/transam.h" #include "access/xact.h" +#include "lib/pairingheap.h" #include "miscadmin.h" #include "storage/predicate.h" #include "storage/proc.h" @@ -58,6 +59,12 @@ #include "utils/syscache.h" #include "utils/tqual.h" +/* Prototypes for local functions */ +static Snapshot CopySnapshot(Snapshot snapshot); +static void FreeSnapshot(Snapshot snapshot); +static void SnapshotResetXmin(void); +static int xmin_cmp(const pairingheap_node *a, const pairingheap_node *b, void *arg); + /* * CurrentSnapshot points to the only snapshot taken in transaction-snapshot @@ -122,14 +129,8 @@ typedef struct ActiveSnapshotElt /* Top of the stack of active snapshots */ static ActiveSnapshotElt *ActiveSnapshot = NULL; -/* - * How many snapshots is resowner.c tracking for us? - * - * Note: for now, a simple counter is enough. However, if we ever want to be - * smarter about advancing our MyPgXact->xmin we will need to be more - * sophisticated about this, perhaps keeping our own list of snapshots. - */ -static int RegisteredSnapshots = 0; +/* Snapshots registered with resowners. Ordered in a heap by xmin. */ +static pairingheap RegisteredSnapshots = { &xmin_cmp, NULL, NULL }; /* first GetTransactionSnapshot call in a transaction? */ bool FirstSnapshotSet = false; @@ -151,11 +152,6 @@ static Snapshot FirstXactSnapshot = NULL; static List *exportedSnapshots = NIL; -static Snapshot CopySnapshot(Snapshot snapshot); -static void FreeSnapshot(Snapshot snapshot); -static void SnapshotResetXmin(void); - - /* * GetTransactionSnapshot * Get the appropriate snapshot for a new query in a transaction. @@ -183,7 +179,7 @@ GetTransactionSnapshot(void) /* First call in transaction? */ if (!FirstSnapshotSet) { - Assert(RegisteredSnapshots == 0); + Assert(pairingheap_is_empty(&RegisteredSnapshots)); Assert(FirstXactSnapshot == NULL); /* @@ -205,7 +201,7 @@ GetTransactionSnapshot(void) FirstXactSnapshot = CurrentSnapshot; /* Mark it as "registered" in FirstXactSnapshot */ FirstXactSnapshot->regd_count++; - RegisteredSnapshots++; + pairingheap_add(&RegisteredSnapshots, &FirstXactSnapshot->ph_node); } else CurrentSnapshot = GetSnapshotData(&CurrentSnapshotData); @@ -350,7 +346,7 @@ SetTransactionSnapshot(Snapshot sourcesnap, TransactionId sourcexid) /* Caller should have checked this already */ Assert(!FirstSnapshotSet); - Assert(RegisteredSnapshots == 0); + Assert(pairingheap_is_empty(&RegisteredSnapshots)); Assert(FirstXactSnapshot == NULL); Assert(!HistoricSnapshotActive()); @@ -413,7 +409,7 @@ SetTransactionSnapshot(Snapshot sourcesnap, TransactionId sourcexid) FirstXactSnapshot = CurrentSnapshot; /* Mark it as "registered" in FirstXactSnapshot */ FirstXactSnapshot->regd_count++; - RegisteredSnapshots++; + pairingheap_add(&RegisteredSnapshots, &FirstXactSnapshot->ph_node); } FirstSnapshotSet = true; @@ -639,7 +635,8 @@ RegisterSnapshotOnOwner(Snapshot snapshot, ResourceOwner owner) snap->regd_count++; ResourceOwnerRememberSnapshot(owner, snap); - RegisteredSnapshots++; + if (snap->regd_count == 1) + pairingheap_add(&RegisteredSnapshots, &snap->ph_node); return snap; } @@ -671,11 +668,16 @@ UnregisterSnapshotFromOwner(Snapshot snapshot, ResourceOwner owner) return; Assert(snapshot->regd_count > 0); - Assert(RegisteredSnapshots > 0); + Assert(!pairingheap_is_empty(&RegisteredSnapshots)); ResourceOwnerForgetSnapshot(owner, snapshot); - RegisteredSnapshots--; - if (--snapshot->regd_count == 0 && snapshot->active_count == 0) + + snapshot->regd_count--; + + if (snapshot->regd_count == 0) + pairingheap_remove(&RegisteredSnapshots, &snapshot->ph_node); + + if (snapshot->regd_count == 0 && snapshot->active_count == 0) { FreeSnapshot(snapshot); SnapshotResetXmin(); @@ -683,17 +685,54 @@ UnregisterSnapshotFromOwne
Re: [HACKERS] advance local xmin more aggressively
On Wed, Dec 10, 2014 at 3:46 PM, Robert Haas wrote: > > On Wed, Dec 10, 2014 at 3:28 PM, Heikki Linnakangas > wrote: > >> Care to code it up? > > > > Here you are. > > That was quick. > > You need to add a semicolon to the end of line 20 in pairingheap.c. > In addition to the semicolon, it doesn't build under cassert. There are some pairingheap_empty that need to be pairingheap_is_empty, and snapmgr.c needs an address of operator near line 355 and something is wrong in snapmgr.c near line 811. Cheers, Jeff
Re: [HACKERS] advance local xmin more aggressively
On Wed, Dec 10, 2014 at 3:28 PM, Heikki Linnakangas wrote: >> Care to code it up? > > Here you are. That was quick. You need to add a semicolon to the end of line 20 in pairingheap.c. I wonder what the worst case for this is. I tried it on your destruction test case from before and it doesn't seem to cost anything material: your patch: 0m38.714s 0m34.424s 0m35.191s master: 0m35.260s 0m34.643s 0m34.945s my patch: 0m34.728s 0m34.619s 0m35.015s The first measurement with your patch is somewhat of an outlier, but that was also the first measurement I took, which might have thrown off the results. Basically, either your patch or my patch looks free from here. I'm kind of suspicious about that: it doesn't seem like the additional linked-list manipulation you're doing in RegisterSnapshotOnOwner ought to cost something, but maybe that function just isn't called enough for it to matter, at least on this test case. -- Robert Haas EnterpriseDB: http://www.enterprisedb.com The Enterprise PostgreSQL Company -- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers
Re: [HACKERS] advance local xmin more aggressively
On 12/10/2014 08:35 PM, Robert Haas wrote: On Wed, Dec 10, 2014 at 12:57 PM, Heikki Linnakangas wrote: Clever. Could we use that method in ResourceOwnerReleaseInternal and ResourceOwnerDelete, too? Might be best to have a ResourceOwnerWalk(resowner, callback) function for walking all resource owners in a tree, instead of one for walking the snapshots in them. Sure. It would be a little more complicated there since you want to stop when you get back to the starting point, but not too bad. But is that solving any actual problem? I thought that a transaction commit or abort in some special circumstances might call ResourceOwnerReleaseInternal on the top level, but I can't make it happen. The machinery in xact.c is too clever, and always releases the resource owners from the bottom up. And I can't find a way to create a deep resource owner tree in any other way. So I guess it's fine as it is. MemoryContextCheck and MemoryContextPrint also recurse, however. MemoryContextCheck is only enabled with --enable-cassert, but MemoryContextPrint is called when you run out of memory. That could turn a plain "out of memory" error into a stack overrun, triggering a server crash and restart. It occurs to me that the pairing heap I just posted in another thread (http://www.postgresql.org/message-id/54886bb8.9040...@vmware.com) would be a good fit for this. It's extremely cheap to insert to and to find the minimum item (O(1)), and the delete operation is O(log N), amortized. I didn't implement a delete operation, for removing a particular node, I only did delete-min, but it's basically the same code. Using a pairing heap for this might be overkill, but if we have that implementation anyway, the code in snapmgr.c to use it would be very simple, so I see little reason not to. It might even be simpler than your patch, because you wouldn't need to have the heuristics on whether to attempt updating the xmin; it would be cheap enough to always try it. Care to code it up? Here you are. - Heikki diff --git a/src/backend/lib/Makefile b/src/backend/lib/Makefile index 327a1bc..b24ece6 100644 --- a/src/backend/lib/Makefile +++ b/src/backend/lib/Makefile @@ -12,6 +12,6 @@ subdir = src/backend/lib top_builddir = ../../.. include $(top_builddir)/src/Makefile.global -OBJS = ilist.o binaryheap.o stringinfo.o +OBJS = ilist.o binaryheap.o pairingheap.o stringinfo.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/lib/pairingheap.c b/src/backend/lib/pairingheap.c new file mode 100644 index 000..06cd117 --- /dev/null +++ b/src/backend/lib/pairingheap.c @@ -0,0 +1,237 @@ +/*- + * + * pairingheap.c + * A Pairing Heap implementation + * + * Portions Copyright (c) 2012-2014, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/lib/pairingheap.c + * + *- + */ + +#include "postgres.h" + +#include + +#include "lib/pairingheap.h" + +static pairingheap_node * merge(pairingheap *heap, pairingheap_node *a, pairingheap_node *b) +static pairingheap_node *merge_children(pairingheap *heap, pairingheap_node *children); + +/* + * pairingheap_allocate + * + * Returns a pointer to a newly-allocated heap, with the heap property + * defined by the given comparator function, which will be invoked with the + * additional argument specified by 'arg'. + */ +pairingheap * +pairingheap_allocate(pairingheap_comparator compare, void *arg) +{ + pairingheap *heap; + + heap = (pairingheap *) palloc(sizeof(pairingheap)); + heap->ph_compare = compare; + heap->ph_arg = arg; + + heap->ph_root = NULL; + + return heap; +} + +/* + * pairingheap_free + * + * Releases memory used by the given pairingheap. + * + * Note: The items in the heap are not released! + */ +void +pairingheap_free(pairingheap *heap) +{ + pfree(heap); +} + + +/* A helper function to merge two subheaps into one. */ +static pairingheap_node * +merge(pairingheap *heap, pairingheap_node *a, pairingheap_node *b) +{ + if (a == NULL) + return b; + if (b == NULL) + return a; + + /* Put the larger of the items as a child of the smaller one */ + if (heap->ph_compare(a, b, heap->ph_arg) < 0) + { + pairingheap_node *tmp; + + tmp = a; + a = b; + b = tmp; + } + + if (a->first_child) + a->first_child->prev_or_parent = b; + b->prev_or_parent = a; + b->next_sibling = a->first_child; + a->first_child = b; + return a; +} + +/* + * pairingheap_add + * + * Adds the given datum to the heap in O(1) time. + */ +void +pairingheap_add(pairingheap *heap, pairingheap_node *d) +{ + d->first_child = NULL; + + /* Link the new item as a new tree */ + heap->ph_root = merge(heap, heap->ph_root, d); +} + +/* + * pairingheap_first + * + * Returns a pointer to the first (root, topmost) node in the heap + * without modifying the heap. The caller must ensure that this + * routine is not used on an empty heap. Always O(1
Re: [HACKERS] advance local xmin more aggressively
On Wed, Dec 10, 2014 at 12:57 PM, Heikki Linnakangas wrote: > Clever. Could we use that method in ResourceOwnerReleaseInternal and > ResourceOwnerDelete, too? Might be best to have a > ResourceOwnerWalk(resowner, callback) function for walking all resource > owners in a tree, instead of one for walking the snapshots in them. Sure. It would be a little more complicated there since you want to stop when you get back to the starting point, but not too bad. But is that solving any actual problem? >> 2. Instead of traversing the tree until we find an xmin equal to the >> one we're currently advertising, the code now traverses the entire >> tree each time it runs. However, it also keeps a record of how many >> times the oldest xmin occurred in the tree, which is decremented each >> time we unregister a snapshot with that xmin; the traversal doesn't >> run again until that count reaches 0. That fixed the performance >> regression on your test case. >> >> With a million subtransactions: >> >> master 34.464s 33.742s 34.317s >> advance-xmin 34.516s 34.069s 34.196s > > Well, you can still get the slowness back by running other stuff in the > background. I admit that it's a very obscure case, probably fine in > practice. I would still feel better if snapmgr.c did its own bookkeeping, > from an aesthetic point of view. In a heap, or even just a linked list, if > the performance characteristics of that is acceptable. > > It occurs to me that the pairing heap I just posted in another thread > (http://www.postgresql.org/message-id/54886bb8.9040...@vmware.com) would be > a good fit for this. It's extremely cheap to insert to and to find the > minimum item (O(1)), and the delete operation is O(log N), amortized. I > didn't implement a delete operation, for removing a particular node, I only > did delete-min, but it's basically the same code. Using a pairing heap for > this might be overkill, but if we have that implementation anyway, the code > in snapmgr.c to use it would be very simple, so I see little reason not to. > It might even be simpler than your patch, because you wouldn't need to have > the heuristics on whether to attempt updating the xmin; it would be cheap > enough to always try it. Care to code it up? -- Robert Haas EnterpriseDB: http://www.enterprisedb.com The Enterprise PostgreSQL Company -- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers
Re: [HACKERS] advance local xmin more aggressively
On 12/10/2014 06:56 PM, Robert Haas wrote: On Wed, Dec 10, 2014 at 9:49 AM, Robert Haas wrote: I guess this bears some further thought. I certainly don't like the fact that this makes the whole system crap out at a lower number of subtransactions than presently. The actual performance numbers don't bother me very much; I'm comfortable with the possibility that closing a cursor will be some modest percentage slower if you've got thousands of active savepoints. Here's a new version with two changes: 1. I changed the traversal of the resource owner tree to iterate instead of recurse. It now does a depth-first, pre-order traversal of the tree; when we reach the last child of a node, we follow its parent pointer to get back to where we were. That way, we don't need to keep anything on the stack. That fixed the crash at 100k cursors, but it was still 4x slower. Clever. Could we use that method in ResourceOwnerReleaseInternal and ResourceOwnerDelete, too? Might be best to have a ResourceOwnerWalk(resowner, callback) function for walking all resource owners in a tree, instead of one for walking the snapshots in them. 2. Instead of traversing the tree until we find an xmin equal to the one we're currently advertising, the code now traverses the entire tree each time it runs. However, it also keeps a record of how many times the oldest xmin occurred in the tree, which is decremented each time we unregister a snapshot with that xmin; the traversal doesn't run again until that count reaches 0. That fixed the performance regression on your test case. With a million subtransactions: master 34.464s 33.742s 34.317s advance-xmin 34.516s 34.069s 34.196s Well, you can still get the slowness back by running other stuff in the background. I admit that it's a very obscure case, probably fine in practice. I would still feel better if snapmgr.c did its own bookkeeping, from an aesthetic point of view. In a heap, or even just a linked list, if the performance characteristics of that is acceptable. It occurs to me that the pairing heap I just posted in another thread (http://www.postgresql.org/message-id/54886bb8.9040...@vmware.com) would be a good fit for this. It's extremely cheap to insert to and to find the minimum item (O(1)), and the delete operation is O(log N), amortized. I didn't implement a delete operation, for removing a particular node, I only did delete-min, but it's basically the same code. Using a pairing heap for this might be overkill, but if we have that implementation anyway, the code in snapmgr.c to use it would be very simple, so I see little reason not to. It might even be simpler than your patch, because you wouldn't need to have the heuristics on whether to attempt updating the xmin; it would be cheap enough to always try it. - Heikki -- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers
Re: [HACKERS] advance local xmin more aggressively
On Wed, Dec 10, 2014 at 9:49 AM, Robert Haas wrote: > I guess this bears some further thought. I certainly don't like the > fact that this makes the whole system crap out at a lower number of > subtransactions than presently. The actual performance numbers don't > bother me very much; I'm comfortable with the possibility that closing > a cursor will be some modest percentage slower if you've got thousands > of active savepoints. Here's a new version with two changes: 1. I changed the traversal of the resource owner tree to iterate instead of recurse. It now does a depth-first, pre-order traversal of the tree; when we reach the last child of a node, we follow its parent pointer to get back to where we were. That way, we don't need to keep anything on the stack. That fixed the crash at 100k cursors, but it was still 4x slower. 2. Instead of traversing the tree until we find an xmin equal to the one we're currently advertising, the code now traverses the entire tree each time it runs. However, it also keeps a record of how many times the oldest xmin occurred in the tree, which is decremented each time we unregister a snapshot with that xmin; the traversal doesn't run again until that count reaches 0. That fixed the performance regression on your test case. With a million subtransactions: master 34.464s 33.742s 34.317s advance-xmin 34.516s 34.069s 34.196s -- Robert Haas EnterpriseDB: http://www.enterprisedb.com The Enterprise PostgreSQL Company diff --git a/src/backend/utils/resowner/resowner.c b/src/backend/utils/resowner/resowner.c index 0955bcc..529209f 100644 --- a/src/backend/utils/resowner/resowner.c +++ b/src/backend/utils/resowner/resowner.c @@ -21,6 +21,7 @@ #include "postgres.h" #include "access/hash.h" +#include "miscadmin.h" #include "storage/predicate.h" #include "storage/proc.h" #include "utils/memutils.h" @@ -113,6 +114,7 @@ typedef struct ResourceOwnerData ResourceOwner CurrentResourceOwner = NULL; ResourceOwner CurTransactionResourceOwner = NULL; ResourceOwner TopTransactionResourceOwner = NULL; +ResourceOwner FirstRootResourceOwner = NULL; /* * List of add-on callbacks for resource releasing @@ -167,6 +169,11 @@ ResourceOwnerCreate(ResourceOwner parent, const char *name) owner->nextchild = parent->firstchild; parent->firstchild = owner; } + else + { + owner->nextchild = FirstRootResourceOwner; + FirstRootResourceOwner = owner; + } return owner; } @@ -442,6 +449,8 @@ ResourceOwnerDelete(ResourceOwner owner) * the owner tree. Better a leak than a crash. */ ResourceOwnerNewParent(owner, NULL); + Assert(FirstRootResourceOwner == owner); + FirstRootResourceOwner = owner->nextchild; /* And free the object. */ if (owner->buffers) @@ -502,6 +511,14 @@ ResourceOwnerNewParent(ResourceOwner owner, } } } + else + { + ResourceOwner *link = &FirstRootResourceOwner; + + while (*link != owner) + link = &((*link)->nextchild); + *link = owner->nextchild; + } if (newparent) { @@ -513,7 +530,8 @@ ResourceOwnerNewParent(ResourceOwner owner, else { owner->parent = NULL; - owner->nextchild = NULL; + owner->nextchild = FirstRootResourceOwner; + FirstRootResourceOwner = owner; } } @@ -1161,6 +1179,59 @@ ResourceOwnerForgetSnapshot(ResourceOwner owner, Snapshot snapshot) } /* + * Invoke a caller-supplied function on every snapshot registered with any + * resource owner in the system. The callback can abort the traversal by + * returning true. + */ +bool +ResourceOwnerWalkAllSnapshots(ResourceWalkSnapshotCallback callback, void *arg) +{ + ResourceOwner owner = FirstRootResourceOwner; + bool visited = false; + + /* + * We do this traversal non-recursively in order to avoid running out of + * stack space, which can otherwise happen with large numbers of nested + * subtransactions. The easiest way to do that is to search depth-first, + * so that we visit all of a given node's descendents before reaching its + * right sibling. When we've visited all of the node's descendents, we'll + * follow the last child's parent pointer back to that node, but visited + * will be set to true at that point, so we'll advance to the right + * sibling instead of traversing it again. + */ + while (owner != NULL) + { + if (!visited) + { + int i; + + for (i = 0; i < owner->nsnapshots; ++i) +if (callback(owner->snapshots[i], arg)) + return true; + + if (owner->firstchild != NULL) + { +owner = owner->firstchild; +continue; + } + } + + if (owner->nextchild != NULL) + { + owner = owner->nextchild; + visited = false; + } + else + { + owner = owner->parent; + visited = true; + } + } + + return false; +} + +/* * Debugging subroutine */ static void diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c index d601efe..1fd73c8 100644 --- a/src/backend/utils/time/snapmgr.c +++ b/src/backend/utils/time/snapmgr.c @@ -122,14 +122,28 @@ typedef struct ActiveSnapshotElt /* Top
Re: [HACKERS] advance local xmin more aggressively
On Wed, Dec 10, 2014 at 7:34 AM, Heikki Linnakangas wrote: >>> 1. I don't really see why resource owners shouldn't be traversed like >>> that. They are clearly intended to form a hierarchy, and there's >>> existing code that recurses through the hierarchy from a given level >>> downward. What's ugly about that? > > I can't exactly point my finger on it, but it just feels wrong from a > modularity point of view. Their purpose is to make sure that we don't leak > resources on abort, by allowing easy an "release everything" operation. It's > not designed for finding objects based on some other criteria. > > There is similar double bookkeeping of many other things that are tracked by > resource owners. Heavy-weight locks are tracked by LOCALLOCK structs, buffer > pins in PrivateRefCount array etc. Those things existed before resource > owners were invented, but if we were starting from scratch, that design > would still make sense, as different objects have different access criteria. > fd.c needs to be able to find the least-recently-used open file, for > example, and you need to find the snapshot with the lowest xmin. I think all of these are performance trade-offs rather than questions of style. LOCALLOCK structs and private buffer pin tracking existed before resource owners because we need to do lookups of locks by lock tag or buffers by buffer number frequently enough that, if we had to grovel trough all the locks or buffer pins in the system without any indexing, it would be slow. We *could* do it that way now that we have resource owners, but it's pretty obvious that it would suck. That's less obvious here. If we throw all of the registered snapshots into a binary heap, finding the lowest xmin will be cheaper every time we do it, but we'll have the overhead of maintaining that binary heap even if it never gets used. > I think you're confusing the N and the N above. It's true that deleting a > resource owner is O(M), where the M is the number of children of that > resource owner. It does not follow that releasing N resource owners is > O(N^2), where N is the number of resource owners released. Calling > ResourceOwnerDelete(x) will only visit each resource owner in that tree > once, so it's O(N), where N is the total number of resource owners in the > tree. Not if the portals are dropped in separate commands with an unpredictable ordering. Then you have N separate calls to ResourceOwnerDelete. > I did some testing of the worst case scenario. The attached script first > creates a lot of cursors, then a lot of savepoints, and finally closes the > cursors in FIFO order. When the number of savepoints is high enough, this > actually segfaults with your patch, because you run out of stack space when > recursing the subxact resource owners. That's hardly this patch's fault, I'm > actually surprised it doesn't crash without it, because we recurse into all > resource owners in ResourceOwnerRelease too. Apparently the subxacts are > closed in LIFO order at commit, but there might be are other cases where you > could trigger that. In any case, a stack-depth check would be nice. Thanks for the test case; that's helpful. I added a stack depth check, but it doesn't help much: ERROR: stack depth limit exceeded HINT: Increase the configuration parameter "max_stack_depth" (currently 2048kB), after ensuring the platform's stack depth limit is adequate. STATEMENT: CLOSE cur0; ERROR: stack depth limit exceeded HINT: Increase the configuration parameter "max_stack_depth" (currently 2048kB), after ensuring the platform's stack depth limit is adequate. WARNING: AbortSubTransaction while in ABORT state ERROR: stack depth limit exceeded HINT: Increase the configuration parameter "max_stack_depth" (currently 2048kB), after ensuring the platform's stack depth limit is adequate. WARNING: AbortSubTransaction while in ABORT state ERROR: stack depth limit exceeded HINT: Increase the configuration parameter "max_stack_depth" (currently 2048kB), after ensuring the platform's stack depth limit is adequate. WARNING: AbortSubTransaction while in ABORT state ERROR: stack depth limit exceeded HINT: Increase the configuration parameter "max_stack_depth" (currently 2048kB), after ensuring the platform's stack depth limit is adequate. PANIC: ERRORDATA_STACK_SIZE exceeded That's at 100k subtransactions. I took some performance data for lower numbers of subtransactions: -- at 1000 subtransactions -- master 0.156s 0.157s 0.154s advance-xmin 0.177s 0.175s 0.177s -- at 1 subtransactions -- master 0.458s 0.457s 0.456s advance-xmin 0.639s 0.637s 0.633 I guess this bears some further thought. I certainly don't like the fact that this makes the whole system crap out at a lower number of subtransactions than presently. The actual performance numbers don't bother me very much; I'm comfortable with the possibility that closing a cursor will be some modest percentage slower if you've got thousands of active savepoints.
Re: [HACKERS] advance local xmin more aggressively
On 12/09/2014 10:35 PM, Robert Haas wrote: On Mon, Dec 8, 2014 at 9:31 AM, Robert Haas wrote: On Mon, Dec 8, 2014 at 4:56 AM, Heikki Linnakangas wrote: I don't immediately see the problem either, but I have to say that grovelling through all the resource owners seems ugly anyway. Resource owners are not meant to be traversed like that. And there could be a lot of them, and you have to visit every one of them. That could get expensive if there are a lot of resource owners. 1. I don't really see why resource owners shouldn't be traversed like that. They are clearly intended to form a hierarchy, and there's existing code that recurses through the hierarchy from a given level downward. What's ugly about that? I can't exactly point my finger on it, but it just feels wrong from a modularity point of view. Their purpose is to make sure that we don't leak resources on abort, by allowing easy an "release everything" operation. It's not designed for finding objects based on some other criteria. There is similar double bookkeeping of many other things that are tracked by resource owners. Heavy-weight locks are tracked by LOCALLOCK structs, buffer pins in PrivateRefCount array etc. Those things existed before resource owners were invented, but if we were starting from scratch, that design would still make sense, as different objects have different access criteria. fd.c needs to be able to find the least-recently-used open file, for example, and you need to find the snapshot with the lowest xmin. Upthread, I suggested keeping a tally of the number of snapshots with the advertised xmin and recomputing the xmin to advertise only when it reaches 0. This patch doesn't implementation that optimization, but it does have code that aborts the traversal of the resource owner hierarchy as soon as we see an xmin that will preclude advancing our advertised xmin. Releasing N resource owners could therefore cost O(N^2) in the worst case, but note that releasing N resource owners is *already* an O(N^2) operation in the worst case, because the list of children of a particular parent resource owner is singly linked, and thus deleting a resource owner is O(N). It's been that way for an awfully long time without anyone complaining, probably because (a) it's not particularly common to have large numbers of cursors open simultaneously and (b) even if you do have that, the constant factor is pretty low. I think you're confusing the N and the N above. It's true that deleting a resource owner is O(M), where the M is the number of children of that resource owner. It does not follow that releasing N resource owners is O(N^2), where N is the number of resource owners released. Calling ResourceOwnerDelete(x) will only visit each resource owner in that tree once, so it's O(N), where N is the total number of resource owners in the tree. I did some testing of the worst case scenario. The attached script first creates a lot of cursors, then a lot of savepoints, and finally closes the cursors in FIFO order. When the number of savepoints is high enough, this actually segfaults with your patch, because you run out of stack space when recursing the subxact resource owners. That's hardly this patch's fault, I'm actually surprised it doesn't crash without it, because we recurse into all resource owners in ResourceOwnerRelease too. Apparently the subxacts are closed in LIFO order at commit, but there might be are other cases where you could trigger that. In any case, a stack-depth check would be nice. - Heikki cursors.sh Description: Bourne shell script -- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers
Re: [HACKERS] advance local xmin more aggressively
On Mon, Dec 8, 2014 at 9:31 AM, Robert Haas wrote: > On Mon, Dec 8, 2014 at 4:56 AM, Heikki Linnakangas > wrote: >> I don't immediately see the problem either, but I have to say that >> grovelling through all the resource owners seems ugly anyway. Resource >> owners are not meant to be traversed like that. And there could be a lot of >> them, and you have to visit every one of them. That could get expensive if >> there are a lot of resource owners. > > 1. I don't really see why resource owners shouldn't be traversed like > that. They are clearly intended to form a hierarchy, and there's > existing code that recurses through the hierarchy from a given level > downward. What's ugly about that? Here's a patch. I looked at the issue of tracking parent-less resource owners a bit more closely. It turns out that resource owners are always created in TopMemoryContext "since they should only be freed explicitly" (cf. resowner.c). I was a bit worried about that, because it would be bad to keep a list of all parent-less resource owners if list elements could vanish in a context reset. But that doesn't seem to be a concern. So I stole the nextchild links of parent-less resource owners, which are not used for anything currently, to keep a list of such resource owners. It occurred to me that there's probably not much value in recomputing xmin when the active snapshot stack is non-empty. It's not impossible that a PL/pgsql function could close a cursor with an old xmin and then do lots of other work (or just sleep for a long time) before returning to the top-level, but it is pretty unlikely. So the attached patch only considers recomputing the advertised xmin when the active snapshot stack is empty. That'll happen at the end of each statement, which seems soon enough. Upthread, I suggested keeping a tally of the number of snapshots with the advertised xmin and recomputing the xmin to advertise only when it reaches 0. This patch doesn't implementation that optimization, but it does have code that aborts the traversal of the resource owner hierarchy as soon as we see an xmin that will preclude advancing our advertised xmin. Releasing N resource owners could therefore cost O(N^2) in the worst case, but note that releasing N resource owners is *already* an O(N^2) operation in the worst case, because the list of children of a particular parent resource owner is singly linked, and thus deleting a resource owner is O(N). It's been that way for an awfully long time without anyone complaining, probably because (a) it's not particularly common to have large numbers of cursors open simultaneously and (b) even if you do have that, the constant factor is pretty low. Following Heikki's previous suggestion, the patch contains checks to make sure that we find exactly the number of registered snapshots that we expect to find. We could consider demoting these to asserts or something, but this is more likely to catch bugs if there are any. -- Robert Haas EnterpriseDB: http://www.enterprisedb.com The Enterprise PostgreSQL Company diff --git a/src/backend/utils/resowner/resowner.c b/src/backend/utils/resowner/resowner.c index 0955bcc..186fa91 100644 --- a/src/backend/utils/resowner/resowner.c +++ b/src/backend/utils/resowner/resowner.c @@ -113,6 +113,7 @@ typedef struct ResourceOwnerData ResourceOwner CurrentResourceOwner = NULL; ResourceOwner CurTransactionResourceOwner = NULL; ResourceOwner TopTransactionResourceOwner = NULL; +ResourceOwner FirstRootResourceOwner = NULL; /* * List of add-on callbacks for resource releasing @@ -167,6 +168,11 @@ ResourceOwnerCreate(ResourceOwner parent, const char *name) owner->nextchild = parent->firstchild; parent->firstchild = owner; } + else + { + owner->nextchild = FirstRootResourceOwner; + FirstRootResourceOwner = owner; + } return owner; } @@ -442,6 +448,8 @@ ResourceOwnerDelete(ResourceOwner owner) * the owner tree. Better a leak than a crash. */ ResourceOwnerNewParent(owner, NULL); + Assert(FirstRootResourceOwner == owner); + FirstRootResourceOwner = owner->nextchild; /* And free the object. */ if (owner->buffers) @@ -502,6 +510,14 @@ ResourceOwnerNewParent(ResourceOwner owner, } } } + else + { + ResourceOwner *link = &FirstRootResourceOwner; + + while (*link != owner) + link = &((*link)->nextchild); + *link = owner->nextchild; + } if (newparent) { @@ -513,7 +529,8 @@ ResourceOwnerNewParent(ResourceOwner owner, else { owner->parent = NULL; - owner->nextchild = NULL; + owner->nextchild = FirstRootResourceOwner; + FirstRootResourceOwner = owner; } } @@ -1161,6 +1178,50 @@ ResourceOwnerForgetSnapshot(ResourceOwner owner, Snapshot snapshot) } /* + * Invoke a caller-supplied function on every snapshot registered with this + * resource owner or any descendent resource owner. The callback can abort + * the traversal by returning true. + */ +bool +ResourceOwnerWalkSnapshots(ResourceOwner owner, + Reso
Re: [HACKERS] advance local xmin more aggressively
On Mon, Dec 8, 2014 at 4:56 AM, Heikki Linnakangas wrote: > I don't immediately see the problem either, but I have to say that > grovelling through all the resource owners seems ugly anyway. Resource > owners are not meant to be traversed like that. And there could be a lot of > them, and you have to visit every one of them. That could get expensive if > there are a lot of resource owners. 1. I don't really see why resource owners shouldn't be traversed like that. They are clearly intended to form a hierarchy, and there's existing code that recurses through the hierarchy from a given level downward. What's ugly about that? 2. If you have a lot of resource owners, you probably have a lot of snapshots, so walking a list will be expensive, too. It will be disproportionately expensive to walk the resource owner tree only if there are lots of resource owners but very few of them have any snapshots. But I don't think that can really happen. If you've got lots of resource owners and each one has a snapshot, you'll traverse ~3 pointers per snapshot: ~1 to find the next ResourceOwner, 1 to find the snapshot array, and 1 to reach the snapshot itself. A non-inlined list would traverse only 2 pointers per snapshot, but that doesn't seem like enough of a difference to get excited about. > BTW, you could easily detect that you haven't seen all the registered > snapshots, after traversing the resource owner, as we keep the counter of > them. So you could just fall back to not advancing the xmin if it happens. Not a bad idea. Or we could elog(FATAL) or fail an assertion if we don't see them all, and then if it happens we call it a bug and fix it. > I would prefer doing separate bookkeeping in snapmgr.c. It seems like it > wouldn't be difficult to do. It has to be cheap, but I don't see any reason > to believe that it'd be more expensive than traversing through all resource > owners. A binary heap or some other priority queue implementation should > work pretty well for this. That's optimizing for making the xmin recomputation cheap, but we don't expect xmin recomputation to happen very often, so I'm not sure that's the right trade-off. > And that optimization won't save you in the cases where it doesn't apply. > For example, what if snapshots happen to form a queue, rather than a stack: > > DECLARE c1 CURSOR FOR ...; > DECLARE c2 CURSOR FOR ...; > DECLARE c3 CURSOR FOR ...; > ... > DECLARE c1000 CURSOR FOR ...; > > CLOSE c1; > CLOSE c2; > CLOSE c3; > ... > CLOSE c1000; > > It's not hard to imagine an application doing that. Sure, you could rewrite > it to close the cursors in different order, but on the face of it that's not > an unreasonable thing for an application to do. I think we should avoid the > O(n^2) behaviour in that case. You won't actually get O(n^2) behavior here unless those DECLARE CURSOR statements all get snapshots with different xmins; if many of them have snapshots that share an xmin, then the optimization of recomputing only when there are no snaps with a given xmin will save you. That's a bit pathological, but maybe we should try to cater to it. -- Robert Haas EnterpriseDB: http://www.enterprisedb.com The Enterprise PostgreSQL Company -- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers
Re: [HACKERS] advance local xmin more aggressively
On 12/05/2014 05:05 PM, Robert Haas wrote: [ reviving a very old thread ] On Tue, Feb 10, 2009 at 4:11 PM, Tom Lane wrote: Alvaro Herrera writes: For example, maybe we could keep track of counts of snapshots removed since the last xmin calculation, and only run this routine if the number is different from zero (or some small positive integer). I think most of the callers of SnapshotResetXmin already know they removed something. It might be interesting for FreeSnapshot or something nearby to note whether the snapshot being killed has xmin = proc's xmin, and only do the update calculation if so. I still dislike the assumption that all resource owners are children of a known owner. I suspect in fact that it's demonstrably wrong right now, let alone in future (cf comments in PortalRun). If we're going to do this then snapmgr.c needs to track the snapshots for itself. Of course that's going to make the "is it worth it" question even more pressing. I've run into essentially the same problem Jeff originally complained about with a large customer who has long-running transactions that make extensive use of cursors. Cursors are opened and closed over time but it is rare for the number open to reach exactly zero, so what ends up happening is that the backend xmin does not advance. As you can imagine, that doesn't work out well. The approach I came up with initially was similar to Jeff's: start at the topmost resource owner and traverse them all, visiting every snapshot along the way. But then I found this thread and saw the comment that this might be "demonstrably wrong" and referring to the comments in PortalRun. Having reviewed those comments, which don't seem to have changed much in the last five years, I can't understand how they related to this issue. It's true that the TopTransaction resource owner could get swapped out under us during an internal commit, but why should SnapshotResetXmin() have to care? It just traverses the one that is in effect at the time it gets called. The only real danger I see here is that there could be *more than one* toplevel resource owner. I wonder if we could solve that problem by adding a registry of active toplevel resource owners, so that if we have a forest rather than a tree we can still find everything. I don't immediately see the problem either, but I have to say that grovelling through all the resource owners seems ugly anyway. Resource owners are not meant to be traversed like that. And there could be a lot of them, and you have to visit every one of them. That could get expensive if there are a lot of resource owners. BTW, you could easily detect that you haven't seen all the registered snapshots, after traversing the resource owner, as we keep the counter of them. So you could just fall back to not advancing the xmin if it happens. The problem I see with having snapmgr.c track the snapshots for itself is that it is mostly duplicating of bookkeeping which is already being done. Since this problem doesn't affect the majority of users, it's not desirable to add a lot of extra bookkeeping to cater to it - but even if it did affect a lot of users, we still want it to be as cheap as possible, and reusing the tracking that resource owners are already doing seems like the way to get there. I would prefer doing separate bookkeeping in snapmgr.c. It seems like it wouldn't be difficult to do. It has to be cheap, but I don't see any reason to believe that it'd be more expensive than traversing through all resource owners. A binary heap or some other priority queue implementation should work pretty well for this. I think there are a couple of things we can do to make this cheaper. Suppose we keep track of the oldest xmin of any registered snapshot and the number of registered snapshots that have that particular xmin. Every time we deregister a snapshot, we check whether it is one of the ones with the minimum xmin; if it is, we decrement the count. When the count reaches 0, we know that a traversal of all registered snapshots is guaranteed to find a newer value to advertise in MyProc->xmin; that way, we can arrange to do the work only when it's likely to pay off. In most cases this won't happen until the last snapshot is unregistered, because our snapshots normally form a stack, with newer snapshots having been taken later. But if somebody unregisters the oldest snapshot we'll immediately notice and recalculate. Yeah, that's a reasonable optimization. It's a reasonable optimization even if you do the bookkeeping in snapmgr.c. And that optimization won't save you in the cases where it doesn't apply. For example, what if snapshots happen to form a queue, rather than a stack: DECLARE c1 CURSOR FOR ...; DECLARE c2 CURSOR FOR ...; DECLARE c3 CURSOR FOR ...; ... DECLARE c1000 CURSOR FOR ...; CLOSE c1; CLOSE c2; CLOSE c3; ... CLOSE c1000; It's not hard to imagine an application doing that. Sure, you could rewrite it to close the
Re: [HACKERS] advance local xmin more aggressively
[ reviving a very old thread ] On Tue, Feb 10, 2009 at 4:11 PM, Tom Lane wrote: > Alvaro Herrera writes: >> For example, maybe we could keep track of counts of snapshots removed >> since the last xmin calculation, and only run this routine if the number >> is different from zero (or some small positive integer). > > I think most of the callers of SnapshotResetXmin already know they > removed something. > > It might be interesting for FreeSnapshot or something nearby to note > whether the snapshot being killed has xmin = proc's xmin, and only do > the update calculation if so. > > I still dislike the assumption that all resource owners are children of > a known owner. I suspect in fact that it's demonstrably wrong right > now, let alone in future (cf comments in PortalRun). If we're going to > do this then snapmgr.c needs to track the snapshots for itself. Of > course that's going to make the "is it worth it" question even more > pressing. I've run into essentially the same problem Jeff originally complained about with a large customer who has long-running transactions that make extensive use of cursors. Cursors are opened and closed over time but it is rare for the number open to reach exactly zero, so what ends up happening is that the backend xmin does not advance. As you can imagine, that doesn't work out well. The approach I came up with initially was similar to Jeff's: start at the topmost resource owner and traverse them all, visiting every snapshot along the way. But then I found this thread and saw the comment that this might be "demonstrably wrong" and referring to the comments in PortalRun. Having reviewed those comments, which don't seem to have changed much in the last five years, I can't understand how they related to this issue. It's true that the TopTransaction resource owner could get swapped out under us during an internal commit, but why should SnapshotResetXmin() have to care? It just traverses the one that is in effect at the time it gets called. The only real danger I see here is that there could be *more than one* toplevel resource owner. I wonder if we could solve that problem by adding a registry of active toplevel resource owners, so that if we have a forest rather than a tree we can still find everything. The problem I see with having snapmgr.c track the snapshots for itself is that it is mostly duplicating of bookkeeping which is already being done. Since this problem doesn't affect the majority of users, it's not desirable to add a lot of extra bookkeeping to cater to it - but even if it did affect a lot of users, we still want it to be as cheap as possible, and reusing the tracking that resource owners are already doing seems like the way to get there. I think there are a couple of things we can do to make this cheaper. Suppose we keep track of the oldest xmin of any registered snapshot and the number of registered snapshots that have that particular xmin. Every time we deregister a snapshot, we check whether it is one of the ones with the minimum xmin; if it is, we decrement the count. When the count reaches 0, we know that a traversal of all registered snapshots is guaranteed to find a newer value to advertise in MyProc->xmin; that way, we can arrange to do the work only when it's likely to pay off. In most cases this won't happen until the last snapshot is unregistered, because our snapshots normally form a stack, with newer snapshots having been taken later. But if somebody unregisters the oldest snapshot we'll immediately notice and recalculate. Thoughts? -- Robert Haas EnterpriseDB: http://www.enterprisedb.com The Enterprise PostgreSQL Company -- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers
Re: [HACKERS] advance local xmin more aggressively
Jeff Davis writes: > I could imagine some situations that this could be more useful in the > future. For instance, Hot Standby will increase the consequences of not > advancing xmin when it's possible to do so. The question wasn't really about the consequences; it was about whether there was any hope of this patch being able to advance xmin more than the code that's there, for common usage patterns. regards, tom lane -- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers
Re: [HACKERS] advance local xmin more aggressively
On Wed, 2009-02-11 at 12:20 -0500, Tom Lane wrote: > You pointed out the case of opening a cursor in a read-committed > transaction, and then closing it before end of transaction, as a > place where your patch could hope to improve matters. Are there > others? Could your application be made to close that cursor before > opening another one (so that its set of open snapshots momentarily > goes to zero)? It seems like the use case for more complex > bookkeeping for open snapshots is a tad narrow. > I'm approaching this from the perspective of our system at Truviso. I used the cursor example to illustrate the issue in normal postgresql, but our problem isn't directly related to cursors. I don't have a compelling use case right now for normal postgresql, because of the reasons you mention. However, at Truviso, we have to come up with some kind of solution to this anyway. Ideally, I'd like to make something that's acceptable to postgres in general (meaning no performance impact or code ugliness). I could imagine some situations that this could be more useful in the future. For instance, Hot Standby will increase the consequences of not advancing xmin when it's possible to do so. Regards, Jeff Davis -- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers
Re: [HACKERS] advance local xmin more aggressively
Jeff Davis writes: > On Wed, 2009-02-11 at 10:20 -0500, Robert Haas wrote: >> Can you clarify the circumstances in which this patch would show a >> benefit over the current system? > In the current code, if the process is always holding at least one > snapshot, the process' xmin never advances. Right, and the question is the scope of the circumstances in which that's the case and your patch makes things better. I believe that * a process outside a transaction has no snapshots, so your patch won't change anything * a process in a serializable transaction will hold the serializable snapshot till end of xact, so your patch won't change anything * a process in a read-committed transaction will typically hold snapshot(s) for the duration of each query, and then release them all, so your patch won't change anything You pointed out the case of opening a cursor in a read-committed transaction, and then closing it before end of transaction, as a place where your patch could hope to improve matters. Are there others? Could your application be made to close that cursor before opening another one (so that its set of open snapshots momentarily goes to zero)? It seems like the use case for more complex bookkeeping for open snapshots is a tad narrow. regards, tom lane -- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers
Re: [HACKERS] advance local xmin more aggressively
On Wed, 2009-02-11 at 10:20 -0500, Robert Haas wrote: > On Tue, Feb 10, 2009 at 3:06 PM, Jeff Davis wrote: > > With the new snapshot maintenance code, it looks like we can advance the > > xmin more aggressively. > > Can you clarify the circumstances in which this patch would show a > benefit over the current system? In the current code, if the process is always holding at least one snapshot, the process' xmin never advances. That means VACUUM will never be able to reclaim tuples visible during the first snapshot taken during the transaction. With the patch, as long as snapshots are being released, the process' xmin will keep advancing to reflect the oldest snapshot currently held by that process. In order to accomplish that, every time a snapshot is released I have to look at every snapshot that the process still holds to find the new local minimum xmin. The current code will only change the process' xmin if there are no snapshots at all. As Tom pointed out, one of the assumptions I made writing the patch is not always true. I am still trying to determine the implications of that. Regards, Jeff Davis -- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers
Re: [HACKERS] advance local xmin more aggressively
On Tue, Feb 10, 2009 at 3:06 PM, Jeff Davis wrote: > With the new snapshot maintenance code, it looks like we can advance the > xmin more aggressively. Can you clarify the circumstances in which this patch would show a benefit over the current system? ...Robert -- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers
Re: [HACKERS] advance local xmin more aggressively
Alvaro Herrera writes: > For example, maybe we could keep track of counts of snapshots removed > since the last xmin calculation, and only run this routine if the number > is different from zero (or some small positive integer). I think most of the callers of SnapshotResetXmin already know they removed something. It might be interesting for FreeSnapshot or something nearby to note whether the snapshot being killed has xmin = proc's xmin, and only do the update calculation if so. I still dislike the assumption that all resource owners are children of a known owner. I suspect in fact that it's demonstrably wrong right now, let alone in future (cf comments in PortalRun). If we're going to do this then snapmgr.c needs to track the snapshots for itself. Of course that's going to make the "is it worth it" question even more pressing. regards, tom lane -- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers
Re: [HACKERS] advance local xmin more aggressively
Tom Lane wrote: > Jeff Davis writes: > > With the new snapshot maintenance code, it looks like we can advance the > > xmin more aggressively. > > The original design for that contemplated having snapmgr.c track > all the snapshots (cf the comment for RegisteredSnapshots). I don't > particularly care for having it assume that it can find all the resource > owners. > > But really the more important problem is to demonstrate that you > actually get a benefit commensurate with the additional cycles spent. > IIRC the reason the code is the way it is is that we concluded that for > typical usage patterns there wouldn't be any win from tracking things > more aggressively. As somebody pointed out recently, SnapshotResetXmin > is called quite a lot; if it's expensive it's going to be a problem. I think Jeff is coming from the Truviso point of view: they have very long running transactions, and they normally have a number of snapshots that are always being updated, but it's rare that there are no snapshots at all. So this optimization actually buys a chance to update Xmin at all; with the current code, they keep the same xmin all the time because there's always some snapshot. I'm not sure if the best answer is to just state that Truviso should keep maintaining this patch privately. It would be, of course, much better to come up with a way to keep track of this in a cheaper way. For example, maybe we could keep track of counts of snapshots removed since the last xmin calculation, and only run this routine if the number is different from zero (or some small positive integer). -- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers
Re: [HACKERS] advance local xmin more aggressively
Jeff Davis writes: > With the new snapshot maintenance code, it looks like we can advance the > xmin more aggressively. The original design for that contemplated having snapmgr.c track all the snapshots (cf the comment for RegisteredSnapshots). I don't particularly care for having it assume that it can find all the resource owners. But really the more important problem is to demonstrate that you actually get a benefit commensurate with the additional cycles spent. IIRC the reason the code is the way it is is that we concluded that for typical usage patterns there wouldn't be any win from tracking things more aggressively. As somebody pointed out recently, SnapshotResetXmin is called quite a lot; if it's expensive it's going to be a problem. regards, tom lane -- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers
[HACKERS] advance local xmin more aggressively
With the new snapshot maintenance code, it looks like we can advance the xmin more aggressively. For instance: S1: INSERT INTO foo VALUES(1); S2: BEGIN; DECLARE c1 CURSOR FOR SELECT i FROM foo; S1: DELETE FROM foo; S2: DECLARE c2 CURSOR FOR SELECT i FROM foo; CLOSE c1; S1: VACUUM VERBOSE foo; The VACUUM should be able to clean up the deleted tuple, because it's no longer visible to anyone. Attached a small patch to accomplish this. I don't expect it to be put in 8.4, but it's small enough that I thought I should at least send it in just in case. Regards, Jeff Davis diff --git a/src/backend/utils/resowner/resowner.c b/src/backend/utils/resowner/resowner.c index 7b6e15b..06bf425 100644 --- a/src/backend/utils/resowner/resowner.c +++ b/src/backend/utils/resowner/resowner.c @@ -21,6 +21,7 @@ #include "postgres.h" #include "access/hash.h" +#include "access/transam.h" #include "storage/bufmgr.h" #include "storage/proc.h" #include "utils/memutils.h" @@ -1026,6 +1027,47 @@ ResourceOwnerForgetSnapshot(ResourceOwner owner, Snapshot snapshot) } /* + * Find the smallest xmin among all ResourceOwners under owner. + */ +TransactionId +ResourceOwnerSnapshotsMinXmin(ResourceOwner owner) +{ + ResourceOwner child; + Snapshot *snapshots = owner->snapshots; + int ns1 = owner->nsnapshots - 1; + int i; + TransactionId min_xmin = InvalidTransactionId; + + for (i = ns1; i >= 0; i--) + { + TransactionId xmin = snapshots[i]->xmin; + + if (!TransactionIdIsValid(xmin)) + continue; + + if (!TransactionIdIsValid(min_xmin) || + TransactionIdPrecedes(xmin, min_xmin)) + min_xmin = xmin; + } + + /* Recurse to handle descendants */ + for (child = owner->firstchild; child != NULL; child = child->nextchild) + { + TransactionId xmin = ResourceOwnerSnapshotsMinXmin(child); + + if (!TransactionIdIsValid(xmin)) + continue; + + if (!TransactionIdIsValid(min_xmin) || + TransactionIdPrecedes(xmin, min_xmin)) + min_xmin = xmin; + } + + return min_xmin; +} + + +/* * Debugging subroutine */ static void diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c index 9992895..b7b0506 100644 --- a/src/backend/utils/time/snapmgr.c +++ b/src/backend/utils/time/snapmgr.c @@ -107,6 +107,7 @@ static boolregistered_serializable = false; static Snapshot CopySnapshot(Snapshot snapshot); static void FreeSnapshot(Snapshot snapshot); static void SnapshotResetXmin(void); +static TransactionId GetTrueLocalXmin(void); /* @@ -432,8 +433,53 @@ UnregisterSnapshotFromOwner(Snapshot snapshot, ResourceOwner owner) static void SnapshotResetXmin(void) { + TransactionId local_xmin; + if (RegisteredSnapshots == 0 && ActiveSnapshot == NULL) + { MyProc->xmin = InvalidTransactionId; + return; + } + + /* + * The transaction may have a snapshot but no xmin during abort + * when the transaction has a registered snapshot that is not + * active. + */ + if (!TransactionIdIsValid(MyProc->xmin)) + return; + + local_xmin = GetTrueLocalXmin(); + if (!TransactionIdIsValid(local_xmin) || + TransactionIdPrecedes(MyProc->xmin, local_xmin)) + MyProc->xmin = local_xmin; +} + +/* + * Returns the smallest xmin value in use by any of the active + * snapshots in the current process. + */ +static TransactionId +GetTrueLocalXmin(void) +{ + TransactionId min_xmin = InvalidTransactionId; + ActiveSnapshotElt *active_elt; + + min_xmin = ResourceOwnerSnapshotsMinXmin(TopTransactionResourceOwner); + + for (active_elt = ActiveSnapshot; active_elt != NULL; active_elt = active_elt->as_next) + { + TransactionId xmin = active_elt->as_snap->xmin; + + if (!TransactionIdIsValid(xmin)) + continue; + + if (!TransactionIdIsValid(min_xmin) || + TransactionIdPrecedes(xmin, min_xmin)) + min_xmin = xmin; + } + + return min_xmin; } /* @@ -458,7 +504,7 @@ AtSubCommit_Snapshot(int level) /* * AtSubAbort_Snapshot - * Clean up snapshots after a subtransaction abort + * Clean up snapshots after a subtransaction abort */ void AtSubAbort_Snapshot(int level) diff --git a/src/include/utils/resowner.h b/src/include/utils/resowner.h index 3f05bf4..f4e051e 100644 --- a/src/include/utils/resowner.h +++ b/src/include/utils/resowner.h @@ -128,5 +128,6 @@ extern void ResourceOwnerRememberSnapshot(ResourceOwner owner, Snapshot snapshot); extern void ResourceOwnerForgetSnapshot(ResourceOwner owner, Snapshot snapshot); +extern TransactionId ResourceOwnerSnapshotsMinXmin(ResourceOwner owner); #endif /* RESOWNER_H */ -- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers