Re: Implementing Incremental View Maintenance
On Tue, 20 Apr 2021 09:51:34 +0900 Yugo NAGATA wrote: > On Mon, 19 Apr 2021 17:40:31 -0400 > Tom Lane wrote: > > > Andrew Dunstan writes: > > > This patch (v22c) just crashed for me with an assertion failure on > > > Fedora 31. Here's the stack trace: > > > > > #2 0x0094a54a in ExceptionalCondition > > > (conditionName=conditionName@entry=0xa91dae "queryDesc->sourceText != > > > NULL", errorType=errorType@entry=0x99b468 "FailedAssertion", > > > fileName=fileName@entry=0xa91468 > > > "/home/andrew/pgl/pg_head/src/backend/executor/execMain.c", > > > lineNumber=lineNumber@entry=199) at > > > /home/andrew/pgl/pg_head/src/backend/utils/error/assert.c:69 > > > > That assert just got added a few days ago, so that's why the patch > > seemed OK before. > > Thank you for letting me know. I'll fix it. Attached is the fixed patch. queryDesc->sourceText cannot be NULL after commit b2668d8, so now we pass an empty string "" for refresh_matview_datafill() instead of NULL when maintaining views incrementally. Regards, Yugo Nagata -- Yugo NAGATA IVM_patches_v22d.tar.gz Description: application/gzip
Re: Use simplehash.h instead of dynahash in SMgr
On Mon, 26 Apr 2021 at 05:03, Yura Sokolov wrote: > If your test so sensitive to hash function speed, then I'd suggest > to try something even simpler: > > static inline uint32 > relfilenodebackend_hash(RelFileNodeBackend *rnode) > { > uint32 h = 0; > #define step(x) h ^= (uint32)(x) * 0x85ebca6b; h = pg_rotate_right(h, > 11); h *= 9; > step(rnode->node.relNode); > step(rnode->node.spcNode); // spcNode could be different for same > relNode only > // during table movement. Does it pay > to hash it? > step(rnode->node.dbNode); > step(rnode->backend); // does it matter to hash backend? >// It equals to InvalidBackendId for > non-temporary relations >// and temporary relations in same > database never have same >// relNode (have they?). > return murmurhash32(hashkey); > } I tried that and it got a median result of 113.795 seconds over 14 runs with this recovery benchmark test. LOG: size: 4096, members: 2032, filled: 0.496094, total chain: 1014, max chain: 6, avg chain: 0.499016, total_collisions: 428, max_collisions: 3, avg_collisions: 0.210630 I also tried the following hash function just to see how much performance might be left from speeding it up: static inline uint32 relfilenodebackend_hash(RelFileNodeBackend *rnode) { uint32 h; h = pg_rotate_right32((uint32) rnode->node.relNode, 16) ^ ((uint32) rnode->node.dbNode); return murmurhash32(h); } I got a median of 112.685 seconds over 14 runs with: LOG: size: 4096, members: 2032, filled: 0.496094, total chain: 1044, max chain: 7, avg chain: 0.513780, total_collisions: 438, max_collisions: 3, avg_collisions: 0.215551 So it looks like there might not be too much left given that v2 was 113.375 seconds (median over 10 runs) > I'd like to see benchmark code. It quite interesting this place became > measurable at all. Sure. 
$ cat recoverybench_insert_hash.sh #!/bin/bash pg_ctl stop -D pgdata -m smart pg_ctl start -D pgdata -l pg.log -w psql -f setup1.sql postgres > /dev/null psql -c "create table log_wal (lsn pg_lsn not null);" postgres > /dev/null psql -c "insert into log_wal values(pg_current_wal_lsn());" postgres > /dev/null psql -c "insert into hp select x,0 from generate_series(1,1) x;" postgres > /dev/null psql -c "insert into log_wal values(pg_current_wal_lsn());" postgres > /dev/null psql -c "select 'Used ' || pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_lsn(), lsn)) || ' of WAL' from log_wal limit 1;" postgres pg_ctl stop -D pgdata -m immediate -w echo Starting Postgres... pg_ctl start -D pgdata -l pg.log $ cat setup1.sql drop table if exists hp; create table hp (a int primary key, b int not null) partition by hash(a); select 'create table hp'||x|| ' partition of hp for values with (modulus 1000, remainder '||x||');' from generate_Series(0,999) x; \gexec config: shared_buffers = 10GB checkpoint_timeout = 60min max_wal_size = 20GB min_wal_size = 20GB For subsequent runs, if you apply the patch that does the PANIC at the end of recovery, you'll just need to start the database up again to perform recovery again. You can then just tail -f on your postgres logs to watch for the "redo done" message which will show you the time spent doing recovery. David.
Re: logical replication empty transactions
On Fri, Apr 23, 2021 at 3:46 PM Ajin Cherian wrote: > > > > On Mon, Apr 19, 2021 at 6:22 PM Peter Smith wrote: >> >> >> Here are a some review comments: >> >> -- >> >> 1. The patch v3 applied OK but with whitespace warnings >> >> [postgres@CentOS7-x64 oss_postgres_2PC]$ git apply >> ../patches_misc/v3-0001-Skip-empty-transactions-for-logical-replication.patch >> ../patches_misc/v3-0001-Skip-empty-transactions-for-logical-replication.patch:98: >> indent with spaces. >> /* output BEGIN if we haven't yet, avoid for streaming and >> non-transactional messages */ >> ../patches_misc/v3-0001-Skip-empty-transactions-for-logical-replication.patch:99: >> indent with spaces. >> if (!data->xact_wrote_changes && !in_streaming && transactional) >> warning: 2 lines add whitespace errors. >> >> -- > > > Fixed. > >> >> >> 2. Please create a CF entry in [1] for this patch. >> >> -- >> >> 3. Patch comment >> >> The comment describes the problem and then suddenly just says >> "Postpone the BEGIN message until the first change." >> >> I suggest changing it to say more like... "(blank line) This patch >> addresses the above problem by postponing the BEGIN message until the >> first change." >> >> -- >> > > Updated. > >> >> 4. pgoutput.h >> >> Maybe for consistency with the context member, the comment for the new >> member should be to the right instead of above it? >> >> @@ -20,6 +20,9 @@ typedef struct PGOutputData >> MemoryContext context; /* private memory context for transient >> * allocations */ >> >> + /* flag indicating whether messages have previously been sent */ >> + boolxact_wrote_changes; >> + >> >> -- >> >> 5. pgoutput.h >> >> + /* flag indicating whether messages have previously been sent */ >> >> "previously been sent" --> "already been sent" ?? >> >> -- >> >> 6. pgoutput.h - misleading member name >> >> Actually, now that I have read all the rest of the code and how this >> member is used I feel that this name is very misleading. e.g. 
For >> "streaming" case then you still are writing changes but are not >> setting this member at all - therefore it does not always mean what it >> says. >> >> I feel a better name for this would be something like >> "sent_begin_txn". Then if you have sent BEGIN it is true. If you >> haven't sent BEGIN it is false. It eliminates all ambiguity naming it >> this way instead. >> >> (This makes my feedback #5 redundant because the comment will be a bit >> different if you do this). >> >> -- > > > Fixed above comments. >> >> >> 7. pgoutput.c - function pgoutput_begin_txn >> >> @@ -345,6 +345,23 @@ pgoutput_startup(LogicalDecodingContext *ctx, >> OutputPluginOptions *opt, >> static void >> pgoutput_begin_txn(LogicalDecodingContext *ctx, ReorderBufferTXN *txn) >> { >> >> I guess that you still needed to pass the txn because that is how the >> API is documented, right? >> >> But I am wondering if you ought to flag it as unused so you wont get >> some BF machine giving warnings about it. >> >> e.g. Syntax like this? >> >> pgoutput_begin_txn(LogicalDecodingContext *ctx, ReorderBufferTXN * txn) { >> (void)txn; >> ... > > > Updated. >> >> -- >> >> 8. pgoutput.c - function pgoutput_begin_txn >> >> @@ -345,6 +345,23 @@ pgoutput_startup(LogicalDecodingContext *ctx, >> OutputPluginOptions *opt, >> static void >> pgoutput_begin_txn(LogicalDecodingContext *ctx, ReorderBufferTXN *txn) >> { >> + PGOutputData *data = ctx->output_plugin_private; >> + >> + /* >> + * Don't send BEGIN message here. Instead, postpone it until the first >> + * change. In logical replication, a common scenario is to replicate a set >> + * of tables (instead of all tables) and transactions whose changes were on >> + * table(s) that are not published will produce empty transactions. These >> + * empty transactions will send BEGIN and COMMIT messages to subscribers, >> + * using bandwidth on something with little/no use for logical replication. 
>> + */ >> + data->xact_wrote_changes = false; >> + elog(LOG,"Holding of begin"); >> +} >> >> Why is this loglevel LOG? Looks like leftover debugging. > > > Removed. >> >> >> -- >> >> 9. pgoutput.c - function pgoutput_commit_txn >> >> @@ -384,8 +401,14 @@ static void >> pgoutput_commit_txn(LogicalDecodingContext *ctx, ReorderBufferTXN *txn, >> XLogRecPtr commit_lsn) >> { >> + PGOutputData *data = ctx->output_plugin_private; >> + >> OutputPluginUpdateProgress(ctx); >> >> + /* skip COMMIT message if nothing was sent */ >> + if (!data->xact_wrote_changes) >> + return; >> + >> >> In the case where you decided to do nothing does it make sense that >> you still called the function OutputPluginUpdateProgress(ctx); ? >> I thought perhaps that your new check should come first so this call >> would never happen. > > > Even though the empty transaction is not sent, the LSN is tracked as decoded, > hence the progress needs to be updated. > >> >> -- >> >> 10. pgoutput.c - variable declarations without casts >> >> + PGOutputDat
Re: Table refer leak in logical replication
On Fri, Apr 23, 2021 at 09:38:01PM +0900, Amit Langote wrote: > On Thu, Apr 22, 2021 at 1:45 PM Michael Paquier wrote: >> On the other hand, the tests for partitions have much more value IMO, >> but looking closely I think that we can do better: >> - Create triggers on more relations of the partition tree, >> particularly to also check when a trigger does not fire. > > It seems you're suggesting to adopt 003's trigger firing tests for > partitions in 013, but would we gain much by that? I was suggesting the opposite, aka apply the trigger design that you are introducing in 013 within 003. But that may not be necessary either :) >> - Use a more generic name for tab1_2_op_log and its function >> log_tab1_2_op(), say subs{1,2}_log_trigger_activity. > > Sure, done. At the end I have used something simpler, as of sub{1,2}_trigger_activity and sub{1,2}_trigger_activity_func. >> - Create some extra BEFORE triggers perhaps? > > Again, that seems separate from what we're trying to do here. AIUI, > our aim here is to expand coverage for after triggers, and not really > that of the trigger functionality proper, because nothing seems broken > about it, but that of the trigger target relation opening/closing. > ISTM that's what you're talking about below... Exactly. My review of the worker code is make me feeling that reworking this code could easily lead to some incorrect behavior, so I'd rather be careful with a couple of extra triggers created across the partition tree, down to the partitions on which the triggers are fired actually. >> similarly to what issues_sql_like() or connect_{fails,ok}() do >> already, but that would mean increasing the log level and we don't do >> that to ease the load of the nodes. > > ...sorry, I am not very familiar with our Perl testing infra. Is > there some script that already does something like this? For example, > checking in the logs generated by a "node" that no instance of a > certain WARNING is logged? 
See for example how we test for SQL patterns with the backend logs using issues_sql_like(), or the more recent connect_ok() and connect_fails(). This functions match regexps with the logs of the backend for patterns. I am not sure if that's worth the extra cycles though. I also feel that we may want a more centralized place as well to check such things, with more matching patterns, like at the end of one run on a set of log files? So, after a set of edits related to the format of the SQL queries, the object names and some indentation (including a perltidy run), I have applied this patch to close the loop. -- Michael signature.asc Description: PGP signature
Re: Parallel INSERT SELECT take 2
On Mon, Apr 26, 2021 at 7:00 AM houzj.f...@fujitsu.com wrote: > > > Instead, how about > > postgres retries the query upon detecting the error that came from a > > parallel > > unsafe function during execution, disable parallelism and run the query? I > > think > > this kind of retry query feature can be built outside of the core postgres, > > but > > IMO it will be good to have inside (of course configurable). IIRC, the > > Teradata > > database has a Query Retry feature. > > > > Thanks for the suggestion. > The retry query feature sounds like a good idea to me. > OTOH, it sounds more like an independent feature which parallel select can > also benefit from it. > +1. I also think retrying a query on an error is not related to this feature and should be built separately if required. -- With Regards, Amit Kapila.
RE: Truncate in synchronous logical replication failed
On Monday, April 26, 2021 1:50 PM Amit Kapila wrote: > On Fri, Apr 23, 2021 at 7:18 PM osumi.takami...@fujitsu.com > wrote: > > > > The latest patch looks good to me. I have made a minor modification and > added a commit message in the attached. Thank you for updating the patch. I think we need one space for "targetindex" in the commit message. From my side, there is no more additional comments ! > I would like to once again ask > whether anybody else thinks we should backpatch this? Just a summary for > anybody not following this thread: > > This patch fixes the Logical Replication of Truncate in synchronous commit > mode. The Truncate operation acquires an exclusive lock on the target > relation and indexes and waits for logical replication of the operation to > finish > at commit. Now because we are acquiring the shared lock on the target index > to get index attributes in pgoutput while sending the changes for the Truncate > operation, it leads to a deadlock. > > Actually, we don't need to acquire a lock on the target index as we build the > cache entry using a historic snapshot and all the later changes are absorbed > while decoding WAL. So, we wrote a special purpose function for logical > replication to get a bitmap of replica identity attribute numbers where we get > that information without locking the target index. > > We are planning not to backpatch this as there doesn't seem to be any field > complaint about this issue since it was introduced in commit 5dfd1e5a in v11. Please anyone, share your opinion on this matter, when you have. Best Regards, Takamichi Osumi
Re: Asynchronous Append on postgres_fdw nodes.
On 4/23/21 8:12 AM, Etsuro Fujita wrote: I have committed the patch. While studying the capabilities of AsyncAppend, I noticed an inconsistency with the cost model of the optimizer: async_capable = off: Append (cost=100.00..695.00 ...) -> Foreign Scan on f1 part_1 (cost=100.00..213.31 ...) -> Foreign Scan on f2 part_2 (cost=100.00..216.07 ...) -> Foreign Scan on f3 part_3 (cost=100.00..215.62 ...) async_capable = on: --- Append (cost=100.00..695.00 ...) -> Async Foreign Scan on f1 part_1 (cost=100.00..213.31 ...) -> Async Foreign Scan on f2 part_2 (cost=100.00..216.07 ...) -> Async Foreign Scan on f3 part_3 (cost=100.00..215.62 ...) Here I see two problems: 1. Cost of an AsyncAppend is the same as cost of an Append. But execution time of the AsyncAppend for three remote partitions has more than halved. 2. Cost of an AsyncAppend looks as a sum of the child ForeignScan costs. I have no idea why it may be a problem right now. But I can imagine that it may be a problem in future if we have alternative paths: complex pushdown in synchronous mode (a few rows to return) or simple asynchronous append with a large set of rows to return. -- regards, Andrey Lepikhov Postgres Professional
Re: GISTSTATE is too large
> 26 апр. 2021 г., в 03:20, Andres Freund написал(а): > > So the basic GISTSTATE is 14kB large. And all the information needed to > call support functions for one attribute is spaced so far appart that > it's guaranteed to be on different cachelines and to be very unlikely to > be prefetched by the hardware prefetcher. > > It seems pretty clear that this should be changed to be something more > like > > ... > > with initGISTstate allocating based on > IndexRelationGetNumberOfKeyAttributes() instead of using a constant. > > And then subsequently change GIST_COL_STATE to embed the > FunctionCallInfo, rather than initializiing them on the stack for every > call. Yes, this makes sense. Also, it's viable to reorder fields to group scan and insert routines together, currently they are interlaced. Or maybe we could even split state into insert state and scan state. > I'm not planning on doing this work, but I thought it's sensible to send > to the list anyway. Thanks for idea, I would give it a shot this summer, unless someone else will take it earlier. BTW, It would make sense to avoid penalty call at all: there are many GiST-based access methods that want to see all items together to choose insertion subtree (e.g. R*-tree and RR-tree). Calling penalty function for each tuple on page often is not a good idea at all. Thanks! Best regards, Andrey Borodin.
Re: Forget close an open relation in ReorderBufferProcessTXN()
On Fri, Apr 23, 2021 at 8:03 PM osumi.takami...@fujitsu.com wrote: > > On Saturday, April 17, 2021 4:13 PM Amit Kapila > wrote: > > > I also don't find a test for this. It is introduced in 5dfd1e5a6696, > > > wrote by Simon Riggs, Marco Nenciarini and Peter Eisentraut. Maybe > > > they can explain when we can enter this condition? > > > > My guess is that this has been copied from the code a few lines above to > > handle insert/update/delete where it is required to handle some DDL ops like > > Alter Table but I think we don't need it here (for Truncate op). If that > > understanding turns out to be true then we should either have an Assert for > > this or an elog message. > In this thread, we are discussing 3 topics below... > > (1) necessity of the check for REORDER_BUFFER_CHANGE_TRUNCATE in > ReorderBufferProcessTXN() > (2) discussion of whether we disallow decoding of operations on user catalog > tables or not > (3) memory leak of maybe_send_schema() (patch already provided) > > Let's address those one by one. > In terms of (1), which was close to the motivation of this thread, > I think (1) and (2) are related because if we need (2) then the check removed by (1) needs to be replaced with another check. So, I am not sure how to make this decision. -- With Regards, Amit Kapila.
Re: TRUNCATE on foreign table
On Fri, Apr 23, 2021 at 9:50 PM Fujii Masao wrote: > Thanks for the review! I fixed this. Thanks for the updated patches. In docs v4 patch, I think we can combine below two lines into a single line: + supported by the foreign data wrapper, see . Other than the above minor change, both patches look good to me, I have no further comments. With Regards, Bharath Rupireddy. EnterpriseDB: http://www.enterprisedb.com
Re: comment typo - misnamed function
On Mon, Apr 26, 2021 at 7:59 AM Peter Smith wrote: > > PSA patch to fix a misnamed function in a comment. > > typo: "DecodePreare" --> "DecodePrepare" > Pushed. -- With Regards, Amit Kapila.
Re: Truncate in synchronous logical replication failed
On Fri, Apr 23, 2021 at 7:18 PM osumi.takami...@fujitsu.com wrote: > The latest patch looks good to me. I have made a minor modification and added a commit message in the attached. I would like to once again ask whether anybody else thinks we should backpatch this? Just a summary for anybody not following this thread: This patch fixes the Logical Replication of Truncate in synchronous commit mode. The Truncate operation acquires an exclusive lock on the target relation and indexes and waits for logical replication of the operation to finish at commit. Now because we are acquiring the shared lock on the target index to get index attributes in pgoutput while sending the changes for the Truncate operation, it leads to a deadlock. Actually, we don't need to acquire a lock on the target index as we build the cache entry using a historic snapshot and all the later changes are absorbed while decoding WAL. So, we wrote a special purpose function for logical replication to get a bitmap of replica identity attribute numbers where we get that information without locking the target index. We are planning not to backpatch this as there doesn't seem to be any field complaint about this issue since it was introduced in commit 5dfd1e5a in v11. -- With Regards, Amit Kapila. truncate_in_synchronous_logical_replication_v08.patch Description: Binary data
Re: Does rewriteTargetListIU still need to add UPDATE tlist entries?
On Mon, Apr 26, 2021 at 9:40 AM Tom Lane wrote: > The comments for rewriteTargetListIU say (or said until earlier today) > > * 2. For an UPDATE on a trigger-updatable view, add tlist entries for any > * unassigned-to attributes, assigning them their old values. These will > * later get expanded to the output values of the view. (This is equivalent > * to what the planner's expand_targetlist() will do for UPDATE on a regular > * table, but it's more convenient to do it here while we still have easy > * access to the view's original RT index.) This is only necessary for > * trigger-updatable views, for which the view remains the result relation of > * the query. For auto-updatable views we must not do this, since it might > * add assignments to non-updatable view columns. For rule-updatable views it > * is unnecessary extra work, since the query will be rewritten with a > * different result relation which will be processed when we recurse via > * RewriteQuery. > > I noticed that this is referencing something that, in fact, > expand_targetlist() doesn't do anymore, so that this is a poor > justification for the behavior. My first thought was that we still > need to do it to produce the correct row contents for the INSTEAD OF > trigger, so I updated the comment (in 08a986966) to claim that. Check. > However, on closer inspection, that's nonsense. nodeModifyTable.c > populates the trigger "OLD" row from the whole-row variable that is > generated for the view, and then it computes the "NEW" row using > that old row and the UPDATE tlist; there is no need there for the > UPDATE tlist to compute all the columns. The regression tests still > pass just fine if we take out the questionable logic (cf. attached > patch). Moreover, if you poke into it a little closer than the tests > do, you notice that the existing code is uselessly computing non-updated > columns twice, in both the extra tlist item and the whole-row variable. 
> > As an example, consider > > create table base (a int, b int); > create view v1 as select a+1 as a1, b+2 as b2 from base; > -- you also need an INSTEAD OF UPDATE trigger, not shown here > explain verbose update v1 set a1 = a1 - 44; > > With HEAD you get > > Update on public.v1 (cost=0.00..60.85 rows=0 width=0) >-> Seq Scan on public.base (cost=0.00..60.85 rows=2260 width=46) > Output: ((base.a + 1) - 44), (base.b + 2), ROW((base.a + 1), (base.b > + 2)), base.ctid > > There's really no need to compute base.b + 2 twice, and with this > patch we don't: > > Update on public.v1 (cost=0.00..55.20 rows=0 width=0) >-> Seq Scan on public.base (cost=0.00..55.20 rows=2260 width=42) > Output: ((base.a + 1) - 44), ROW((base.a + 1), (base.b + 2)), > base.ctid That makes sense to me, at least logically. > I would think that this is a totally straightforward improvement, > but there's one thing in the comments for rewriteTargetListIU that > gives me a little pause: it says > > * We must do items 1,2,3 before firing rewrite rules, else rewritten > * references to NEW.foo will produce wrong or incomplete results. > > As far as I can tell, though, references to NEW values still do the > right thing. I'm not certain whether any of the existing regression > tests really cover this point, but experimenting with the scenario shown > in the attached SQL file says that the DO ALSO rule gets the right > results. It's possible that the expansion sequence is a bit different > than before, but we still end up with the right answer. I also checked what the rewriter and the planner do for the following DO ALSO insert: create rule logit as on update to v1 do also insert into log values(old.a1, new.a1, old.b2, new.b2); and can see that the insert ends up with the right targetlist irrespective of whether or not rewriteTargetListIU() adds an item for NEW.b2. 
So, I attached a debugger to the update query in your shared script and focused on how ReplaceVarsFromTargetList(), running on the insert query added by the rule, handles the item for NEW.b2 no longer being added to the update's targetlist after your patch. Turns out the result (the insert's targetlist) is the same even if the path taken in ReplaceVarsFromTargetList_callback() is different after your patch. Before: explain verbose update v1 set a1 = a1-44; QUERY PLAN --- Insert on public.log (cost=0.00..60.85 rows=0 width=0) -> Seq Scan on public.base (cost=0.00..60.85 rows=2260 width=16) Output: (base.a + 1), ((base.a + 1) - 44), (base.b + 2), (base.b + 2) Update on public.v1 (cost=0.00..60.85 rows=0 width=0) -> Seq Scan on public.base (cost=0.00..60.85 rows=2260 width=46) Output: ((base.a + 1) - 44), (base.b + 2), ROW((base.a + 1), (base.b + 2)), base.ctid (7 rows) After: explain verbose update v1 set a1 = a1-44;
Re: Can a child process detect postmaster death when in pg_usleep?
On Tue, Apr 20, 2021 at 7:36 AM Bharath Rupireddy wrote: > > On Thu, Apr 15, 2021 at 11:48 AM Bharath Rupireddy > wrote: > > > We definitely have replaced a lot of sleeps with latch.c primitives > > > over the past few years, since we got WL_EXIT_ON_PM_DEATH and > > > condition variables. There may be many more to improve... You > > > mentioned autovacuum... yeah, Stephen fixed one of these with commit > > > 4753ef37, but yeah it's not great to have those others in there... > > > > I have not looked at the commit 4753ef37 previously, but it > > essentially addresses the problem with pg_usleep for vacuum delay. I'm > > thinking we can also replace pg_usleep in below places based on the > > fact that pg_usleep should be avoided in 1) short waits in a loop 2) > > when wait time is dependent on user configurable parameters. And using > > WaitLatch may require us to add wait event types to WaitEventTimeout > > enum, but that's okay. > > I'm attaching 3 patches that replace pg_usleep with WaitLatch: 0001 in > lazy_truncate_heap, 0002 in do_pg_stop_backup and 0003 for Pre and > Post Auth Delay. Regression tests pass with these patches. Please > review them. I made a CF entry [1] so that it may get a chance for review. [1] https://commitfest.postgresql.org/33/3085/ With Regards, Bharath Rupireddy. EnterpriseDB: http://www.enterprisedb.com
Re: Addition of authenticated ID to pg_stat_activity
On Mon, Apr 26, 2021 at 11:34:16AM +0900, Michael Paquier wrote: > +++ b/doc/src/sgml/config.sgml > @@ -7596,6 +7596,24 @@ COPY postgres_log FROM '/full/path/to/logfile.csv' > WITH csv; > > > > + xreflabel="track_activity_authn_size"> > + track_activity_authn_size > (integer) > + > + track_activity_authn_size configuration > parameter > + > + > + > + > + Specifies the amount of memory reserved to store the text of the > + currently executing command for each active session, for the That part looks to be a copy+paste error. > + > pg_stat_activity.authenticated_id > field. > + If this value is specified without units, it is taken as bytes. > + The default value is 128 bytes. > + This parameter can only be set at server start. > + > + > + I think many/most things in log/CSV should also go in PSA, and vice versa. It seems like there should be a comment about this - in both places - to avoid forgetting it in the future. -- Justin
Remove redundant variable pageSize in gistinitpage
Hi, In gistinitpage, pageSize variable looks redundant, instead we could just pass BLCKSZ. This will be consistent with its peers BloomInitPage, brin_page_init and SpGistInitPage. Attaching a small patch. Thoughts? With Regards, Bharath Rupireddy. EnterpriseDB: http://www.enterprisedb.com From b77d8d445ebf7c0a00b4931c0f346d2bffb64a61 Mon Sep 17 00:00:00 2001 From: Bharath Rupireddy Date: Sun, 25 Apr 2021 19:51:06 +0530 Subject: [PATCH v1] Remove redundant variable pageSize in gistinitpage In gistinitpage, pageSize variable looks redundant, instead just pass BLCKSZ. This will be consistent with its peers BloomInitPage, brin_page_init and SpGistInitPage. --- src/backend/access/gist/gistutil.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c index 8dcd53c457..43ba03b6eb 100644 --- a/src/backend/access/gist/gistutil.c +++ b/src/backend/access/gist/gistutil.c @@ -756,9 +756,8 @@ void gistinitpage(Page page, uint32 f) { GISTPageOpaque opaque; - Size pageSize = BLCKSZ; - PageInit(page, pageSize, sizeof(GISTPageOpaqueData)); + PageInit(page, BLCKSZ, sizeof(GISTPageOpaqueData)); opaque = GistPageGetOpaque(page); opaque->rightlink = InvalidBlockNumber; -- 2.25.1
Re: Replication slot stats misgivings
On Mon, Apr 26, 2021 at 8:01 AM Masahiko Sawada wrote: > > On Fri, Apr 23, 2021 at 6:15 PM Amit Kapila wrote: > > > > On Mon, Apr 19, 2021 at 4:28 PM vignesh C wrote: > > > > > > I have made the changes to update the replication statistics at > > > replication slot release. Please find the patch attached for the same. > > > Thoughts? > > > > > > > Thanks, the changes look mostly good to me. The slot stats need to be > > initialized in RestoreSlotFromDisk and ReplicationSlotCreate, not in > > StartupDecodingContext. Apart from that, I have moved the declaration > > of UpdateDecodingStats from slot.h back to logical.h. I have also > > added/edited a few comments. Please check and let me know what do you > > think of the attached? > > The patch moves slot stats to the ReplicationSlot data that is on the > shared memory. If we have a space to store the statistics in the > shared memory can we simply accumulate the stats there and make them > persistent without using the stats collector? > But for that, we need to write to file at every commit/abort/prepare (decode of commit) which I think will incur significant overhead. Also, we try to write after few commits then there is a danger of losing them and still there could be a visible overhead for small transactions. > And I think there is > also a risk to increase shared memory when we want to add other > statistics in the future. > Yeah, so do you think it is not a good idea to store stats in ReplicationSlot? Actually storing them in a slot makes it easier to send them during ReplicationSlotRelease which is quite helpful if the replication is interrupted due to some reason. Or the other idea was that we send stats every time we stream or spill changes. -- With Regards, Amit Kapila.
Re: PageGetItemIdCareful - should we MAXALIGN sizeof(BTPageOpaqueData)?
On Sat, Apr 24, 2021 at 4:11 AM Peter Geoghegan wrote: > > PageInit always max aligns this structure, when we > > initialize the btree page in _bt_pageini and in all other places we > > max align it before use. Since this is an error throwing path, I think > > we should max align it just to be on the safer side. While on it, I > > think we can also replace BLCKSZ with PageGetPageSize(page). > > I didn't replace BLCKSZ with PageGetPageSize() in the commit, though. > We definitely don't want to rely on that being sane in amcheck (this > is also why we don't use PageGetSpecialPointer(), which is the usual > approach). If the PageGetPageSize can't be sane within amcheck, does it mean that the page would have been corrupted somewhere? > Actually, even if this wasn't amcheck code I might make the same call. > I personally don't think that most existing calls to PageGetPageSize() > make very much sense. Should we get rid of all existing PageGetPageSize and directly use BLCKSZ instead? AFAICS, all the index and heap pages are of BLCKSZ (PageInit has Assert(pageSize == BLCKSZ);). Using PageGetPageSize to get the size that's been stored in the page, we might catch errors early if at all the page is corrupted and the size is overwritten. That's not the case if we use BLCKSZ which is not stored in the page. In this case the size stored on the page becomes redundant and the pd_pagesize_version could just be 2 bytes storing the page version. While we save 2 bytes per page, I'm not sure this is acceptable as PageHeader size gets changed. > I'm curious: Was this just something that you noticed randomly, while > looking at the code? Or do you have a specific practical reason to > care about it? (I always like hearing about the ways in which people > use amcheck.) I found this while working on one internal feature but not while using amcheck. With Regards, Bharath Rupireddy. EnterpriseDB: http://www.enterprisedb.com
Addition of authenticated ID to pg_stat_activity
Hi all, 9afffcb has added the concept of authenticated identity to the information provided in log_connections for audit purposes, with this data stored in each backend's port. One extra thing that can be really useful for monitoring is the capability to track this information directly in pg_stat_activity. Please find attached a patch to do that, with the following choices made: - Like query strings, authenticated IDs could be rather long, so we need a GUC to control the maximum size allocated for these in shared memory. The attached uses 128 bytes by default, that should be enough in most cases even for DNs, LDAP or krb5. - Multi-byte strings need to be truncated appropriately. As a matter of consistency with the query string code, I have made things so as the truncation is done each time a string is requested, with PgBackendStatus storing the raw information truncated depending on the maximum size allowed at the GUC level. - Some tests are added within the SSL and LDAP code paths. We could add more of that within the authentication and kerberos tests but that did not strike me as mandatory either as the backend logs are checked everywhere already. - The new field has been added at the end of pg_stat_get_activity() mainly as a matter of readability. I'd rather move that just after the application_name, now only pg_stat_activity does that. I am adding that to the next CF. 
Thanks, -- Michael From 5acd643aff5a12b8ee2ca3365532f5773c1b02a8 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Mon, 26 Apr 2021 11:25:17 +0900 Subject: [PATCH v1] Add authenticated data to pg_stat_activity --- src/include/catalog/pg_proc.dat | 6 +-- src/include/utils/backend_status.h| 16 --- src/backend/catalog/system_views.sql | 1 + src/backend/utils/activity/backend_status.c | 45 +-- src/backend/utils/adt/pgstatfuncs.c | 21 +++-- src/backend/utils/misc/guc.c | 11 + src/backend/utils/misc/postgresql.conf.sample | 1 + src/test/ldap/t/001_auth.pl | 4 +- src/test/regress/expected/rules.out | 9 ++-- src/test/ssl/t/001_ssltests.pl| 8 ++-- doc/src/sgml/config.sgml | 18 doc/src/sgml/monitoring.sgml | 10 + 12 files changed, 117 insertions(+), 33 deletions(-) diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index db1abc149c..7217e017a2 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -5279,9 +5279,9 @@ proname => 'pg_stat_get_activity', prorows => '100', proisstrict => 'f', proretset => 't', provolatile => 's', proparallel => 'r', prorettype => 'record', proargtypes => 'int4', - proallargtypes => '{int4,oid,int4,oid,text,text,text,text,text,timestamptz,timestamptz,timestamptz,timestamptz,inet,text,int4,xid,xid,text,bool,text,text,int4,text,numeric,text,bool,text,bool,int4,int8}', - proargmodes => '{i,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o}', - proargnames => '{pid,datid,pid,usesysid,application_name,state,query,wait_event_type,wait_event,xact_start,query_start,backend_start,state_change,client_addr,client_hostname,client_port,backend_xid,backend_xmin,backend_type,ssl,sslversion,sslcipher,sslbits,ssl_client_dn,ssl_client_serial,ssl_issuer_dn,gss_auth,gss_princ,gss_enc,leader_pid,query_id}', + proallargtypes => 
'{int4,oid,int4,oid,text,text,text,text,text,timestamptz,timestamptz,timestamptz,timestamptz,inet,text,int4,xid,xid,text,bool,text,text,int4,text,numeric,text,bool,text,bool,int4,int8,text}', + proargmodes => '{i,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o}', + proargnames => '{pid,datid,pid,usesysid,application_name,state,query,wait_event_type,wait_event,xact_start,query_start,backend_start,state_change,client_addr,client_hostname,client_port,backend_xid,backend_xmin,backend_type,ssl,sslversion,sslcipher,sslbits,ssl_client_dn,ssl_client_serial,ssl_issuer_dn,gss_auth,gss_princ,gss_enc,leader_pid,query_id,authenticated_id}', prosrc => 'pg_stat_get_activity' }, { oid => '3318', descr => 'statistics: information about progress of backends running maintenance command', diff --git a/src/include/utils/backend_status.h b/src/include/utils/backend_status.h index 0cbcc9c943..d2381f07dd 100644 --- a/src/include/utils/backend_status.h +++ b/src/include/utils/backend_status.h @@ -145,13 +145,16 @@ typedef struct PgBackendStatus char *st_appname; /* - * Current command string; MUST be null-terminated. Note that this string - * possibly is truncated in the middle of a multi-byte character. As - * activity strings are stored more frequently than read, that allows to - * move the cost of correct truncation to the display side. Use - * pgstat_clip_activity() to truncate correctly. + * Current command string and authenticated ID string; MUST be + * null-terminated. Note that those strings possibl
Re: Replication slot stats misgivings
On Fri, Apr 23, 2021 at 6:15 PM Amit Kapila wrote: > > On Mon, Apr 19, 2021 at 4:28 PM vignesh C wrote: > > > > I have made the changes to update the replication statistics at > > replication slot release. Please find the patch attached for the same. > > Thoughts? > > > > Thanks, the changes look mostly good to me. The slot stats need to be > initialized in RestoreSlotFromDisk and ReplicationSlotCreate, not in > StartupDecodingContext. Apart from that, I have moved the declaration > of UpdateDecodingStats from slot.h back to logical.h. I have also > added/edited a few comments. Please check and let me know what do you > think of the attached? The patch moves slot stats to the ReplicationSlot data that is on the shared memory. If we have a space to store the statistics in the shared memory can we simply accumulate the stats there and make them persistent without using the stats collector? And I think there is also a risk to increase shared memory when we want to add other statistics in the future. Regards, -- Masahiko Sawada EDB: https://www.enterprisedb.com/
comment typo - misnamed function
Hi, PSA patch to fix a misnamed function in a comment. typo: "DecodePreare" --> "DecodePrepare" -- Kind Regards, Peter Smith. Fujitsu Australia v1-0001-Fix-typo-misnamed-function-in-comment.patch Description: Binary data
Re: [PATCH] add concurrent_abort callback for output plugin
Hi, While testing another WIP patch [1] a clashing GID problem was found, which gives us apply worker errors like: 2021-04-26 10:07:12.883 AEST [22055] ERROR: transaction identifier "pg_gid_16403_608" is already in use 2021-04-26 10:08:05.149 AEST [22124] ERROR: transaction identifier "pg_gid_16403_757" is already in use These GID clashes were traced back to a problem of the concurrent-abort logic: when "streaming" is enabled the concurrent-abort logic was always sending "prepare" even though a "stream_prepare" had already been sent. PSA a patch to correct this. -- [1] https://www.postgresql.org/message-id/CAHut%2BPuB07xOgJLnDhvbtp0t_qMDhjDD%2BkO%2B2yB%2Br6tgfaR-5Q%40mail.gmail.com Kind Regards, Peter Smith. Fujitsu Australia v1-0001-Fix-concurrent-abort-for-when-streaming.patch Description: Binary data
RE: Parallel INSERT SELECT take 2
> > Based on above, we plan to move forward with the approach 2) (declarative > idea). > > IIUC, the declarative behaviour idea attributes parallel > safe/unsafe/restricted > tags to each table with default being the unsafe. Does it mean for a parallel > unsafe table, no parallel selects, inserts (may be updates) will be picked > up? Or > is it only the parallel inserts? If both parallel inserts, selects will be > picked, then > the existing tables need to be adjusted to set the parallel safety tags while > migrating? Thanks for looking into this. The parallel attributes in table means the parallel safety when user does some data-modification operations on it. So, It only limit the use of parallel plan when using INSERT/UPDATE/DELETE. > Another point, what does it mean a table being parallel restricted? > What should happen if it is present in a query of other parallel safe tables? If a table is parallel restricted, it means the table contains some parallel restricted objects(such as: parallel restricted functions in index expressions). And in planner, it means parallel insert plan will not be chosen, but it can use parallel select(with serial insert). > I may be wrong here: IIUC, the main problem we are trying to solve with the > declarative approach is to let the user decide parallel safety for partition > tables > as it may be costlier for postgres to determine it. And for the normal tables > we > can perform parallel safety checks without incurring much cost. So, I think we > should restrict the declarative approach to only partitioned tables? Yes, we are trying to avoid overhead when checking parallel safety. The cost to check all the partition's parallel safety is the biggest one. Another is the safety check of index's expression. Currently, for INSERT, the planner does not open the target table's indexinfo and does not parse the expression of the index. 
We need to parse the expression in planner if we want to do parallel safety check for it which can bring some overhead(it will open the index then do the parse in executor again). So, we plan to skip all of the extra check and let user take responsibility for the safety. Of course, maybe we can try to pass the indexinfo to the executor but it need some further refactor and I will take a look into it. > While reading the design, I came across this "erroring out during execution > of a > query when a parallel unsafe function is detected". If this is correct, isn't > it > warranting users to run pg_get_parallel_safety to know the parallel unsafe > objects, set parallel safety to all of them if possible, otherwise disable > parallelism to run the query? Isn't this burdensome? How about: If detecting parallel unsafe objects in executor, then, alter the table to parallel unsafe internally. So, user do not need to alter it manually. > Instead, how about > postgres retries the query upon detecting the error that came from a parallel > unsafe function during execution, disable parallelism and run the query? I > think > this kind of retry query feature can be built outside of the core postgres, > but > IMO it will be good to have inside (of course configurable). IIRC, the > Teradata > database has a Query Retry feature. > Thanks for the suggestion. The retry query feature sounds like a good idea to me. OTOH, it sounds more like an independent feature which parallel select can also benefit from it. I think maybe we can try to achieve it after we commit the parallel insert ? Best regards, houzj
Re: wal stats questions
On 2021/04/23 16:30, Fujii Masao wrote: > > > On 2021/04/23 10:25, Andres Freund wrote: >> Hi, >> >> On 2021-04-23 09:26:17 +0900, Masahiro Ikeda wrote: >>> On 2021/04/23 0:36, Andres Freund wrote: On Thu, Apr 22, 2021, at 06:42, Fujii Masao wrote: > On 2021/04/21 18:31, Masahiro Ikeda wrote: >>> BTW, is it better to change PgStat_Counter from int64 to uint64 >>> because> there aren't any counters with negative number? > On second thought, it's ok even if the counters like wal_records get > overflowed. > Because they are always used to calculate the diff between the previous > and > current counters. Even when the current counters are overflowed and > the previous ones are not, WalUsageAccumDiff() seems to return the right > diff of them. If this understanding is right, I'd withdraw my comment and > it's ok to use "long" type for those counters. Thought? Why long? It's of a platform dependent size, which doesn't seem useful here? >>> >>> I think it's ok to unify uint64. Although it's better to use small size for >>> cache, the idea works well for only some platform which use 4bytes for >>> "long". >>> >>> >>> (Although I'm not familiar with the standardization...) >>> It seems that we need to adopt unsinged long if use "long", which may be >>> 4bytes. >>> >>> I though WalUsageAccumDiff() seems to return the right value too. But, I >>> researched deeply and found that ISO/IEC 9899:1999 defines unsinged integer >>> never overflow(2.6.5 Types 9th section) although it doesn't define overflow >>> of >>> signed integer type. >>> >>> If my understanding is right, the definition only guarantee >>> WalUsageAccumDiff() returns the right value only if the type is unsigned >>> integer. So, it's safe to change "long" to "unsigned long". >> >> I think this should just use 64bit counters. Emitting more than 4 >> billion records in one transaction isn't actually particularly rare. And > > Right. I agree to change the types of the counters to int64. 
> > I think that it's better to make this change as a separate patch from > the changes for pg_stat_wal view. Thanks for your comments. OK, I separate two patches. First patch has only the changes for pg_stat_wal view. ("v6-0001-performance-improvements-of-reporting-wal-stats-without-introducing-a-new-variable.patch") Second one has the changes for the type of the BufferUsage's and WalUsage's members. I change the type from long to int64. Is it better to make new thread? ("v6-0002-change-the-data-type-of-XXXUsage-from-long-to-int64.patch") Regards, -- Masahiro Ikeda NTT DATA CORPORATION diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index 237e13361b..75ecd00c23 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -17,6 +17,12 @@ #include "executor/instrument.h" +/* + * Buffer and generated WAL usage counters. + * + * The counters are accumulated values. There are infrastructures + * to add the values and calculate the difference within a specific period. + */ BufferUsage pgBufferUsage; static BufferUsage save_pgBufferUsage; WalUsage pgWalUsage; diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c index e7e6a2a459..1761694a5b 100644 --- a/src/backend/postmaster/checkpointer.c +++ b/src/backend/postmaster/checkpointer.c @@ -505,7 +505,7 @@ CheckpointerMain(void) pgstat_send_bgwriter(); /* Send WAL statistics to the stats collector. 
*/ - pgstat_report_wal(); + pgstat_send_wal(true); /* * If any checkpoint flags have been set, redo the loop to handle the @@ -583,7 +583,7 @@ HandleCheckpointerInterrupts(void) BgWriterStats.m_requested_checkpoints++; ShutdownXLOG(0, 0); pgstat_send_bgwriter(); - pgstat_report_wal(); + pgstat_send_wal(true); /* Normal exit from the checkpointer is here */ proc_exit(0); /* done */ diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 6e8dee9784..f7f28673f8 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -133,8 +133,8 @@ PgStat_MsgWal WalStats; /* * WAL usage counters saved from pgWALUsage at the previous call to - * pgstat_report_wal(). This is used to calculate how much WAL usage - * happens between pgstat_report_wal() calls, by substracting + * pgstat_send_wal(). This is used to calculate how much WAL usage + * happens between pgstat_send_wal() calls, by substracting * the previous counters from the current ones. */ static WalUsage prevWalUsage; @@ -855,9 +855,19 @@ pgstat_report_stat(bool disconnect) TabStatusArray *tsa; int i; - /* Don't expend a clock check if nothing to do */ + /* + * Don't expend a clock check if nothing to do. + * + * Note: regarding to WAL statistics counters, it's not enough to check + * only whether the wal record is generated or not. If a read transaction + * is executed, wal records aren't nomally g
Does rewriteTargetListIU still need to add UPDATE tlist entries?
The comments for rewriteTargetListIU say (or said until earlier today) * 2. For an UPDATE on a trigger-updatable view, add tlist entries for any * unassigned-to attributes, assigning them their old values. These will * later get expanded to the output values of the view. (This is equivalent * to what the planner's expand_targetlist() will do for UPDATE on a regular * table, but it's more convenient to do it here while we still have easy * access to the view's original RT index.) This is only necessary for * trigger-updatable views, for which the view remains the result relation of * the query. For auto-updatable views we must not do this, since it might * add assignments to non-updatable view columns. For rule-updatable views it * is unnecessary extra work, since the query will be rewritten with a * different result relation which will be processed when we recurse via * RewriteQuery. I noticed that this is referencing something that, in fact, expand_targetlist() doesn't do anymore, so that this is a poor justification for the behavior. My first thought was that we still need to do it to produce the correct row contents for the INSTEAD OF trigger, so I updated the comment (in 08a986966) to claim that. However, on closer inspection, that's nonsense. nodeModifyTable.c populates the trigger "OLD" row from the whole-row variable that is generated for the view, and then it computes the "NEW" row using that old row and the UPDATE tlist; there is no need there for the UPDATE tlist to compute all the columns. The regression tests still pass just fine if we take out the questionable logic (cf. attached patch). Moreover, if you poke into it a little closer than the tests do, you notice that the existing code is uselessly computing non-updated columns twice, in both the extra tlist item and the whole-row variable. 
As an example, consider create table base (a int, b int); create view v1 as select a+1 as a1, b+2 as b2 from base; -- you also need an INSTEAD OF UPDATE trigger, not shown here explain verbose update v1 set a1 = a1 - 44; With HEAD you get Update on public.v1 (cost=0.00..60.85 rows=0 width=0) -> Seq Scan on public.base (cost=0.00..60.85 rows=2260 width=46) Output: ((base.a + 1) - 44), (base.b + 2), ROW((base.a + 1), (base.b + 2)), base.ctid There's really no need to compute base.b + 2 twice, and with this patch we don't: Update on public.v1 (cost=0.00..55.20 rows=0 width=0) -> Seq Scan on public.base (cost=0.00..55.20 rows=2260 width=42) Output: ((base.a + 1) - 44), ROW((base.a + 1), (base.b + 2)), base.ctid I would think that this is a totally straightforward improvement, but there's one thing in the comments for rewriteTargetListIU that gives me a little pause: it says * We must do items 1,2,3 before firing rewrite rules, else rewritten * references to NEW.foo will produce wrong or incomplete results. As far as I can tell, though, references to NEW values still do the right thing. I'm not certain whether any of the existing regression tests really cover this point, but experimenting with the scenario shown in the attached SQL file says that the DO ALSO rule gets the right results. It's possible that the expansion sequence is a bit different than before, but we still end up with the right answer. So, as far as I can tell, this is an oversight in 86dc90056 and we ought to clean it up as attached. regards, tom lane diff --git a/src/backend/rewrite/rewriteHandler.c b/src/backend/rewrite/rewriteHandler.c index 4fa4ac2aef..4bba6eb805 100644 --- a/src/backend/rewrite/rewriteHandler.c +++ b/src/backend/rewrite/rewriteHandler.c @@ -679,18 +679,7 @@ adjustJoinTreeList(Query *parsetree, bool removert, int rt_index) * and UPDATE, replace explicit DEFAULT specifications with column default * expressions. * - * 2. 
For an UPDATE on a trigger-updatable view, add tlist entries for any - * unassigned-to attributes, assigning them their old values. These will - * later get expanded to the output values of the view. This is only needed - * for trigger-updatable views, for which the view remains the result relation - * of the query; without it, we would not have a complete "new tuple" to pass - * to triggers. For auto-updatable views we must not do this, since it might - * add assignments to non-updatable view columns. For rule-updatable views it - * is unnecessary extra work, since the query will be rewritten with a - * different result relation which will be processed when we recurse via - * RewriteQuery. - * - * 3. Merge multiple entries for the same target attribute, or declare error + * 2. Merge multiple entries for the same target attribute, or declare error * if we can't. Multiple entries are only allowed for INSERT/UPDATE of * portions of an array or record field, for example * UPDATE table SET foo[2] = 42, foo[4] = 43; @@ -698,11 +687,11 @@ adjustJoinTreeList(Query *parsetree, bool removert, int rt_index)
Re: decoupling table and index vacuum
On Sat, Apr 24, 2021 at 12:22 AM Robert Haas wrote: > > On Fri, Apr 23, 2021 at 7:04 AM Masahiko Sawada wrote: > > I think we can divide the TID fork into 16MB or 32MB chunks like WAL > > segment files so that we can easily remove old chunks. Regarding the > > efficient search part, I think we need to consider the case where the > > TID fork gets bigger than maintenance_work_mem. In that case, during > > the heap scan, we need to check if the discovered TID exists in a > > chunk of the TID fork that could be on the disk. Even if all > > known-dead-TIDs are loaded into an array on the memory, it could get > > much slower than the current heap scan to bsearch over the array for > > each dead TID discovered during heap scan. So it would be better to > > have a way to skip searching by already recorded TIDs. For example, > > during heap scan or HOT pruning, I think that when marking TIDs dead > > and recording it to the dead TID fork we can mark them “dead and > > recorded” instead of just “dead” so that future heap scans can skip > > those TIDs without existence check. > > I'm not very excited about this. If we did this, and if we ever > generated dead-but-not-recorded TIDs, then we will potentially dirty > those blocks again later to mark them recorded. Since the idea I imagined is that we always mark a TID recorded at the same time when marking it dead we don't dirty the page again, but, yes, if we do that the recorded flag is not necessary. We can simply think that TID marked dead is recorded to the TID fork. Future vacuum can skip TID that are already marked dead. > > Also, if bsearch() is a bottleneck, how about just using an O(1) > algorithm instead of an O(lg n) algorithm, rather than changing the > on-disk format? > > Also, can you clarify exactly what you think the problem case is here? > It seems to me that: > > 1. If we're pruning the heap to collect dead TIDs, we should stop when > the number of TIDs we've accumulated reaches maintenance_work_mem. 
It > is possible that we could find when starting to prune that there are > *already* more dead TIDs than will fit, because maintenance_work_mem > might have been reduced since they were gathered. But that's OK: we > can figure out that there are more than will fit without loading them > all, and since we shouldn't do additional pruning in this case, > there's no issue. The case I'm thinking is that pruning the heap and sanitizing indexes are running concurrently as you mentioned that concurrency is one of the benefits of decoupling vacuum phases. In that case, one process is doing index vacuuming using known-dead-TIDs in the TID fork while another process is appending new dead TIDs. We can suspend heap pruning until the size of the TID fork gets smaller as you mentioned but it seems inefficient. > > 2. If we're sanitizing indexes, we should normally discover that there > are few enough TIDs that we can still fit them all in memory. But if > that proves not to be the case, again because for example > maintenance_work_mem has been reduced, then we can handle that with > multiple index passes just as we do today. Yeah, there seems to be room for improvement but not worse than today. I imagine users will want to set a high maintenance_work_mem for sanitizing global index separately from the setting for heap pruning. > > 3. If we're going back to the heap to permit TIDs to be recycled by > setting dead line pointers to unused, we can load in as many of those > as will fit in maintenance_work_mem, sort them by block number, and go > through block by block and DTRT. Then, we can release all that memory > and, if necessary, do the whole thing again. This isn't even > particularly inefficient. Agreed. Just an idea: during pruning the heap, we can buffer the collected dead TIDs before writing the TID fork to the disk so that we can sort the dead TIDs in a chunk (say a 16MB chunk consists of 8KB blocks)? 
We write the chunk to the disk either when the chunk filled with dead TIDs or when index sanitizing starts. The latter timing is required to remember the chunk ID or uint64 ID of dead TID indicating how far index sanitizing removed dead TIDs up to. One of the benefits would be to reduce the disk I/O for the dead TID fork. Another would be we’re likely to complete the recycle phase in one heap scan since we load only one block per chunk during scanning the heap. Regards, -- Masahiko Sawada EDB: https://www.enterprisedb.com/
GISTSTATE is too large
Hi, On twitter it was mentioned [1] that gist index builds spend a lot of time in FunctionCall3Coll. Which could be addressed to a good degree by not using FunctionCall3Coll() which needs to call InitFunctionCallInfoData() every time, but instead doing so once by including the FunctionCallInfo in GISTSTATE. Which made me look at GISTSTATEs layout. And, uh, I was a bit shocked: struct GISTSTATE { MemoryContext scanCxt; /* 0 8 */ MemoryContext tempCxt; /* 8 8 */ TupleDesc leafTupdesc; /*16 8 */ TupleDesc nonLeafTupdesc; /*24 8 */ TupleDesc fetchTupdesc; /*32 8 */ FmgrInfo consistentFn[32]; /*40 1536 */ /* --- cacheline 24 boundary (1536 bytes) was 40 bytes ago --- */ FmgrInfo unionFn[32]; /* 1576 1536 */ ... /* --- cacheline 216 boundary (13824 bytes) was 40 bytes ago --- */ OidsupportCollation[32]; /* 13864 128 */ /* size: 13992, cachelines: 219, members: 15 */ /* last cacheline: 40 bytes */ }; So the basic GISTSTATE is 14kB large. And all the information needed to call support functions for one attribute is spaced so far appart that it's guaranteed to be on different cachelines and to be very unlikely to be prefetched by the hardware prefetcher. 
It seems pretty clear that this should be changed to be something more like typedef struct GIST_COL_STATE { FmgrInfoconsistentFn; FmgrInfounionFn; FmgrInfocompressFn; FmgrInfodecompressFn; FmgrInfopenaltyFn; FmgrInfopicksplitFn; FmgrInfoequalFn; FmgrInfodistanceFn; FmgrInfofetchFn; /* Collations to pass to the support functions */ Oid supportCollation; } GIST_COL_STATE; typedef struct GISTSTATE { MemoryContext scanCxt; /* context for scan-lifespan data */ MemoryContext tempCxt; /* short-term context for calling functions */ TupleDesc leafTupdesc;/* index's tuple descriptor */ TupleDesc nonLeafTupdesc; /* truncated tuple descriptor for non-leaf * pages */ TupleDesc fetchTupdesc; /* tuple descriptor for tuples returned in an * index-only scan */ GIST_COL_STATE column_state[FLEXIBLE_ARRAY_MEMBER]; } with initGISTstate allocating based on IndexRelationGetNumberOfKeyAttributes() instead of using a constant. And then subsequently change GIST_COL_STATE to embed the FunctionCallInfo, rather than initializiing them on the stack for every call. I'm not planning on doing this work, but I thought it's sensible to send to the list anyway. Greetings, Andres Freund [1] https://twitter.com/komzpa/status/138642045240065
Re: Fix dropped object handling in pg_event_trigger_ddl_commands
On 2021-Apr-25, Sven Klemm wrote: > On Sun, Apr 18, 2021 at 2:12 PM Sven Klemm wrote: > > when creating an event trigger for ddl_command_end that calls > > pg_event_trigger_ddl_commands certain statements will cause the > > trigger to fail with a cache lookup error. The error happens on > > master, 13 and 12 I didnt test any previous versions. > > > > trg=# ALTER TABLE t ALTER COLUMN f1 SET DATA TYPE bigint, ALTER COLUMN > > f1 DROP IDENTITY; > > ERROR: XX000: cache lookup failed for relation 13476892 > > CONTEXT: PL/pgSQL function ddl_end() line 5 at FOR over SELECT rows > > LOCATION: getRelationTypeDescription, objectaddress.c:4178 > > Any opinions on the patch? Is this not worth the effort to fix or is > there a better way to fix this? Hello, I haven't looked at this but judging from the general shape of function and error message, it seems clearly a bug that needs to be fixed somehow. I'll try to make time to look at it sometime soon, but I have other bugs to investigate and fix, so it may be some time. I fear your proposal of ignoring the object may be the best we can do, but I don't like it much. -- Álvaro Herrera39°49'30"S 73°17'W "La verdad no siempre es bonita, pero el hambre de ella sí"
Re: Allowing to create LEAKPROOF functions to non-superuser
Stephen Frost writes: > * Andrey Borodin (x4...@yandex-team.ru) wrote: >> Customer was restoring pg_dump of on-premise ERP known as 1C (something like >> TurboTax) with this add-on [0] > Erm, it's very clearly not leakproof and will happily return information > about the value passed in during some error cases... Yeah, that's pretty much a poster-child example for NOT letting random users fool with leakproofness settings. regards, tom lane
Re: Allowing to create LEAKPROOF functions to non-superuser
Greetings, * Noah Misch (n...@leadboat.com) wrote: > On Mon, Apr 19, 2021 at 05:38:43PM -0400, Stephen Frost wrote: > > > > > On Fri, Apr 16, 2021 at 3:57 AM Noah Misch wrote: > > > > >> Hence, I do find it reasonable to let pg_read_all_data be sufficient > > > > >> for > > > > >> setting LEAKPROOF. > > > I'm not really sure that attaching it to pg_read_all_data makes sense.. > > > > In general, I've been frustrated by the places where we lump privileges > > together rather than having a way to distinctly GRANT capabilities > > independently- that's more-or-less exactly what lead me to work on > > implementing the role system in the first place, and later the > > predefined roles. > > This would be no more lumpy than e.g. pg_read_all_stats. However, I could > live with a separate pg_change_leakproof (or whatever name). There's been already some disagreements about pg_read_all_stats, so I don't think that is actually a good model to look at. I have doubts about users generally being able to write actually leakproof functions (this case being an example of someone writing a function that certainly wasn't leakproof but marking it as such anyway...), though I suppose it's unlikely that it's any worse than the cases of people writing SECURITY DEFINER functions that aren't careful enough, of which I've seen plenty of. I would think the role/capability would be 'pg_mark_function_leakproof' or similar though, and allow a user who had that role to either create leakproof functions (provided they have access to create the function in the first place) or to mark an existing function as leakproof (but requiring them to be a member of the role which owns the function). Thanks, Stephen signature.asc Description: PGP signature
Re: Allowing to create LEAKPROOF functions to non-superuser
Greetings, * Andrey Borodin (x4...@yandex-team.ru) wrote: > > 20 апр. 2021 г., в 02:38, Stephen Frost написал(а): > > Here's what I'd ask Andrey- what's the actual use-case here? Are these > > cases where users are actually adding new functions which they believe > > are leakproof where those functions don't require superuser already to > > be created? Clearly if they're in a untrusted language and you have to > > be a superuser to install them in the first place then they should just > > mark the function as leakproof when they install it. If these are > > trusted language functions, I'd be curious to actually see them as I > > have doubts about if they're actually leakproof.. > > > > Or is the actual use-case here that they just want to mark functions we > > know aren't leakproof as leakproof anyway because they aren't getting > > the performance they want? > > > > If it's the latter, as I suspect it is, then I don't really think the > > use-case justifies any change on our part- the right answer is to make > > those functions actually leakproof, or write ones which are. > > Customer was restoring pg_dump of on-premise ERP known as 1C (something like > TurboTax) with this add-on [0] > > CREATE FUNCTION simple1c.date_from_guid(varchar(36)) RETURNS timestamp > LANGUAGE plpgsql IMMUTABLE LEAKPROOF STRICT > > I'm not 1C-expert (programmed it a bit to get few bucks when I was a > student), but seems like this function simple1c.date_from_guid() can be used > in DSL queries. It have no obvious side effects. Maybe we could hack it by > exploiting timestamp overflow, but I doubt it's practically usable. Erm, it's very clearly not leakproof and will happily return information about the value passed in during some error cases... Thanks, Stephen signature.asc Description: PGP signature
Better sanity checking for message length words
We had a report [1] of a case where a broken client application sent some garbage to the server, which then hung up because it misinterpreted the garbage as a very long message length, and was sitting waiting for data that would never arrive. There is already a sanity check on the message type byte in postgres.c's SocketBackend (which the trouble case accidentally got past because 'S' is a valid type code); but the only check on the message length is that it be at least 4. pq_getmessage() does have the ability to enforce an upper limit on message length, but we only use that capability for authentication messages, and not entirely consistently even there. Meanwhile on the client side, libpq has had simple message-length sanity checking for ages: it disbelieves message lengths greater than 30000 bytes unless the message type is one of a short list of types that can be long. So the attached proposed patch changes things to make it required, not optional, for callers of pq_getmessage to provide an upper length bound, and installs the same sort of short-vs-long message heuristic as libpq has in the server. This is also a good opportunity for other callers to absorb the lesson SocketBackend learned many years ago: we should validate the message type code *before* believing anything about the message length. All of this is just heuristic of course, but I think it makes for a substantial reduction in the trouble surface. Given the small number of complaints to date, I'm hesitant to back-patch this: if there's anybody out there with valid use for long messages that I didn't think should be long, this might break things for them. But I think it'd be reasonable to sneak it into v14, since we've not started beta yet. Thoughts? 
regards, tom lane [1] https://www.postgresql.org/message-id/flat/YIKCCXcEozx9iiBU%40c720-r368166.fritz.box diff --git a/src/backend/commands/copyfromparse.c b/src/backend/commands/copyfromparse.c index 0813424768..fdf57f1556 100644 --- a/src/backend/commands/copyfromparse.c +++ b/src/backend/commands/copyfromparse.c @@ -265,6 +265,7 @@ CopyGetData(CopyFromState cstate, void *databuf, int minread, int maxread) { /* Try to receive another message */ int mtype; + int maxmsglen; readmessage: HOLD_CANCEL_INTERRUPTS(); @@ -274,11 +275,33 @@ CopyGetData(CopyFromState cstate, void *databuf, int minread, int maxread) ereport(ERROR, (errcode(ERRCODE_CONNECTION_FAILURE), errmsg("unexpected EOF on client connection with an open transaction"))); - if (pq_getmessage(cstate->fe_msgbuf, 0)) + /* Validate message type and set packet size limit */ + switch (mtype) + { + case 'd': /* CopyData */ + maxmsglen = PQ_LARGE_MESSAGE_LIMIT; + break; + case 'c': /* CopyDone */ + case 'f': /* CopyFail */ + case 'H': /* Flush */ + case 'S': /* Sync */ + maxmsglen = PQ_SMALL_MESSAGE_LIMIT; + break; + default: + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("unexpected message type 0x%02X during COPY from stdin", + mtype))); + maxmsglen = 0; /* keep compiler quiet */ + break; + } + /* Now collect the message body */ + if (pq_getmessage(cstate->fe_msgbuf, maxmsglen)) ereport(ERROR, (errcode(ERRCODE_CONNECTION_FAILURE), errmsg("unexpected EOF on client connection with an open transaction"))); RESUME_CANCEL_INTERRUPTS(); + /* ... 
and process it */ switch (mtype) { case 'd': /* CopyData */ @@ -304,11 +327,7 @@ CopyGetData(CopyFromState cstate, void *databuf, int minread, int maxread) */ goto readmessage; default: - ereport(ERROR, - (errcode(ERRCODE_PROTOCOL_VIOLATION), - errmsg("unexpected message type 0x%02X during COPY from stdin", - mtype))); - break; + Assert(false); /* NOT REACHED */ } } avail = cstate->fe_msgbuf->len - cstate->fe_msgbuf->cursor; diff --git a/src/backend/libpq/auth.c b/src/backend/libpq/auth.c index 27865b14a0..45a91235a4 100644 --- a/src/backend/libpq/auth.c +++ b/src/backend/libpq/auth.c @@ -210,6 +210,7 @@ static int PerformRadiusTransaction(const char *server, const char *secret, cons /* * Maximum accepted size of GSS and SSPI authentication tokens. + * We also use this as a limit on ordinary password packet lengths. * * Kerberos tickets are usually quite small, but the TGTs issued by Windows * domain controllers include an authorization field known as the Privilege @@ -724,7 +725,7 @@ recv_password_packet(Port *port) } initStringInfo(&buf); - if (pq_getmessage(&buf, 0)) /* receive password */ + if (pq_getmessage(&buf, PG_MAX_AUTH_TOKEN_LENGTH)) /* receive password */ { /* EOF - pq_getmessage already logged a suitable message */ pfree(buf.data); diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq
Re: compute_query_id and pg_stat_statements
On Sun, Apr 25, 2021 at 01:17:03PM -0400, Tom Lane wrote: > Julien Rouhaud writes: > > On Sun, Apr 25, 2021 at 11:39:55AM -0400, Tom Lane wrote: > >> I agree repeated warnings would be bad news. I was wondering if we could > >> arrange a single warning at the time pg_stat_statements is preloaded into > >> the postmaster. In this way, if you tried to use a configuration file > >> that used to work, you'd hopefully get some notice about why it no longer > >> does what you want. Also, if you are preloading pg_stat_statements, it > >> seems reasonable to assume that you'd like the global value of the flag > >> to be "on", even if there are use-cases for transiently disabling it. > > > What about people who wants to use pg_stat_statements but are not ok with > > our > > query_id heuristics and use a third-party plugin for that? > > They're still going to want the GUC set to something other than "off", > no? They will want compute_query_id to be off. And they actually will *need* that, as we recommend third-party plugins computing alternative query_id to error out if they see a that a query_id has already been generated, to avoid any problem if compute_query_id is being temporarily toggled. That's what I did in the POC plugin for external query_id at [1]. > In any case it's just a one-time log message, so it's not likely > to be *that* annoying. In that case it should be phrased in a way that makes it clear that pg_stat_statements can work without enabling compute_query_id, something like: "compute_query_id is disabled. This module won't track any activity unless you configured a third-party extension that computes query identifiers" [1] https://github.com/rjuju/pg_queryid/blob/master/pg_queryid.c#L172
Re: compute_query_id and pg_stat_statements
Julien Rouhaud writes: > On Sun, Apr 25, 2021 at 11:39:55AM -0400, Tom Lane wrote: >> I agree repeated warnings would be bad news. I was wondering if we could >> arrange a single warning at the time pg_stat_statements is preloaded into >> the postmaster. In this way, if you tried to use a configuration file >> that used to work, you'd hopefully get some notice about why it no longer >> does what you want. Also, if you are preloading pg_stat_statements, it >> seems reasonable to assume that you'd like the global value of the flag >> to be "on", even if there are use-cases for transiently disabling it. > What about people who want to use pg_stat_statements but are not ok with our > query_id heuristics and use a third-party plugin for that? They're still going to want the GUC set to something other than "off", no? In any case it's just a one-time log message, so it's not likely to be *that* annoying. regards, tom lane
Re: Use simplehash.h instead of dynahash in SMgr
David Rowley писал 2021-04-25 16:36: On Sun, 25 Apr 2021 at 18:48, Yura Sokolov wrote: If you read comments in SH_START_ITERATE, you'll see: * Search for the first empty element. As deletions during iterations are * supported, we want to start/end at an element that cannot be affected * by elements being shifted. * Iterate backwards, that allows the current element to be deleted, even * if there are backward shifts Therefore, it is safe to delete during iteration, and it doesn't lead nor require loop restart. I had only skimmed that with a pre-loaded assumption that it wouldn't be safe. I didn't do a very good job of reading it as I failed to notice the lack of guarantees were about deleting items other than the current one. I didn't consider the option of finding a free bucket then looping backwards to avoid missing entries that are moved up during a delete. With that, I changed the patch to get rid of the SH_TRUNCATE and got rid of the smgrclose_internal which skips the hash delete. The code is much more similar to how it was now. In regards to the hashing stuff. I added a new function to pg_bitutils.h to rotate left and I'm using that instead of the other expression that was taken from nodeHash.c For the hash function, I've done some further benchmarking with: 1) The attached v2 patch 2) The attached + plus use_hash_combine.patch.txt which uses hash_combine() instead of pg_rotate_left32()ing the hashkey each time. 3) The attached v2 with use_hash_bytes.patch.txt applied. 4) Master I've also included the hash stats from each version of the hash function. I hope the numbers help indicate the reason I picked the hash function that I did. 1) v2 patch. 
Median = 113.375 s LOG: size: 4096, members: 2032, filled: 0.496094, total chain: 997, max chain: 5, avg chain: 0.490650, total_collisions: 422, max_collisions: 3, avg_collisions: 0.207677 2) v2 patch + use_hash_combine.patch.txt Median = 119.355 s LOG: size: 4096, members: 2032, filled: 0.496094, total chain: 971, max chain: 6, avg chain: 0.477854, total_collisions: 433, max_collisions: 4, avg_collisions: 0.213091 3) v2 patch + use_hash_bytes.patch.txt Median = 117.685 s LOG: size: 4096, members: 2032, filled: 0.496094, total chain: 900, max chain: 4, avg chain: 0.442913, total_collisions: 415, max_collisions: 4, avg_collisions: 0.204232 4) master Median = 124.49 s You can see that the bare v2 patch is quite a bit faster than any of the alternatives. We'd be better of with hash_bytes than using hash_combine() both for performance and for the seemingly better job the hash function does at reducing the hash collisions. David Looks much better! Simpler is almost always better. Minor remarks: Comment for SH_ENTRY_INITIALIZER e. May be like: - SH_ENTRY_INITIALIZER(a) - if defined, this macro is called for new entries before key or hash is stored in. For example, it can be used to make necessary memory allocations. `pg_rotate_left32(x, 1) == pg_rotate_right(x, 31)`, therefore there's no need to add `pg_rotate_left32` at all. More over, for hash combining there's no much difference between `pg_rotate_left32(x, 1)` and `pg_rotate_right(x, 1)`. (To be honestly, there is a bit of difference due to murmur construction, but it should not be very big.) If your test so sensitive to hash function speed, then I'd suggest to try something even simpler: static inline uint32 relfilenodebackend_hash(RelFileNodeBackend *rnode) { uint32 h = 0; #define step(x) h ^= (uint32)(x) * 0x85ebca6b; h = pg_rotate_right(h, 11); h *= 9; step(rnode->node.relNode); step(rnode->node.spcNode); // spcNode could be different for same relNode only // during table movement. Does it pay to hash it? 
step(rnode->node.dbNode); step(rnode->backend); // does it matter to hash backend? // It equals to InvalidBackendId for non-temporary relations // and temporary relations in same database never have same // relNode (have they?). return murmurhash32(hashkey); } I'd like to see benchmark code. It quite interesting this place became measurable at all. regards, Yura Sokolov.diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index 8310f73212..33e5cadd03 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -34,8 +34,6 @@ typedef struct SMgrEntry SMgrRelation data; /* Pointer to the SMgrRelationData */ } SMgrEntry; -static inline uint32 relfilenodebackend_hash(RelFileNodeBackend *rnode); - /* * Because simplehash.h does not provide a stable pointer to hash table * entries, we don't make the element type a SMgrRelation directly, instead we @@ -51,7 +49,7 @@ static inline uint32 relfilenodebackend_hash(RelFileNodeBackend *rnode); #define SH_ELEMENT_TYPESMgrEntry #de
Re: compute_query_id and pg_stat_statements
On Sun, Apr 25, 2021 at 11:39:55AM -0400, Tom Lane wrote: > > I agree repeated warnings would be bad news. I was wondering if we could > arrange a single warning at the time pg_stat_statements is preloaded into > the postmaster. In this way, if you tried to use a configuration file > that used to work, you'd hopefully get some notice about why it no longer > does what you want. Also, if you are preloading pg_stat_statements, it > seems reasonable to assume that you'd like the global value of the flag > to be "on", even if there are use-cases for transiently disabling it. What about people who want to use pg_stat_statements but are not ok with our query_id heuristics and use a third-party plugin for that?
Re: compute_query_id and pg_stat_statements
Julien Rouhaud writes: > On Sat, Apr 24, 2021 at 01:43:51PM -0400, Tom Lane wrote: >> I haven't looked, but did we put anything into pg_stat_statements >> to make it easy to tell if you've messed up this setting? > You mean apart from from having pg_stat_statements' view/SRFs returning > nothing? > I think it's a reasonable use case to sometime disable query_id calculation, > eg. if you know that it will only lead to useless bloat in the entry and that > you won't need the info, so spamming warnings if there are no queryid could > cause some pain. I agree repeated warnings would be bad news. I was wondering if we could arrange a single warning at the time pg_stat_statements is preloaded into the postmaster. In this way, if you tried to use a configuration file that used to work, you'd hopefully get some notice about why it no longer does what you want. Also, if you are preloading pg_stat_statements, it seems reasonable to assume that you'd like the global value of the flag to be "on", even if there are use-cases for transiently disabling it. I think the way to detect "being loaded into the postmaster" is if (IsPostmasterEnvironment && !IsUnderPostmaster) regards, tom lane
Re: [PATCH] Automatic HASH and LIST partition creation
I have reviewed the v4 patch. The patch does not get applied on the latest source. Kindly rebase. However I have found few comments. 1. > +-- must fail because of wrong configuration > +CREATE TABLE tbl_hash_fail (i int) PARTITION BY HASH (i) > +CONFIGURATION (values in (1, 2), (3, 4) default partition tbl_default); Here some of the keywords are mentioned in UPPER CASE (Ex: CREATE TABLE, CONFIGURATION, etc) and some are mentioned in lower case (Ex: values in, default partition, etc). Kindly make it common. I feel making it to UPPER CASE is better. Please take care of this in all the cases. 2. It is better to separate the failure cases and success cases in /src/test/regress/sql/create_table.sql for better readability. All the failure cases can be listed first and then the success cases. 3. > + char *part_relname; > + > + /* > +* Generate partition name in the format: > +* $relname_$partnum > +* All checks of name validity will be made afterwards in DefineRelation() > +*/ > + part_relname = psprintf("%s_%d", cxt->relation->relname, i); The assignment can be done directly while declaring the variable. Thanks & Regards, Nitin Jadhav On Wed, Mar 3, 2021 at 1:56 AM Justin Pryzby wrote: > https://commitfest.postgresql.org/32/2694/ > > I don't know what committers will say, but I think that "ALTER TABLE" > might be > the essential thing for this patch to support, not "CREATE". (This is > similar > to ALTER..SET STATISTICS, which is not allowed in CREATE.) > > The reason is that ALTER is what's important for RANGE partitions, which > need > to be created dynamically (for example, to support time-series data > continuously inserting data around 'now'). I assume it's sometimes also > important for LIST. I think this patch should handle those cases better > before > being commited, or else we risk implementing grammar and other user-facing > interface > that fails to handle what's needed into the future (or that's > non-essential). 
> Even if dynamic creation isn't implemented yet, it seems important to at > least > implement the foundation for setting the configuration to *allow* that in > the > future, in a manner that's consistent with the initial implementation for > "static" partitions. > > ALTER also supports other ideas I mentioned here: > https://www.postgresql.org/message-id/20200706145947.GX4107%40telsasoft.com > > - ALTER .. SET interval (for dynamic/deferred RANGE partitioning) > - ALTER .. SET modulus, for HASH partitioning, in the initial > implementation, > this would allow CREATING paritions, but wouldn't attempt to handle > moving > data if overlapping table already exists: > - Could also set the table-name, maybe by format string; > - Could set "retention interval" for range partitioning; > - Could set if the partitions are themselves partitioned(??) > > I think once you allow setting configuration parameters like this, then you > might have an ALTER command to "effect" them, which would create any static > tables required by the configuration. maybe that'd be automatic, but if > there's an "ALTER .. APPLY PARTITIONS" command (or whatever), maybe in the > future, the command could also be used to "repartition" existing table data > into partitions with more fine/course granularity (modulus, or daily vs > monthly > range, etc). > > -- > Justin > > >
Re: Use simplehash.h instead of dynahash in SMgr
On Sun, 25 Apr 2021 at 18:48, Yura Sokolov wrote: > If you read comments in SH_START_ITERATE, you'll see: > >* Search for the first empty element. As deletions during iterations > are >* supported, we want to start/end at an element that cannot be > affected >* by elements being shifted. > >* Iterate backwards, that allows the current element to be deleted, > even >* if there are backward shifts > > Therefore, it is safe to delete during iteration, and it doesn't lead > nor > require loop restart. I had only skimmed that with a pre-loaded assumption that it wouldn't be safe. I didn't do a very good job of reading it as I failed to notice the lack of guarantees were about deleting items other than the current one. I didn't consider the option of finding a free bucket then looping backwards to avoid missing entries that are moved up during a delete. With that, I changed the patch to get rid of the SH_TRUNCATE and got rid of the smgrclose_internal which skips the hash delete. The code is much more similar to how it was now. In regards to the hashing stuff. I added a new function to pg_bitutils.h to rotate left and I'm using that instead of the other expression that was taken from nodeHash.c For the hash function, I've done some further benchmarking with: 1) The attached v2 patch 2) The attached + plus use_hash_combine.patch.txt which uses hash_combine() instead of pg_rotate_left32()ing the hashkey each time. 3) The attached v2 with use_hash_bytes.patch.txt applied. 4) Master I've also included the hash stats from each version of the hash function. I hope the numbers help indicate the reason I picked the hash function that I did. 1) v2 patch. 
CPU: user: 108.23 s, system: 6.97 s, elapsed: 115.63 s CPU: user: 114.78 s, system: 6.88 s, elapsed: 121.71 s CPU: user: 107.53 s, system: 5.70 s, elapsed: 113.28 s CPU: user: 108.43 s, system: 5.73 s, elapsed: 114.22 s CPU: user: 106.18 s, system: 5.73 s, elapsed: 111.96 s CPU: user: 108.04 s, system: 5.29 s, elapsed: 113.39 s CPU: user: 107.64 s, system: 5.64 s, elapsed: 113.34 s CPU: user: 106.64 s, system: 5.58 s, elapsed: 112.27 s CPU: user: 107.91 s, system: 5.40 s, elapsed: 113.36 s CPU: user: 115.35 s, system: 6.60 s, elapsed: 122.01 s Median = 113.375 s LOG: size: 4096, members: 2032, filled: 0.496094, total chain: 997, max chain: 5, avg chain: 0.490650, total_collisions: 422, max_collisions: 3, avg_collisions: 0.207677 2) v2 patch + use_hash_combine.patch.txt CPU: user: 113.22 s, system: 5.52 s, elapsed: 118.80 s CPU: user: 116.63 s, system: 5.87 s, elapsed: 122.56 s CPU: user: 115.33 s, system: 5.73 s, elapsed: 121.12 s CPU: user: 113.11 s, system: 5.61 s, elapsed: 118.78 s CPU: user: 112.56 s, system: 5.51 s, elapsed: 118.13 s CPU: user: 114.55 s, system: 5.80 s, elapsed: 120.40 s CPU: user: 121.79 s, system: 6.45 s, elapsed: 128.29 s CPU: user: 113.98 s, system: 4.50 s, elapsed: 118.52 s CPU: user: 113.24 s, system: 5.63 s, elapsed: 118.93 s CPU: user: 114.11 s, system: 5.60 s, elapsed: 119.78 s Median = 119.355 s LOG: size: 4096, members: 2032, filled: 0.496094, total chain: 971, max chain: 6, avg chain: 0.477854, total_collisions: 433, max_collisions: 4, avg_collisions: 0.213091 3) v2 patch + use_hash_bytes.patch.txt CPU: user: 120.87 s, system: 6.69 s, elapsed: 127.62 s CPU: user: 112.40 s, system: 4.68 s, elapsed: 117.14 s CPU: user: 113.19 s, system: 5.44 s, elapsed: 118.69 s CPU: user: 112.15 s, system: 4.73 s, elapsed: 116.93 s CPU: user: 111.10 s, system: 5.59 s, elapsed: 116.74 s CPU: user: 112.03 s, system: 5.74 s, elapsed: 117.82 s CPU: user: 113.69 s, system: 4.33 s, elapsed: 118.07 s CPU: user: 113.30 s, system: 4.19 s, elapsed: 117.55 s 
CPU: user: 112.77 s, system: 5.57 s, elapsed: 118.39 s CPU: user: 112.25 s, system: 4.59 s, elapsed: 116.88 s Median = 117.685 s LOG: size: 4096, members: 2032, filled: 0.496094, total chain: 900, max chain: 4, avg chain: 0.442913, total_collisions: 415, max_collisions: 4, avg_collisions: 0.204232 4) master CPU: user: 117.89 s, system: 5.7 s, elapsed: 123.65 s CPU: user: 117.81 s, system: 5.74 s, elapsed: 123.62 s CPU: user: 119.39 s, system: 5.75 s, elapsed: 125.2 s CPU: user: 117.98 s, system: 4.39 s, elapsed: 122.41 s CPU: user: 117.92 s, system: 4.79 s, elapsed: 122.76 s CPU: user: 119.84 s, system: 4.75 s, elapsed: 124.64 s CPU: user: 120.6 s, system: 5.82 s, elapsed: 126.49 s CPU: user: 118.74 s, system: 5.71 s, elapsed: 124.51 s CPU: user: 124.29 s, system: 6.79 s, elapsed: 131.14 s CPU: user: 118.73 s, system: 5.67 s, elapsed: 124.47 s Median = 124.49 s You can see that the bare v2 patch is quite a bit faster than any of the alternatives. We'd be better of with hash_bytes than using hash_combine() both for performance and for the seemingly better job the hash function does at reducing the hash collisions. David diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index 8310f73212..33e5cadd03 100644 --- a/src/backend/storage/smgr/smgr
Re: Allowing to create LEAKPROOF functions to non-superuser
> 20 апр. 2021 г., в 02:38, Stephen Frost написал(а): > > Here's what I'd ask Andrey- what's the actual use-case here? Are these > cases where users are actually adding new functions which they believe > are leakproof where those functions don't require superuser already to > be created? Clearly if they're in a untrusted language and you have to > be a superuser to install them in the first place then they should just > mark the function as leakproof when they install it. If these are > trusted language functions, I'd be curious to actually see them as I > have doubts about if they're actually leakproof.. > > Or is the actual use-case here that they just want to mark functions we > know aren't leakproof as leakproof anyway because they aren't getting > the performance they want? > > If it's the latter, as I suspect it is, then I don't really think the > use-case justifies any change on our part- the right answer is to make > those functions actually leakproof, or write ones which are. Customer was restoring pg_dump of on-premise ERP known as 1C (something like TurboTax) with this add-on [0] CREATE FUNCTION simple1c.date_from_guid(varchar(36)) RETURNS timestamp LANGUAGE plpgsql IMMUTABLE LEAKPROOF STRICT I'm not a 1C expert (programmed it a bit to get a few bucks when I was a student), but seems like this function simple1c.date_from_guid() can be used in DSL queries. It has no obvious side effects. Maybe we could hack it by exploiting timestamp overflow, but I doubt it's practically usable. Thanks! Best regards, Andrey Borodin. [0] https://github.com/ivan816/simple-1c/blob/f2e5ce78b98f70f30039fd3de79308a59d432fc2/Simple1C/Impl/Sql/SchemaMapping/Simple1cSchemaCreator.cs#L74
Re: Allowing to create LEAKPROOF functions to non-superuser
On Mon, Apr 19, 2021 at 05:38:43PM -0400, Stephen Frost wrote: > > > > On Fri, Apr 16, 2021 at 3:57 AM Noah Misch wrote: > > > >> Hence, I do find it reasonable to let pg_read_all_data be sufficient > > > >> for > > > >> setting LEAKPROOF. > I'm not really sure that attaching it to pg_read_all_data makes sense.. > > In general, I've been frustrated by the places where we lump privileges > together rather than having a way to distinctly GRANT capabilities > independently- that's more-or-less exactly what lead me to work on > implementing the role system in the first place, and later the > predefined roles. This would be no more lumpy than e.g. pg_read_all_stats. However, I could live with a separate pg_change_leakproof (or whatever name). > Here's what I'd ask Andrey- what's the actual use-case here? Are these > cases where users are actually adding new functions which they believe > are leakproof where those functions don't require superuser already to > be created? Clearly if they're in a untrusted language and you have to > be a superuser to install them in the first place then they should just > mark the function as leakproof when they install it. If these are > trusted language functions, I'd be curious to actually see them as I > have doubts about if they're actually leakproof.. > > Or is the actual use-case here that they just want to mark functions we > know aren't leakproof as leakproof anyway because they aren't getting > the performance they want? Hearing those answers would be interesting, but it shouldn't change decisions. A reasonable person can write an actually-leakproof function and wish to mark it LEAKPROOF, independent of whether that happened in the case that prompted this thread.
Re: Fix dropped object handling in pg_event_trigger_ddl_commands
On Sun, Apr 18, 2021 at 2:12 PM Sven Klemm wrote: > when creating an event trigger for ddl_command_end that calls > pg_event_trigger_ddl_commands certain statements will cause the > trigger to fail with a cache lookup error. The error happens on > master, 13 and 12. I didn't test any previous versions. > > trg=# ALTER TABLE t ALTER COLUMN f1 SET DATA TYPE bigint, ALTER COLUMN > f1 DROP IDENTITY; > ERROR: XX000: cache lookup failed for relation 13476892 > CONTEXT: PL/pgSQL function ddl_end() line 5 at FOR over SELECT rows > LOCATION: getRelationTypeDescription, objectaddress.c:4178 Any opinions on the patch? Is this not worth the effort to fix or is there a better way to fix this? https://www.postgresql.org/message-id/CAMCrgp2R1cEXU53iYKtW6yVEp2_yKUz+z=3-ctryppp+xry...@mail.gmail.com -- Regards, Sven Klemm
Re: compute_query_id and pg_stat_statements
On Sat, Apr 24, 2021 at 01:43:51PM -0400, Tom Lane wrote: > > I haven't looked, but did we put anything into pg_stat_statements > to make it easy to tell if you've messed up this setting? You mean apart from having pg_stat_statements' view/SRFs returning nothing? I think it's a reasonable use case to sometimes disable query_id calculation, eg. if you know that it will only lead to useless bloat in the entry and that you won't need the info, so spamming warnings if there is no queryid could cause some pain.
Some oversights in query_id calculation
Hi, While doing some sanity checks on the regression tests, I found some queries that are semantically different but end up with identical query_id. Two are old issues: - the "ONLY" in FROM [ONLY] isn't hashed - the agglevelsup field in GROUPING isn't hashed Another one was introduced in pg13 with the WITH TIES not being hashed. The last one is new in pg14: the "DISTINCT" in "GROUP BY [DISTINCT]" isn't hashed. I'm attaching a patch that fixes those, with regression tests to reproduce each problem. There are also 2 additional debatable cases on whether this is a semantic difference or not: - aliases aren't hashed. That's usually not a problem, except when you use row_to_json(), since you'll get different keys - the NAME in XmlExpr (eg: xmlpi(NAME foo,...)) isn't hashed, so you generate different elements >From cf4ee4b493cdb730b7ac98d65bcca785455af7ce Mon Sep 17 00:00:00 2001 From: Julien Rouhaud Date: Sun, 25 Apr 2021 15:49:32 +0800 Subject: [PATCH v1] Fix some oversights in query_id calculation. 
--- .../expected/pg_stat_statements.out | 151 ++ .../sql/pg_stat_statements.sql| 52 ++ src/backend/utils/misc/queryjumble.c | 4 + 3 files changed, 207 insertions(+) diff --git a/contrib/pg_stat_statements/expected/pg_stat_statements.out b/contrib/pg_stat_statements/expected/pg_stat_statements.out index fb97f68737..40b5109b55 100644 --- a/contrib/pg_stat_statements/expected/pg_stat_statements.out +++ b/contrib/pg_stat_statements/expected/pg_stat_statements.out @@ -916,4 +916,155 @@ SELECT query, toplevel, plans, calls FROM pg_stat_statements WHERE query LIKE '% $$ LANGUAGE plpgsql | | | (3 rows) +-- FROM [ONLY] +CREATE TABLE tbl_inh(id integer); +CREATE TABLE tbl_inh_1() INHERITS (tbl_inh); +INSERT INTO tbl_inh_1 SELECT 1; +SELECT * FROM tbl_inh; + id + + 1 +(1 row) + +SELECT * FROM ONLY tbl_inh; + id + +(0 rows) + +SELECT COUNT(*) FROM pg_stat_statements WHERE query LIKE '%FROM%tbl_inh%'; + count +--- + 2 +(1 row) + +-- WITH TIES +CREATE TABLE limitoption AS SELECT 0 AS val FROM generate_series(1, 10); +SELECT * +FROM limitoption +WHERE val < 2 +ORDER BY val +FETCH FIRST 2 ROWS WITH TIES; + val +- + 0 + 0 + 0 + 0 + 0 + 0 + 0 + 0 + 0 + 0 +(10 rows) + +SELECT * +FROM limitoption +WHERE val < 2 +ORDER BY val +FETCH FIRST 2 ROW ONLY; + val +- + 0 + 0 +(2 rows) + +SELECT COUNT(*) FROM pg_stat_statements WHERE query LIKE '%FETCH FIRST%'; + count +--- + 2 +(1 row) + +-- GROUP BY [DISTINCT] +SELECT a, b, c +FROM (VALUES (1, 2, 3), (4, NULL, 6), (7, 8, 9)) AS t (a, b, c) +GROUP BY ROLLUP(a, b), rollup(a, c) +ORDER BY a, b, c; + a | b | c +---+---+--- + 1 | 2 | 3 + 1 | 2 | + 1 | 2 | + 1 | | 3 + 1 | | 3 + 1 | | + 1 | | + 1 | | + 4 | | 6 + 4 | | 6 + 4 | | 6 + 4 | | + 4 | | + 4 | | + 4 | | + 4 | | + 7 | 8 | 9 + 7 | 8 | + 7 | 8 | + 7 | | 9 + 7 | | 9 + 7 | | + 7 | | + 7 | | + | | +(25 rows) + +SELECT a, b, c +FROM (VALUES (1, 2, 3), (4, NULL, 6), (7, 8, 9)) AS t (a, b, c) +GROUP BY DISTINCT ROLLUP(a, b), rollup(a, c) +ORDER BY a, b, c; + a | b | c +---+---+--- + 1 | 2 | 3 + 1 | 2 
| + 1 | | 3 + 1 | | + 4 | | 6 + 4 | | 6 + 4 | | + 4 | | + 7 | 8 | 9 + 7 | 8 | + 7 | | 9 + 7 | | + | | +(13 rows) + +SELECT COUNT(*) FROM pg_stat_statements WHERE query LIKE '%GROUP BY%ROLLUP%'; + count +--- + 2 +(1 row) + +-- GROUPING SET agglevelsup +SELECT ( + SELECT ( +SELECT GROUPING(a,b) FROM (VALUES (1)) v2(c) + ) FROM (VALUES (1,2)) v1(a,b) GROUP BY (a,b) +) FROM (VALUES(6,7)) v3(e,f) GROUP BY ROLLUP(e,f); + grouping +-- +0 +0 +0 +(3 rows) + +SELECT ( + SELECT ( +SELECT GROUPING(e,f) FROM (VALUES (1)) v2(c) + ) FROM (VALUES (1,2)) v1(a,b) GROUP BY (a,b) +) FROM (VALUES(6,7)) v3(e,f) GROUP BY ROLLUP(e,f); + grouping +-- +3 +0 +1 +(3 rows) + +SELECT COUNT(*) FROM pg_stat_statements WHERE query LIKE '%SELECT GROUPING%'; + count +--- + 2 +(1 row) + DROP EXTENSION pg_stat_statements; diff --git a/contrib/pg_stat_statements/sql/pg_stat_statements.sql b/contrib/pg_stat_statements/sql/pg_stat_statements.sql index 56d8526ccf..bc3b6493e6 100644 --- a/contrib/pg_stat_statements/sql/pg_stat_statements.sql +++ b/contrib/pg_stat_statements/sql/pg_stat_statements.sql @@ -385,4 +385,56 @@ END; $$ LANGUAGE plpgsql; SELECT query, toplevel, plans, calls FROM pg_stat_statements WHERE query LIKE '%DELETE%' ORDER BY query COLLATE "C", toplevel; +-- FROM [ONLY] +CREATE TABLE tbl_inh(id integer); +CREATE TABLE tbl_inh_1() INHERITS (tbl_inh); +INSERT INTO tbl_inh_1 SELECT 1; + +SELECT * FROM tbl_inh; +SELECT * FROM ONLY tbl_inh; + +SELECT COUNT(*) FROM pg_stat_statements WHERE query LIKE '%FROM%tbl_inh%'; + +-- WITH TIES +CREATE TABLE limitoption AS SELECT 0 AS val FROM generate_series(1, 10); +SELECT * +FR