Re: [HACKERS] Speedup twophase transactions

Michael Paquier Sat, 17 Dec 2016 14:42:51 -0800

On Sun, Dec 18, 2016 at 6:42 AM, Bruce Momjian <[email protected]> wrote:
> Uh, did you mean to attached patch here?


Strange. I can confirm that I have received the patch as attached, but
it is not on the archives.
-- 
Michael

diff --git a/src/backend/access/transam/twophase.c 
b/src/backend/access/transam/twophase.c
index 5415604..fb69646 100644
--- a/src/backend/access/transam/twophase.c
+++ b/src/backend/access/transam/twophase.c
@@ -45,8 +45,8 @@
  *               fsynced
  *             * If COMMIT happens after checkpoint then backend reads state 
data from
  *               files
- *             * In case of crash replay will move data from xlog to files, if 
that
- *               hasn't happened before. XXX TODO - move to shmem in replay 
also
+ *             * A simplified version of the same scenario happens during recovery and
+ *               replication. See the comments on the KnownPreparedXact structure.
  *
  *-------------------------------------------------------------------------
  */
@@ -181,6 +181,35 @@ static GlobalTransaction MyLockedGxact = NULL;
 
 static bool twophaseExitRegistered = false;
 
+/*
+ * During replay and replication KnownPreparedList holds info about active prepared
+ * transactions that weren't moved to files yet. We will need that info by the end of
+ * recovery (including promote) to restore the memory state of those transactions.
+ *
+ * The naive approach is to move each PREPARE record to disk, fsync it and not have
+ * that list at all, but that provokes a lot of unnecessary fsyncs on small files,
+ * causing the replica to be slower than the master.
+ *
+ * Replay of twophase records happens by the following rules:
+ *             * On PREPARE redo KnownPreparedAdd() is called to add that transaction to
+ *               KnownPreparedList and no more actions are taken.
+ *             * On checkpoint redo we iterate through KnownPreparedList, move all prepare
+ *               records that are behind redo_horizon to files and delete them from the list.
+ *             * On COMMIT/ABORT we delete the file or the entry in KnownPreparedList.
+ *             * At the end of recovery we move all known prepared transactions to disk
+ *               to allow RecoverPreparedTransactions/StandbyRecoverPreparedTransactions
+ *               to do their work.
+ */
+typedef struct KnownPreparedXact
+{
+       TransactionId   xid;                    /* xid from the PREPARE record header */
+       XLogRecPtr              prepare_start_lsn;      /* start of the PREPARE record */
+       XLogRecPtr              prepare_end_lsn;        /* end of the PREPARE record */
+       dlist_node              list_node;              /* link in KnownPreparedList */
+} KnownPreparedXact;
+
+static dlist_head KnownPreparedList = DLIST_STATIC_INIT(KnownPreparedList);
+
 static void RecordTransactionCommitPrepared(TransactionId xid,
                                                                int nchildren,
                                                                TransactionId 
*children,
@@ -1241,9 +1270,9 @@ ReadTwoPhaseFile(TransactionId xid, bool give_warnings)
  * Reads 2PC data from xlog. During checkpoint this data will be moved to
  * twophase files and ReadTwoPhaseFile should be used instead.
  *
- * Note clearly that this function accesses WAL during normal operation, 
similarly
- * to the way WALSender or Logical Decoding would do. It does not run during
- * crash recovery or standby processing.
+ * Note clearly that this function can access WAL during normal operation, similarly
+ * to the way WALSender or Logical Decoding would do. It can also run during
+ * recovery, when KnownPreparedRecreateFiles() moves prepare records to files.
  */
 static void
 XlogReadTwoPhaseData(XLogRecPtr lsn, char **buf, int *len)
@@ -1252,8 +1281,6 @@ XlogReadTwoPhaseData(XLogRecPtr lsn, char **buf, int *len)
        XLogReaderState *xlogreader;
        char       *errormsg;
 
-       Assert(!RecoveryInProgress());
-
        xlogreader = XLogReaderAllocate(&read_local_xlog_page, NULL);
        if (!xlogreader)
                ereport(ERROR,
@@ -1691,6 +1718,15 @@ PrescanPreparedTransactions(TransactionId **xids_p, int 
*nxids_p)
        int                     nxids = 0;
        int                     allocsize = 0;
 
+       /*
+        * Move prepared transactions from KnownPreparedList to files, if any.
+        * It is possible to skip that step and teach subsequent code about
+        * KnownPreparedList, but whole PrescanPreparedTransactions() happens
+        * once during end of recovery or promote, so probably it isn't worth
+        * complications.
+        */
+       KnownPreparedRecreateFiles(InvalidXLogRecPtr);
+
        cldir = AllocateDir(TWOPHASE_DIR);
        while ((clde = ReadDir(cldir, TWOPHASE_DIR)) != NULL)
        {
@@ -2162,3 +2198,111 @@ RecordTransactionAbortPrepared(TransactionId xid,
         */
        SyncRepWaitForLSN(recptr, false);
 }
+
+/*
+ * KnownPreparedAdd.
+ *
+ * Record the xid and start/end LSN of a PREPARE record in KnownPreparedList,
+ * so the transaction is known at end of recovery if no 2PC file exists yet.
+ * NOTE(review): entry is palloc'd in the current context; confirm its lifetime.
+ */
+void
+KnownPreparedAdd(XLogReaderState *record)
+{
+       KnownPreparedXact *xact;
+       TwoPhaseFileHeader *hdr = (TwoPhaseFileHeader *) XLogRecGetData(record);
+
+       Assert(RecoveryInProgress());
+
+       xact = (KnownPreparedXact *) palloc(sizeof(KnownPreparedXact));
+       xact->xid = hdr->xid;
+       xact->prepare_start_lsn = record->ReadRecPtr;
+       xact->prepare_end_lsn = record->EndRecPtr;
+
+       dlist_push_tail(&KnownPreparedList, &xact->list_node);
+}
+
+/*
+ * KnownPreparedRemoveByXid
+ *
+ * Forget about a prepared transaction. Called during commit/abort redo:
+ * frees its KnownPreparedList entry if present, else removes its 2PC file.
+ */
+void
+KnownPreparedRemoveByXid(TransactionId xid)
+{
+       dlist_mutable_iter miter;
+
+       Assert(RecoveryInProgress());
+
+       dlist_foreach_modify(miter, &KnownPreparedList)
+       {
+               KnownPreparedXact   *xact = dlist_container(KnownPreparedXact,
+                                                           list_node, miter.cur);
+
+               if (xact->xid == xid)
+               {
+                       /*
+                        * Still in the list, so no 2PC file was written; free the entry.
+                        */
+                       dlist_delete(miter.cur);
+                       pfree(xact);
+                       return;
+               }
+       }
+
+       /*
+        * Not in the list, so the file should already have been moved to disk.
+        * A missing file is not worth aborting recovery: giveWarning=false.
+        */
+       RemoveTwoPhaseFile(xid, false);
+}
+
+/*
+ * KnownPreparedRecreateFiles
+ *
+ * Moves prepare records from WAL to 2PC files and frees their list entries.
+ * Called during checkpoint replay or PrescanPreparedTransactions.
+ *
+ * Only entries whose PREPARE record ends at or before redo_horizon are moved;
+ * redo_horizon = InvalidXLogRecPtr moves every entry in KnownPreparedList.
+ */
+void
+KnownPreparedRecreateFiles(XLogRecPtr redo_horizon)
+{
+       dlist_mutable_iter miter;
+       int                     serialized_xacts = 0;
+
+       Assert(RecoveryInProgress());
+
+       TRACE_POSTGRESQL_TWOPHASE_CHECKPOINT_START();
+
+       dlist_foreach_modify(miter, &KnownPreparedList)
+       {
+               KnownPreparedXact   *xact = dlist_container(KnownPreparedXact,
+                                                           list_node, miter.cur);
+
+               if (xact->prepare_end_lsn <= redo_horizon || redo_horizon == InvalidXLogRecPtr)
+               {
+                       char       *buf;
+                       int                     len;
+
+                       XlogReadTwoPhaseData(xact->prepare_start_lsn, &buf, &len);
+                       RecreateTwoPhaseFile(xact->xid, buf, len);
+                       pfree(buf);
+                       dlist_delete(miter.cur);
+                       pfree(xact);    /* unlinked above; free it to avoid a leak */
+                       serialized_xacts++;
+               }
+       }
+
+       TRACE_POSTGRESQL_TWOPHASE_CHECKPOINT_DONE();
+
+       if (log_checkpoints && serialized_xacts > 0)
+               ereport(LOG,
+                               (errmsg_plural("%u two-phase state file was written "
+                                              "for long-running prepared transactions",
+                                              "%u two-phase state files were written "
+                                              "for long-running prepared transactions",
+                                              serialized_xacts, serialized_xacts)));
+}
diff --git a/src/backend/access/transam/xact.c 
b/src/backend/access/transam/xact.c
index d643216..b3e0238 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -5604,7 +5604,9 @@ xact_redo(XLogReaderState *record)
                        Assert(TransactionIdIsValid(parsed.twophase_xid));
                        xact_redo_commit(&parsed, parsed.twophase_xid,
                                                         record->EndRecPtr, 
XLogRecGetOrigin(record));
-                       RemoveTwoPhaseFile(parsed.twophase_xid, false);
+
+                       /* Delete KnownPrepared entry or 2PC file. */
+                       KnownPreparedRemoveByXid(parsed.twophase_xid);
                }
        }
        else if (info == XLOG_XACT_ABORT || info == XLOG_XACT_ABORT_PREPARED)
@@ -5624,14 +5626,20 @@ xact_redo(XLogReaderState *record)
                {
                        Assert(TransactionIdIsValid(parsed.twophase_xid));
                        xact_redo_abort(&parsed, parsed.twophase_xid);
-                       RemoveTwoPhaseFile(parsed.twophase_xid, false);
+
+                       /* Delete KnownPrepared entry or 2PC file. */
+                       KnownPreparedRemoveByXid(parsed.twophase_xid);
                }
        }
        else if (info == XLOG_XACT_PREPARE)
        {
-               /* the record contents are exactly the 2PC file */
-               RecreateTwoPhaseFile(XLogRecGetXid(record),
-                                                 XLogRecGetData(record), 
XLogRecGetDataLen(record));
+               /*
+                * If that transaction is not committed by the end of recovery then we
+                * will need a 2PC file (the record contents are exactly the 2PC file) to be able
+                * to commit it later.
+                * For now store the xid and pointers to that record in KnownPreparedList.
+                */
+               KnownPreparedAdd(record);
        }
        else if (info == XLOG_XACT_ASSIGNMENT)
        {
diff --git a/src/backend/access/transam/xlog.c 
b/src/backend/access/transam/xlog.c
index 084401d..9ac1fd7 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -9488,6 +9488,7 @@ xlog_redo(XLogReaderState *record)
                                        (errmsg("unexpected timeline ID %u 
(should be %u) in checkpoint record",
                                                        
checkPoint.ThisTimeLineID, ThisTimeLineID)));
 
+               KnownPreparedRecreateFiles(checkPoint.redo);
                RecoveryRestartPoint(&checkPoint);
        }
        else if (info == XLOG_END_OF_RECOVERY)
diff --git a/src/include/access/twophase.h b/src/include/access/twophase.h
index b7ce0c6..23be08f 100644
--- a/src/include/access/twophase.h
+++ b/src/include/access/twophase.h
@@ -15,6 +15,7 @@
 #define TWOPHASE_H
 
 #include "access/xlogdefs.h"
+#include "access/xlogreader.h"
 #include "datatype/timestamp.h"
 #include "storage/lock.h"
 
@@ -56,4 +57,8 @@ extern void CheckPointTwoPhase(XLogRecPtr redo_horizon);
 
 extern void FinishPreparedTransaction(const char *gid, bool isCommit);
 
+extern void KnownPreparedAdd(XLogReaderState *record);
+extern void KnownPreparedRemoveByXid(TransactionId xid);
+extern void KnownPreparedRecreateFiles(XLogRecPtr redo_horizon);
+
 #endif   /* TWOPHASE_H */
diff --git a/src/test/recovery/t/009_twophase.pl 
b/src/test/recovery/t/009_twophase.pl
new file mode 100644
index 0000000..27bccb3
--- /dev/null
+++ b/src/test/recovery/t/009_twophase.pl
@@ -0,0 +1,315 @@
+# Tests dedicated to two-phase commit in recovery
+use strict;
+use warnings;
+use PostgresNode;
+use TestLib;
+use Test::More tests => 13;
+
+# Setup master node
+my $node_master = get_new_node("master");
+$node_master->init(allows_streaming => 1);
+$node_master->append_conf('postgresql.conf', qq(
+       max_prepared_transactions = 10
+       log_checkpoints = true
+));
+$node_master->start;
+$node_master->backup('master_backup');
+$node_master->psql('postgres', "create table t(id int)");
+
+# Setup slave node
+my $node_slave = get_new_node('slave');
+$node_slave->init_from_backup($node_master, 'master_backup', has_streaming => 
1);
+$node_slave->start;
+
+# Switch to synchronous replication
+$node_master->append_conf('postgresql.conf', qq(
+       synchronous_standby_names = '*'
+));
+$node_master->psql('postgres', "select pg_reload_conf()");
+
+my $psql_out = '';
+my $psql_rc = '';
+
+###############################################################################
+# Check that we can commit and abort tx after soft restart.
+# Here checkpoint happens before shutdown and no WAL replay will occur at next
+# startup. In this case postgres re-creates shared-memory state from twophase
+# files.
+###############################################################################
+
+$node_master->psql('postgres', "
+       begin;
+       insert into t values (42);
+       savepoint s1;
+       insert into t values (43);
+       prepare transaction 'x';
+       begin;
+       insert into t values (142);
+       savepoint s1;
+       insert into t values (143);
+       prepare transaction 'y';");
+$node_master->stop;
+$node_master->start;
+
+$psql_rc = $node_master->psql('postgres', "commit prepared 'x'");
+is($psql_rc, '0', 'Commit prepared transaction after restart.');
+
+$psql_rc = $node_master->psql('postgres', "rollback prepared 'y'");
+is($psql_rc, '0', 'Rollback prepared transaction after restart.');
+
+###############################################################################
+# Check that we can commit and abort after hard restart.
+# At next startup, WAL replay will re-create shared memory state for prepared
+# transaction using dedicated WAL records.
+###############################################################################
+
+$node_master->psql('postgres', "
+       checkpoint;
+       begin;
+       insert into t values (42);
+       savepoint s1;
+       insert into t values (43);
+       prepare transaction 'x';
+       begin;
+       insert into t values (142);
+       savepoint s1;
+       insert into t values (143);
+       prepare transaction 'y';");
+$node_master->teardown_node;
+$node_master->start;
+
+$psql_rc = $node_master->psql('postgres', "commit prepared 'x'");
+is($psql_rc, '0', 'Commit prepared tx after teardown.');
+
+$psql_rc = $node_master->psql('postgres', "rollback prepared 'y'");
+is($psql_rc, '0', 'Rollback prepared transaction after teardown.');
+
+###############################################################################
+# Check that WAL replay can handle several transactions with same name GID.
+###############################################################################
+
+$node_master->psql('postgres', "
+       checkpoint;
+       begin;
+       insert into t values (42);
+       savepoint s1;
+       insert into t values (43);
+       prepare transaction 'x';
+       commit prepared 'x';
+       begin;
+       insert into t values (42);
+       savepoint s1;
+       insert into t values (43);
+       prepare transaction 'x';");
+$node_master->teardown_node;
+$node_master->start;
+
+$psql_rc = $node_master->psql('postgres', "commit prepared 'x'");
+is($psql_rc, '0', 'Replay several transactions with same GID.');
+
+###############################################################################
+# Check that WAL replay cleans up its shared memory state and releases locks
+# while replaying transaction commits.
+###############################################################################
+
+$node_master->psql('postgres', "
+       begin;
+       insert into t values (42);
+       savepoint s1;
+       insert into t values (43);
+       prepare transaction 'x';
+       commit prepared 'x';");
+$node_master->teardown_node;
+$node_master->start;
+$psql_rc = $node_master->psql('postgres', "begin;
+       insert into t values (42);
+       savepoint s1;
+       insert into t values (43);
+       -- This prepare can fail due to conflicting GID or locks conflicts if
+       -- replay did not fully cleanup its state on previous commit.
+       prepare transaction 'x';");
+is($psql_rc, '0', "Cleanup of shared memory state for 2PC commit");
+
+$node_master->psql('postgres', "commit prepared 'x'");
+
+###############################################################################
+# Check that WAL replay will cleanup its shared memory state on running slave.
+###############################################################################
+
+$node_master->psql('postgres', "
+       begin;
+       insert into t values (42);
+       savepoint s1;
+       insert into t values (43);
+       prepare transaction 'x';
+       commit prepared 'x';");
+$node_slave->psql('postgres', "select count(*) from pg_prepared_xacts;",
+         stdout => \$psql_out);
+is($psql_out, '0',
+   "Cleanup of shared memory state on running standby without checkpoint.");
+
+###############################################################################
+# Same as in previous case, but let's force checkpoint on slave between
+# prepare and commit to use on-disk twophase files.
+###############################################################################
+
+$node_master->psql('postgres', "
+       begin;
+       insert into t values (42);
+       savepoint s1;
+       insert into t values (43);
+       prepare transaction 'x';");
+$node_slave->psql('postgres', "checkpoint;");
+$node_master->psql('postgres', "commit prepared 'x';");
+$node_slave->psql('postgres', "select count(*) from pg_prepared_xacts;",
+         stdout => \$psql_out);
+is($psql_out, '0',
+   "Cleanup of shared memory state on running standby after checkpoint.");
+
+###############################################################################
+# Check that prepared transactions can be committed on promoted slave.
+###############################################################################
+
+$node_master->psql('postgres', "
+       begin;
+       insert into t values (42);
+       savepoint s1;
+       insert into t values (43);
+       prepare transaction 'x';");
+$node_master->teardown_node;
+$node_slave->promote;
+$node_slave->poll_query_until('postgres', "SELECT pg_is_in_recovery() <> 
true");
+
+$psql_rc = $node_slave->psql('postgres', "commit prepared 'x';");
+is($psql_rc, '0', "Restore of prepared transaction on promoted slave.");
+
+# change roles
+($node_master, $node_slave) = ($node_slave, $node_master);
+$node_slave->enable_streaming($node_master);
+$node_slave->append_conf('recovery.conf', qq(
+recovery_target_timeline='latest'
+));
+$node_slave->start;
+
+###############################################################################
+# Check that prepared transactions are replayed after soft restart of standby
+# while master is down. Since standby knows that master is down it uses
+# different code path on start to be sure that the status of transactions is
+# consistent.
+###############################################################################
+
+$node_master->psql('postgres', "
+       begin;
+       insert into t values (42);
+       savepoint s1;
+       insert into t values (43);
+       prepare transaction 'x';");
+$node_master->stop;
+$node_slave->restart;
+$node_slave->promote;
+$node_slave->poll_query_until('postgres', "SELECT pg_is_in_recovery() <> 
true");
+
+$node_slave->psql('postgres', "select count(*) from pg_prepared_xacts",
+         stdout => \$psql_out);
+is($psql_out, '1',
+   "Restore prepared transactions from files with master down.");
+
+# restore state
+($node_master, $node_slave) = ($node_slave, $node_master);
+$node_slave->enable_streaming($node_master);
+$node_slave->append_conf('recovery.conf', qq(
+recovery_target_timeline='latest'
+));
+$node_slave->start;
+$node_master->psql('postgres', "commit prepared 'x'");
+
+###############################################################################
+# Check that prepared transactions are correctly replayed after slave hard
+# restart while master is down.
+###############################################################################
+
+$node_master->psql('postgres', "
+       begin;
+       insert into t values (242);
+       savepoint s1;
+       insert into t values (243);
+       prepare transaction 'x';
+       ");
+$node_master->stop;
+$node_slave->teardown_node;
+$node_slave->start;
+$node_slave->promote;
+$node_slave->poll_query_until('postgres',
+         "SELECT pg_is_in_recovery() <> true");
+
+$node_slave->psql('postgres', "select count(*) from pg_prepared_xacts",
+         stdout => \$psql_out);
+is($psql_out, '1',
+   "Restore prepared transactions from records with master down.");
+
+# restore state
+($node_master, $node_slave) = ($node_slave, $node_master);
+$node_slave->enable_streaming($node_master);
+$node_slave->append_conf('recovery.conf', qq(
+recovery_target_timeline='latest'
+));
+$node_slave->start;
+$node_master->psql('postgres', "commit prepared 'x'");
+
+
+###############################################################################
+# Check for a lock conflict between prepared tx with DDL inside and replay of
+# XLOG_STANDBY_LOCK wal record.
+###############################################################################
+
+$node_master->psql('postgres', "
+       begin;
+       create table t2(id int);
+       savepoint s1;
+       insert into t2 values (42);
+       prepare transaction 'x';
+       -- checkpoint will issue XLOG_STANDBY_LOCK that can conflict with lock
+       -- held by 'create table' statement
+       checkpoint;
+       commit prepared 'x';");
+
+$node_slave->psql('postgres', "select count(*) from pg_prepared_xacts",
+         stdout => \$psql_out);
+is($psql_out, '0', "Replay prepared transaction with DDL.");
+
+
+###############################################################################
+# Check that replay will correctly set SUBTRANS and properly advance nextXid
+# so it won't conflict with savepoint xids.
+###############################################################################
+
+$node_master->psql('postgres', "
+       begin;
+       delete from t;
+       insert into t values (43);
+       savepoint s1;
+       insert into t values (43);
+       savepoint s2;
+       insert into t values (43);
+       savepoint s3;
+       insert into t values (43);
+       savepoint s4;
+       insert into t values (43);
+       savepoint s5;
+       insert into t values (43);
+       prepare transaction 'x';
+       checkpoint;");
+
+$node_master->stop;
+$node_master->start;
+$node_master->psql('postgres', "
+       -- here we can get xid of previous savepoint if nextXid
+       -- wasn't properly advanced
+       begin;
+       insert into t values (142);
+       abort;
+       commit prepared 'x';");
+
+$node_master->psql('postgres', "select count(*) from t",
+         stdout => \$psql_out);
+is($psql_out, '6', "Check nextXid handling for prepared subtransactions");

-- 
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Re: [HACKERS] Speedup twophase transactions

Reply via email to