MonetDB: wlcr - A new, simplified replication interface.

Martin Kersten Tue, 14 Feb 2017 13:49:32 -0800

Changeset: 6d5dbd832675 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=6d5dbd832675
Modified Files:
        monetdb5/mal/Makefile.ag
        monetdb5/mal/mal.c
        monetdb5/mal/mal_client.c
        monetdb5/mal/mal_client.h
        monetdb5/modules/mal/wlcr.c
        monetdb5/modules/mal/wlcr.h
        monetdb5/modules/mal/wlcr.mal
        monetdb5/optimizer/opt_wlcr.c
        sql/backends/monet5/sql_scenario.c
        sql/backends/monet5/sql_wlcr.c
        sql/backends/monet5/sql_wlcr.h
        sql/backends/monet5/sql_wlcr.mal
        sql/scripts/60_wlcr.sql
        sql/test/wlcr/Tests/All
        sql/test/wlcr/Tests/wlc01.py
        sql/test/wlcr/Tests/wlr01.py
        sql/test/wlcr/Tests/wlr01.stable.err
        sql/test/wlcr/Tests/wlr01.stable.out
        sql/test/wlcr/Tests/wlr20.py
        sql/test/wlcr/Tests/wlr20.stable.err
        sql/test/wlcr/Tests/wlr20.stable.out
        sql/test/wlcr/Tests/wlr30.py
        sql/test/wlcr/Tests/wlr40.py
Branch: wlcr
Log Message:


A new, simplified replication interface.
- the replication process can be controlled by the transaction id
- masterClock(), replicaClock(), replicaBacklog() to inspect state.
- focus on update queries only.


diffs (truncated from 2251 to 300 lines):

diff --git a/monetdb5/mal/Makefile.ag b/monetdb5/mal/Makefile.ag
--- a/monetdb5/mal/Makefile.ag
+++ b/monetdb5/mal/Makefile.ag
@@ -10,6 +10,7 @@ INCLUDES = ../../common/options \
                   ../../clients/mapilib \
                   ../../gdk \
                   ../optimizer \
+                  ../modules/mal \
                   $(openssl_CFLAGS)
 MTSAFE
 
diff --git a/monetdb5/mal/mal.c b/monetdb5/mal/mal.c
--- a/monetdb5/mal/mal.c
+++ b/monetdb5/mal/mal.c
@@ -36,6 +36,7 @@ int have_hge;
 #include "mal_runtime.h"
 #include "mal_resource.h"
 #include "opt_statistics.h"
+#include "wlcr.h"
 
 MT_Lock     mal_contextLock MT_LOCK_INITIALIZER("mal_contextLock");
 MT_Lock     mal_namespaceLock MT_LOCK_INITIALIZER("mal_namespaceLock");
@@ -124,6 +125,7 @@ void mserver_reset(int exit)
        str err = 0;
 
        GDKprepareExit();
+       WLCreset();
        MCstopClients(0);
        setHeartbeat(-1);
        stopProfiler();
diff --git a/monetdb5/mal/mal_client.c b/monetdb5/mal/mal_client.c
--- a/monetdb5/mal/mal_client.c
+++ b/monetdb5/mal/mal_client.c
@@ -243,7 +243,6 @@ MCinitClientRecord(Client c, oid user, b
        c->exception_buf_initialized = 0;
        c->error_row = c->error_fld = c->error_msg = c->error_input = NULL;
        c->wlcr_kind = 0;
-       c->wlcr_mode = 0;
        c->wlcr = NULL;
 #ifndef HAVE_EMBEDDED /* no authentication in embedded mode */
        {
@@ -400,7 +399,6 @@ freeClient(Client c)
                if( c->wlcr)
                        freeMalBlk(c->wlcr);
                c->wlcr_kind = 0;
-               c->wlcr_mode = 0;
                c->wlcr = NULL;
        }
        if (t)
diff --git a/monetdb5/mal/mal_client.h b/monetdb5/mal/mal_client.h
--- a/monetdb5/mal/mal_client.h
+++ b/monetdb5/mal/mal_client.h
@@ -177,7 +177,6 @@ typedef struct CLIENT {
         * This allows a single server to act as both a master and a replica.
         */
        int wlcr_kind;  // used by master to characterise the compound 
transaction
-       int wlcr_mode;  // used by replica to control rerunning the transaction
        MalBlkPtr wlcr;
 
        /*      
diff --git a/monetdb5/modules/mal/wlcr.c b/monetdb5/modules/mal/wlcr.c
--- a/monetdb5/modules/mal/wlcr.c
+++ b/monetdb5/modules/mal/wlcr.c
@@ -9,118 +9,127 @@
 /*
  * (c) 2017 Martin Kersten
  * This module collects the workload-capture-replay statements during 
transaction execution,
- * also known as asynchronous logical replication management.
+ * also known as asynchronous logical replication management. It can be used 
for
+ * multiple purposes: BACKUP, REPLICATION, and REPLAY
  *
- * The goal is to easily clone a master database.  
+ * For a BACKUP we need either a complete update log from the beginning, or
+ * a binary snapshot with a collection of logs recording its changes since.
+ * To ensure transaction ACID properties, the log record should be stored on
+ * disk within the transaction brackets, which may cause a serious IO load.
+ * (Tip: store these log files on an SSD or NVM)
  *
+ * For REPLICATION, also called a database clone or slave, we take a snapshot 
and the
+ * log files that reflect the recent changes. The log updates are replayed 
against
+ * the snapshot until a specific time point is reached. 
+ * 
+ * Some systems also use the logical logs to REPLAY all (expensive) queries
+ * against the database. 
+ *
+ * The goal of this module is to ease BACKUP and REPLICATION of a master 
database 
+ * with a time-bounded delay.
+ * Such a clone is a database replica that aids in query workload sharing,
+ * database versioning, and (re-)partitioning.
+ *
+ * Simplicity and ease of end-user control has been the driving argument here.
  *
  * IMPLEMENTATION
  * The underlying assumption of the techniques deployed is that the database
- * resides on a proper (global) file system to guarantees recovery from most
- * storage system related failures. Such as RAID disks.
- * Furthermore, when deployed in a Cloud setting, the database recides in the
- * global file system.
+ * resides on a proper (global/distributed) file system to guarantee recovery 
+ * from most storage system related failures, e.g. using RAID disks or 
LSFsystems.
  *
- * A database can be set once into 'master' mode only once using the SQL 
command:
+ * A database can be set into 'master' mode only once using the SQL command:
  * CALL master()
+ * An alternative path to the log records can be given to reduce the storage 
cost,
+ * e.g. a nearby SSD.
+ * By default, it creates a directory .../dbfarm/dbname/master to hold all 
+ * necessary information for the creation of a database clone.
  *
- * It creates a directory .../dbfarm/dbname/master to hold all necessary 
information
- * for the creation and maintenance of replicas.
- * A configuration file is added to keep track on the status of the master.
+ * A master configuration file is added to the database directory to keep the 
state.
  * It contains the following key=value pairs:
- *             snapshot=<path to a binary snapshot>
+ *             snapshot=<path to a snapshot directory>
  *             logs=<path to the wlcr log directory>
- *             state=<started, paused,(resume), stopped>
- *             firstbatch=<first batch file to be applied>
- *             batches=<last batch file to be applied>
- *             drift=<maximal delay before transactions are seen globally, in 
seconds>
- *             threshold=<min response time for queries to be kept>
- *             rollbock=<flag to indicate keeping the aborted transactions as 
well>
+ *             state=<started, stopped>
+ *             batches=<next available batch file to be applied>
+ *             drift=<maximal delay before transactions are published as a 
separate log, in seconds>
+ *             write=<timestamp of the last transaction recorded>
  *
- * Every replica should start off with a copy of binary snapshot identified by 
'snapshot'
- * by default stored in .../dbfarm/dbname/master/bat. An alternative path can 
be given
- * to reduce the storage cost at the expense of slower recovery time (e.g. AWS 
glacier).
- * A missing path to the snapshot denotes that we can start rebuilding with an 
empty database instead.
- * The log files are stored as master/<dbname>_<batchnumber>.
+ * A missing path to the snapshot denotes that we can start the clone with an 
empty database.
+ * The log files are stored as master/<dbname>_<batchnumber>. They belong to 
the snapshot.
  * 
  * Each wlcr log file contains a serial log of committed compound transactions.
  * The log records are represented as ordinary MAL statement blocks, which
  * are executed in serial mode. (parallelism can be considered for large 
updates later)
- * Each transaction job is identified by the owner of the query, 
- * commit/rollback status, its starting time and runtime (in ms).
+ * Each transaction job is identified by the owner of the query, its starting 
time and runtime (in ms).
+ * The log-record should end with a commit.
  *
- * Update queries are always logged and pure queries can be limited to those 
- * that satisfy an minimum execution threshold.
- * CALL logthreshold(duration)
- * The threshold is given in milliseconds. 
- * The threshold setting is saved and affects all future master log records.
- * The default for a production system version should be set to -1, which 
ignores all pure queries.
- *
- * The aborted transactions can also be gathered using the call
- * CALL logrollback(1);
- * Such queries may be helpful in the analysis of transactions with failures.
- *
- * A transaction log is owned by the master. He decides when the log may be 
globally used.
- * The trigger for this is the allowed drift. A new transaction log is created 
when
+ * A transaction log is created by the master. He decides when the log may be 
globally used.
+ * The trigger for this is the allowed 'drift'. A new transaction log is 
created when
  * the system has been collecting logs for some time (drift in seconds).
  * The drift determines the maximal window of transactions loss that is 
permitted.
- * The maximum drift can be set using a SQL command. Setting it to zero leads 
to a
- * log file per transaction and may cause a large overhead for short running 
transactions.
+ * The maximum drift can be set using a SQL command, e.g.
+ * CALL drift(duration)
+ * Setting it to zero leads to a log file per transaction and may cause a 
large log directory.
+ * A default of 5 minutes should balance polling overhead.
  *
  * A minor problem here is that we should ensure that the log file is closed 
even if there
- * are no transactions running. It is solved with a separate monitor thread.
- * After closing, the replicas can see from the master configuration file that 
a log is available.
+ * are no transactions running. It is solved with a separate monitor thread, 
which ensures
+ * that the logs are flushed at least after 'drift' seconds since the first 
logrecord was created.
+ * After closing, the replicas can see from the master configuration file that 
a new log batch is available.
  *
- * The transaction loggin can be temporarily paused using the command
- * CALL master(2)
- * This mode should be uses sparingly. For example if you plan to perform a 
COPY INTO LOCKED mode
- * and want to avoid an avalanche of update records.
- *
- * Logging is resumed using the command 
- * CALL master(3)
- * A warning is issued when during the suspension update transactions have 
been issued.
- * The final step is to close transaction logging with the command
- * CALL master(4).
- * It typically is the end-of-life-time for a snapshot and its log files.
+ * The final step is to stop transaction logging with the command
+ * CALL stopmaster.
+ * It typically is the end-of-life-time for a snapshot. For example, when 
planning to do
+ * a large bulk load of the database, stopping logging avoids a double write 
into the
+ * database. The database can be brought back into wlcr mode using a fresh 
snapshot.
  *
  *[TODO] A more secure way to set a database into master mode is to use the 
command
  *      monetdb master <dbname> [ <optional snapshot path>]
  * which locks the database, takes a save copy, initializes the state chance. 
  *
  * A fresh replica can be constructed as follows:
- *     monetdb replica <dbname> <mastername>
+ *     monetdb replicate <dbname> <mastername>
  *
  * Instead of using the monetdb command line we can use the SQL calls directly
  * master() and replicate(), provided we start with a fresh database.
  *
- * REPLICAS
+ * CLONE
  *
- * A fresh database can be turned into a replica using the call
- * CALL replicate("mastername")
+ * Every clone should start off with a copy of the binary snapshot identified 
by 'snapshot'.
+ * A fresh database can be turned into a clone using the call
+ * CALL replicate('mastername')
  * It will grab the latest snapshot of the master and applies all
- * known log files before releasing the system. Progress of
+ * available log files before releasing the system. Progress of
  * the replication can be monitored using the -fraw option in mclient.
+ * The master has no knowledge about the number of clones and their 
whereabouts.
  *
- * It will iterate through the log files, applying all transactions.
- * It excludes catalog and update queries, which are always executed.
- * Queries are simply ignored.
+ * The clone process will iterate in the background through the log files, 
+ * applying all update transactions.
  *
- * The alternative is to also replay the queries .
- * CALL replaythreshold(threshold)
- * In this mode all pure queries are also executed for which the reported 
threshold exceeds the argument.
- * Enabling the query log collects the execution times for these queries.
+ * An optional timestamp or transaction id can be added to apply the logs until
+ * a given moment. This is particularly handy when an unexpected 
+ * disastrous user action (drop persistent table) has to be recovered from.
+ *
+ * CALL replicate('mastername');
+ * CALL replicate('mastername',NOW()); -- stops after we are in sync
+ * ...
+ * CALL replicate(NOW()); -- partial roll forward
+ * ...
+ * CALL replicate(); --continue nondisturbed
+ *
+ * SELECT replicaClock();
+ * returns the timestamp of the last replicated transaction.
+ * SELECT replicaBacklog();
+ * returns the number of pending transactions to be in sync with the master.
+ * SELECT masterClock();
+ * returns the timestamp of the last committed transaction in the master.
  *
  * Any failure encountered during a log replay terminates the replication 
process,
- * leaving a message in the merovingian log.
- *
- * The replica creation can be suspended at the master and at the clone.
- * It will continue after the corresponding resume* operation is issues.
+ * leaving a message in the merovingian log configuration.
  *
  * The wlcr files purposely have a textual format derived from the MAL 
statements.
- * Simplicity and ease of control has been the driving argument here.
+ * This provides a stepping stone for remote execution later.
  *
  * [TODO] consider the roll forward of SQL session variables, i.e. 
optimizer_pipe (for now assume default pipe).
- * [TODO] The status of the master/replica should be accessible for inspection
  * [TODO] the user might want to indicate a time-stamp, to rebuild to a 
certain point
  *
  */
@@ -129,23 +138,20 @@
 #include "mal_builder.h"
 #include "wlcr.h"
 
-static MT_Lock     wlcr_lock MT_LOCK_INITIALIZER("wlcr_lock");
+MT_Lock     wlcr_lock MT_LOCK_INITIALIZER("wlcr_lock");
 
-static str wlcr_snapshot= 0; // The location of the snapshot against which the 
logs work
-static str wlcr_logs = 0;      // The location in the global file store for 
the logs
-static stream *wlcr_fd = 0;
-static int wlcr_start = 0;     // time stamp of first transaction in log file
-static int wlcr_state = 0;     // The current status of the in the life cycle
-static int wlcr_tag = 0;       // number of database chancing transactions
-static int wlcr_pausetag = 0;  // number of database chancing transactions 
when pausing
+static char wlc_snapshot[PATHLENGTH]; // The location of the snapshot against 
which the logs work
+static lng wlc_start= 0;                       // Start time of first 
transaction
+static stream *wlc_fd = 0;
 
 // These properties are needed by the replica to direct the roll-forward.
-int wlcr_threshold = 0;                // should be set to -1 for production
-str wlcr_dbname = 0;           // The master database name
-int wlcr_firstbatch = 0;       // first log file  associated with the snapshot
-int wlcr_batches = 0;          // identifier of next batch
-int wlcr_drift = 10;           // maximal period covered by a single log file 
in seconds
-int wlcr_rollback= 0;          // also log the aborted queries.
+char wlc_dir[PATHLENGTH];      // The location in the global file store for 
the logs
+char wlc_name[IDLENGTH];       // The master database name
+lng   wlc_id = 0;                      // next transaction id
+int  wlc_state = 0;                    // The current status of the in the 
life cycle
+char wlc_write[26];                    // The timestamp of the last committed 
transaction
+int  wlc_batches = 0;          // identifier of next batch
+int  wlc_drift = 10;           // maximal period covered by a single log file 
in seconds
 
 /* The database snapshots are binary copies of the dbfarm/database/bat
  * New snapshots are created currently using the 'monetdb snapshot <db>' 
command
@@ -157,67 +163,67 @@ int wlcr_rollback= 0;             // also log the a
 int
 WLCused(void)
 {
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

MonetDB: wlcr - A new, simplified replication interface.

Reply via email to