Changeset: 6d5dbd832675 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=6d5dbd832675 Modified Files: monetdb5/mal/Makefile.ag monetdb5/mal/mal.c monetdb5/mal/mal_client.c monetdb5/mal/mal_client.h monetdb5/modules/mal/wlcr.c monetdb5/modules/mal/wlcr.h monetdb5/modules/mal/wlcr.mal monetdb5/optimizer/opt_wlcr.c sql/backends/monet5/sql_scenario.c sql/backends/monet5/sql_wlcr.c sql/backends/monet5/sql_wlcr.h sql/backends/monet5/sql_wlcr.mal sql/scripts/60_wlcr.sql sql/test/wlcr/Tests/All sql/test/wlcr/Tests/wlc01.py sql/test/wlcr/Tests/wlr01.py sql/test/wlcr/Tests/wlr01.stable.err sql/test/wlcr/Tests/wlr01.stable.out sql/test/wlcr/Tests/wlr20.py sql/test/wlcr/Tests/wlr20.stable.err sql/test/wlcr/Tests/wlr20.stable.out sql/test/wlcr/Tests/wlr30.py sql/test/wlcr/Tests/wlr40.py Branch: wlcr Log Message:
A new, simplified replication interface. - the replication process can be controlled by the transaction id - masterClock(), replicaClock(), replicaBacklog() to inspect state. - focus on update queries only. diffs (truncated from 2251 to 300 lines): diff --git a/monetdb5/mal/Makefile.ag b/monetdb5/mal/Makefile.ag --- a/monetdb5/mal/Makefile.ag +++ b/monetdb5/mal/Makefile.ag @@ -10,6 +10,7 @@ INCLUDES = ../../common/options \ ../../clients/mapilib \ ../../gdk \ ../optimizer \ + ../modules/mal \ $(openssl_CFLAGS) MTSAFE diff --git a/monetdb5/mal/mal.c b/monetdb5/mal/mal.c --- a/monetdb5/mal/mal.c +++ b/monetdb5/mal/mal.c @@ -36,6 +36,7 @@ int have_hge; #include "mal_runtime.h" #include "mal_resource.h" #include "opt_statistics.h" +#include "wlcr.h" MT_Lock mal_contextLock MT_LOCK_INITIALIZER("mal_contextLock"); MT_Lock mal_namespaceLock MT_LOCK_INITIALIZER("mal_namespaceLock"); @@ -124,6 +125,7 @@ void mserver_reset(int exit) str err = 0; GDKprepareExit(); + WLCreset(); MCstopClients(0); setHeartbeat(-1); stopProfiler(); diff --git a/monetdb5/mal/mal_client.c b/monetdb5/mal/mal_client.c --- a/monetdb5/mal/mal_client.c +++ b/monetdb5/mal/mal_client.c @@ -243,7 +243,6 @@ MCinitClientRecord(Client c, oid user, b c->exception_buf_initialized = 0; c->error_row = c->error_fld = c->error_msg = c->error_input = NULL; c->wlcr_kind = 0; - c->wlcr_mode = 0; c->wlcr = NULL; #ifndef HAVE_EMBEDDED /* no authentication in embedded mode */ { @@ -400,7 +399,6 @@ freeClient(Client c) if( c->wlcr) freeMalBlk(c->wlcr); c->wlcr_kind = 0; - c->wlcr_mode = 0; c->wlcr = NULL; } if (t) diff --git a/monetdb5/mal/mal_client.h b/monetdb5/mal/mal_client.h --- a/monetdb5/mal/mal_client.h +++ b/monetdb5/mal/mal_client.h @@ -177,7 +177,6 @@ typedef struct CLIENT { * This allows a single server to act as both a master and a replica. 
*/ int wlcr_kind; // used by master to characterise the compound transaction - int wlcr_mode; // used by replica to control rerunning the transaction MalBlkPtr wlcr; /* diff --git a/monetdb5/modules/mal/wlcr.c b/monetdb5/modules/mal/wlcr.c --- a/monetdb5/modules/mal/wlcr.c +++ b/monetdb5/modules/mal/wlcr.c @@ -9,118 +9,127 @@ /* * (c) 2017 Martin Kersten * This module collects the workload-capture-replay statements during transaction execution, - * also known as asynchronous logical replication management. + * also known as asynchronous logical replication management. It can be used for + * multiple purposes: BACKUP, REPLICATION, and REPLAY * - * The goal is to easily clone a master database. + * For a BACKUP we need either a complete update log from the beginning, or + * a binary snapshot with a collection of logs recording its changes since. + * To ensure transaction ACID properties, the log record should be stored on + * disk within the transaction brackets, which may cause a serious IO load. + * (Tip, store these log files on an SSD or NVM) * + * For REPLICATION, also called a database clone or slave, we take a snapshot and the + * log files that reflect the recent changes. The log updates are replayed against + * the snapshot until a specific time point is reached. + * + * Some systems also use the logical logs to REPLAY all (expensive) queries + * against the database. + * + * The goal of this module is to ease BACKUP and REPLICATION of a master database + * with a time-bounded delay. + * Such a clone is a database replica that aids in query workload sharing, + * database versioning, and (re-)partitioning. + * + * Simplicity and ease of end-user control has been the driving argument here. * * IMPLEMENTATION * The underlying assumption of the techniques deployed is that the database - * resides on a proper (global) file system to guarantees recovery from most - * storage system related failures. Such as RAID disks. 
- * Furthermore, when deployed in a Cloud setting, the database recides in the - * global file system. + * resides on a proper (global/distributed) file system to guarantee recovery + * from most storage system related failures, e.g. using RAID disks or LSFsystems. * - * A database can be set once into 'master' mode only once using the SQL command: + * A database can be set into 'master' mode only once using the SQL command: * CALL master() + * An alternative path to the log records can be given to reduce the storage cost, + * e.g. a nearby SSD. + * By default, it creates a directory .../dbfarm/dbname/master to hold all + * necessary information for the creation of a database clone. * - * It creates a directory .../dbfarm/dbname/master to hold all necessary information - * for the creation and maintenance of replicas. - * A configuration file is added to keep track on the status of the master. + * A master configuration file is added to the database directory to keep the state. * It contains the following key=value pairs: - * snapshot=<path to a binary snapshot> + * snapshot=<path to a snapshot directory> * logs=<path to the wlcr log directory> - * state=<started, paused,(resume), stopped> - * firstbatch=<first batch file to be applied> - * batches=<last batch file to be applied> - * drift=<maximal delay before transactions are seen globally, in seconds> - * threshold=<min response time for queries to be kept> - * rollbock=<flag to indicate keeping the aborted transactions as well> + * state=<started, stopped> + * batches=<next available batch file to be applied> + * drift=<maximal delay before transactions are published as a separate log, in seconds> + * write=<timestamp of the last transaction recorded> * - * Every replica should start off with a copy of binary snapshot identified by 'snapshot' - * by default stored in .../dbfarm/dbname/master/bat. An alternative path can be given - * to reduce the storage cost at the expense of slower recovery time (e.g. 
AWS glacier). - * A missing path to the snapshot denotes that we can start rebuilding with an empty database instead. - * The log files are stored as master/<dbname>_<batchnumber>. + * A missing path to the snapshot denotes that we can start the clone with an empty database. + * The log files are stored as master/<dbname>_<batchnumber>. They belong to the snapshot. * * Each wlcr log file contains a serial log of committed compound transactions. * The log records are represented as ordinary MAL statement blocks, which * are executed in serial mode. (parallelism can be considered for large updates later) - * Each transaction job is identified by the owner of the query, - * commit/rollback status, its starting time and runtime (in ms). + * Each transaction job is identified by the owner of the query, its starting time and runtime (in ms). + * The log-record should end with a commit. * - * Update queries are always logged and pure queries can be limited to those - * that satisfy an minimum execution threshold. - * CALL logthreshold(duration) - * The threshold is given in milliseconds. - * The threshold setting is saved and affects all future master log records. - * The default for a production system version should be set to -1, which ignores all pure queries. - * - * The aborted transactions can also be gathered using the call - * CALL logrollback(1); - * Such queries may be helpful in the analysis of transactions with failures. - * - * A transaction log is owned by the master. He decides when the log may be globally used. - * The trigger for this is the allowed drift. A new transaction log is created when + * A transaction log is created by the master. He decides when the log may be globally used. + * The trigger for this is the allowed 'drift'. A new transaction log is created when * the system has been collecting logs for some time (drift in seconds). * The drift determines the maximal window of transactions loss that is permitted. 
- * The maximum drift can be set using a SQL command. Setting it to zero leads to a - * log file per transaction and may cause a large overhead for short running transactions. + * The maximum drift can be set using a SQL command, e.g. + * CALL drift(duration) + * Setting it to zero leads to a log file per transaction and may cause a large log directory. + * A default of 5 minutes should balance polling overhead. * * A minor problem here is that we should ensure that the log file is closed even if there - * are no transactions running. It is solved with a separate monitor thread. - * After closing, the replicas can see from the master configuration file that a log is available. + * are no transactions running. It is solved with a separate monitor thread, which ensures + * that the logs are flushed at least after 'drift' seconds since the first log record was created. + * After closing, the replicas can see from the master configuration file that a new log batch is available. * - * The transaction loggin can be temporarily paused using the command - * CALL master(2) - * This mode should be uses sparingly. For example if you plan to perform a COPY INTO LOCKED mode - * and want to avoid an avalanche of update records. - * - * Logging is resumed using the command - * CALL master(3) - * A warning is issued when during the suspension update transactions have been issued. - * The final step is to close transaction logging with the command - * CALL master(4). - * It typically is the end-of-life-time for a snapshot and its log files. + * The final step is to stop transaction logging with the command + * CALL stopmaster. + * It typically is the end-of-life-time for a snapshot. For example, when planning to do + * a large bulk load of the database, stopping logging avoids a double write into the + * database. The database can be brought back into wlcr mode using a fresh snapshot. 
* *[TODO] A more secure way to set a database into master mode is to use the command * monetdb master <dbname> [ <optional snapshot path>] * which locks the database, takes a save copy, initializes the state chance. * * A fresh replica can be constructed as follows: - * monetdb replica <dbname> <mastername> + * monetdb replicate <dbname> <mastername> * * Instead of using the monetdb command line we can use the SQL calls directly * master() and replicate(), provided we start with a fresh database. * - * REPLICAS + * CLONE * - * A fresh database can be turned into a replica using the call - * CALL replicate("mastername") + * Every clone should start off with a copy of the binary snapshot identified by 'snapshot'. + * A fresh database can be turned into a clone using the call + * CALL replicate('mastername') * It will grab the latest snapshot of the master and applies all - * known log files before releasing the system. Progress of + * available log files before releasing the system. Progress of * the replication can be monitored using the -fraw option in mclient. + * The master has no knowledge about the number of clones and their whereabouts. * - * It will iterate through the log files, applying all transactions. - * It excludes catalog and update queries, which are always executed. - * Queries are simply ignored. + * The clone process will iterate in the background through the log files, + * applying all update transactions. * - * The alternative is to also replay the queries . - * CALL replaythreshold(threshold) - * In this mode all pure queries are also executed for which the reported threshold exceeds the argument. - * Enabling the query log collects the execution times for these queries. + * An optional timestamp or transaction id can be added to apply the logs until + * a given moment. This is particularly handy when an unexpected + * disastrous user action (drop persistent table) has to be recovered from. 
+ * + * CALL replicate('mastername'); + * CALL replicate('mastername',NOW()); -- stops after we are in sync + * ... + * CALL replicate(NOW()); -- partial roll forward + * ... + * CALL replicate(); -- continue undisturbed + * + * SELECT replicaClock(); + * returns the timestamp of the last replicated transaction. + * SELECT replicaBacklog(); + * returns the number of pending transactions to be in sync with master. + * SELECT masterClock(); + * returns the timestamp of the last committed transaction in the master. * * Any failure encountered during a log replay terminates the replication process, - * leaving a message in the merovingian log. - * - * The replica creation can be suspended at the master and at the clone. - * It will continue after the corresponding resume* operation is issues. + * leaving a message in the merovingian log configuration. * * The wlcr files purposely have a textual format derived from the MAL statements. - * Simplicity and ease of control has been the driving argument here. + * This provides a stepping stone for remote execution later. * * [TODO] consider the roll forward of SQL session variables, i.e. optimizer_pipe (for now assume default pipe). 
- * [TODO] The status of the master/replica should be accessible for inspection * [TODO] the user might want to indicate a time-stamp, to rebuild to a certain point * */ @@ -129,23 +138,20 @@ #include "mal_builder.h" #include "wlcr.h" -static MT_Lock wlcr_lock MT_LOCK_INITIALIZER("wlcr_lock"); +MT_Lock wlcr_lock MT_LOCK_INITIALIZER("wlcr_lock"); -static str wlcr_snapshot= 0; // The location of the snapshot against which the logs work -static str wlcr_logs = 0; // The location in the global file store for the logs -static stream *wlcr_fd = 0; -static int wlcr_start = 0; // time stamp of first transaction in log file -static int wlcr_state = 0; // The current status of the in the life cycle -static int wlcr_tag = 0; // number of database chancing transactions -static int wlcr_pausetag = 0; // number of database chancing transactions when pausing +static char wlc_snapshot[PATHLENGTH]; // The location of the snapshot against which the logs work +static lng wlc_start= 0; // Start time of first transaction +static stream *wlc_fd = 0; // These properties are needed by the replica to direct the roll-forward. -int wlcr_threshold = 0; // should be set to -1 for production -str wlcr_dbname = 0; // The master database name -int wlcr_firstbatch = 0; // first log file associated with the snapshot -int wlcr_batches = 0; // identifier of next batch -int wlcr_drift = 10; // maximal period covered by a single log file in seconds -int wlcr_rollback= 0; // also log the aborted queries. 
+char wlc_dir[PATHLENGTH]; // The location in the global file store for the logs +char wlc_name[IDLENGTH]; // The master database name +lng wlc_id = 0; // next transaction id +int wlc_state = 0; // The current status of the in the life cycle +char wlc_write[26]; // The timestamp of the last committed transaction +int wlc_batches = 0; // identifier of next batch +int wlc_drift = 10; // maximal period covered by a single log file in seconds /* The database snapshots are binary copies of the dbfarm/database/bat * New snapshots are created currently using the 'monetdb snapshot <db>' command @@ -157,67 +163,67 @@ int wlcr_rollback= 0; // also log the a int WLCused(void) { _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list