Changeset: 9d2c0c613846 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=9d2c0c613846
Modified Files:
        common/utils/msabaoth.c
        tools/merovingian/daemon/client.c
        tools/merovingian/daemon/controlrunner.c
        tools/merovingian/daemon/forkmserver.c
        tools/merovingian/daemon/handlers.c
        tools/merovingian/daemon/merovingian.c
        tools/merovingian/daemon/merovingian.h
        tools/merovingian/daemon/multiplex-funnel.c
Branch: Nov2019
Log Message:

Fixed a bunch of race conditions + some code cleanup.
When an mserver crashes, it leaves behind the .started file.  If a new
mserver is then started, it first locks the database and then, later,
removes the .started file.  In the intervening time, the database
seems up (locked and .started file present) but isn't yet, causing
errors for new connections.
Apparently other things could go wrong as well causing the internal
state of monetdbd to become confused and causing messages about an
"impossible state".
We now use a per database lock inside monetdbd when checking whether a
server is up.  This means that if there are multiple clients
connecting, they all have to wait until the server is back up.  We
also remove the .started file before starting the server.
The only drawback is that the internal list of databases past and
present is never cleaned up.  If this becomes a problem, we need to
implement a fix for that.

This needs more testing.


diffs (truncated from 1068 to 300 lines):

diff --git a/common/utils/msabaoth.c b/common/utils/msabaoth.c
--- a/common/utils/msabaoth.c
+++ b/common/utils/msabaoth.c
@@ -611,7 +611,7 @@ msab_getSingleStatus(const char *pathbuf
         */
        snprintf(buf, sizeof(buf), "%s/%s/%s", pathbuf, dbname,
                         _sabaoth_internal_uuid);
-       if (stat(buf, &statbuf) != -1) {
+       if (stat(buf, &statbuf) == 0) {
                /* database has the same process signature as ours, which
                 * means, it must be us, rely on the uplog state */
                snprintf(log, sizeof(log), "%s/%s/%s", pathbuf, dbname, 
UPLOGFILE);
@@ -635,7 +635,7 @@ msab_getSingleStatus(const char *pathbuf
                        (void)fclose(f);
                }
        } else if (snprintf(buf, sizeof(buf), "%s/%s/%s", pathbuf, dbname, 
".gdk_lock"),
-                          ((fd = MT_lockf(buf, F_TEST, 4, 1)) == -2)) {
+                          ((fd = MT_lockf(buf, F_TLOCK, 4, 1)) == -2)) {
                /* Locking failed; this can be because the lockfile couldn't
                 * be created.  Probably there is no Mserver running for
                 * that case also.
@@ -644,15 +644,18 @@ msab_getSingleStatus(const char *pathbuf
        } else if (fd == -1) {
                /* file is locked, so mserver is running, see if the database
                 * has finished starting */
-               snprintf(buf, sizeof(buf), "%s/%s/%s",
-                                pathbuf, dbname, STARTEDFILE);
+               snprintf(buf, sizeof(buf), "%s/%s/%s", pathbuf, dbname, 
STARTEDFILE);
                if (stat(buf, &statbuf) == -1) {
                        sdb->state = SABdbStarting;
                } else {
                        sdb->state = SABdbRunning;
                }
        } else {
-               /* file is not locked, check for a crash in the uplog */
+               /* file was not locked (we just locked it), check for a crash
+                * in the uplog */
+               snprintf(log, sizeof(log), "%s/%s/%s", pathbuf, dbname, 
STARTEDFILE);
+               /* just to be sure, remove the .started file */
+               (void) remove(log);             /* may fail, that's fine */
                snprintf(log, sizeof(log), "%s/%s/%s", pathbuf, dbname, 
UPLOGFILE);
                if ((f = fopen(log, "r")) != NULL) {
                        (void)fseek(f, -1, SEEK_END);
@@ -669,28 +672,27 @@ msab_getSingleStatus(const char *pathbuf
                        /* no uplog, so presumably never started */
                        sdb->state = SABdbInactive;
                }
+               MT_lockf(buf, F_ULOCK, 4, 1);
+               close(fd);
        }
        snprintf(buf, sizeof(buf), "%s/%s/%s", pathbuf, dbname, 
MAINTENANCEFILE);
-       sdb->locked = stat(buf, &statbuf) != -1;
+       sdb->locked = stat(buf, &statbuf) == 0;
 
        /* add scenarios that are supported */
        sdb->scens = NULL;
        snprintf(buf, sizeof(buf), "%s/%s/%s", pathbuf, dbname, SCENARIOFILE);
        if ((f = fopen(buf, "r")) != NULL) {
                sablist* np = NULL;
-               while (fgets(data, 8095, f) != NULL) {
+               while (fgets(data, (int) sizeof(data), f) != NULL) {
                        if (*data != '\0' && data[strlen(data) - 1] == '\n')
                                data[strlen(data) - 1] = '\0';
                        if (sdb->scens == NULL) {
-                               sdb->scens = malloc(sizeof(sablist));
-                               sdb->scens->val = strdup(data);
-                               sdb->scens->next = NULL;
-                               np = sdb->scens;
+                               np = sdb->scens = malloc(sizeof(sablist));
                        } else {
                                np = np->next = malloc(sizeof(sablist));
-                               np->val = strdup(data);
-                               np->next = NULL;
                        }
+                       np->val = strdup(data);
+                       np->next = NULL;
                }
                (void)fclose(f);
        }
@@ -700,19 +702,16 @@ msab_getSingleStatus(const char *pathbuf
        snprintf(buf, sizeof(buf), "%s/%s/%s", pathbuf, dbname, CONNECTIONFILE);
        if ((f = fopen(buf, "r")) != NULL) {
                sablist* np = NULL;
-               while (fgets(data, 8095, f) != NULL) {
+               while (fgets(data, (int) sizeof(data), f) != NULL) {
                        if (*data != '\0' && data[strlen(data) - 1] == '\n')
                                data[strlen(data) - 1] = '\0';
                        if (sdb->conns == NULL) {
-                               sdb->conns = malloc(sizeof(sablist));
-                               sdb->conns->val = strdup(data);
-                               sdb->conns->next = NULL;
-                               np = sdb->conns;
+                               np = sdb->conns = malloc(sizeof(sablist));
                        } else {
                                np = np->next = malloc(sizeof(sablist));
-                               np->val = strdup(data);
-                               np->next = NULL;
                        }
+                       np->val = strdup(data);
+                       np->next = NULL;
                }
                (void)fclose(f);
        }
diff --git a/tools/merovingian/daemon/client.c 
b/tools/merovingian/daemon/client.c
--- a/tools/merovingian/daemon/client.c
+++ b/tools/merovingian/daemon/client.c
@@ -44,7 +44,7 @@
 struct threads {
        struct threads *next;
        pthread_t tid;
-       volatile char dead;
+       volatile bool dead;
 };
 struct clientdata {
        int sock;
@@ -85,7 +85,7 @@ handleClient(void *data)
        free(data);
        fdin = socket_rstream(sock, "merovingian<-client (read)");
        if (fdin == 0) {
-               self->dead = 1;
+               self->dead = true;
                return(newErr("merovingian-client inputstream problems"));
        }
        fdin = block_stream(fdin);
@@ -93,7 +93,7 @@ handleClient(void *data)
        fout = socket_wstream(sock, "merovingian->client (write)");
        if (fout == 0) {
                close_stream(fdin);
-               self->dead = 1;
+               self->dead = true;
                return(newErr("merovingian-client outputstream problems"));
        }
        fout = block_stream(fout);
@@ -137,7 +137,7 @@ handleClient(void *data)
                mnstr_flush(fout);
                close_stream(fout);
                close_stream(fdin);
-               self->dead = 1;
+               self->dead = true;
                return(e);
        }
        buf[sizeof(buf) - 1] = '\0';
@@ -158,7 +158,7 @@ handleClient(void *data)
                mnstr_flush(fout);
                close_stream(fout);
                close_stream(fdin);
-               self->dead = 1;
+               self->dead = true;
                return(e);
        }
 
@@ -174,7 +174,7 @@ handleClient(void *data)
                        mnstr_flush(fout);
                        close_stream(fout);
                        close_stream(fdin);
-                       self->dead = 1;
+                       self->dead = true;
                        return(e);
                }
                algo = passwd + 1;
@@ -185,7 +185,7 @@ handleClient(void *data)
                        mnstr_flush(fout);
                        close_stream(fout);
                        close_stream(fdin);
-                       self->dead = 1;
+                       self->dead = true;
                        return(e);
                }
                *s = 0;
@@ -196,7 +196,7 @@ handleClient(void *data)
                mnstr_flush(fout);
                close_stream(fout);
                close_stream(fdin);
-               self->dead = 1;
+               self->dead = true;
                return(e);
        }
 
@@ -211,7 +211,7 @@ handleClient(void *data)
                mnstr_flush(fout);
                close_stream(fout);
                close_stream(fdin);
-               self->dead = 1;
+               self->dead = true;
                return(e);
        }
 
@@ -229,7 +229,7 @@ handleClient(void *data)
                        mnstr_flush(fout);
                        close_stream(fout);
                        close_stream(fdin);
-                       self->dead = 1;
+                       self->dead = true;
                        return(e);
                } else {
                        *s = '\0';
@@ -243,7 +243,7 @@ handleClient(void *data)
                mnstr_flush(fout);
                close_stream(fout);
                close_stream(fdin);
-               self->dead = 1;
+               self->dead = true;
                return(newErr("client %s specified no database", host));
        }
 
@@ -253,7 +253,7 @@ handleClient(void *data)
                        control_handleclient(host, sock, fdin, fout);
                close_stream(fout);
                close_stream(fdin);
-               self->dead = 1;
+               self->dead = true;
                return(NO_ERR);
        }
 
@@ -283,7 +283,7 @@ handleClient(void *data)
                mnstr_flush(fout);
                close_stream(fout);
                close_stream(fdin);
-               self->dead = 1;
+               self->dead = true;
                return(e);
        }
        stat = top;
@@ -295,7 +295,7 @@ handleClient(void *data)
        {
                multiplexAddClient(top->dbname, sock, fout, fdin, host);
                msab_freeStatus(&top);
-               self->dead = 1;
+               self->dead = true;
                return(NO_ERR);
        }
 
@@ -324,7 +324,7 @@ handleClient(void *data)
                close_stream(fout);
                close_stream(fdin);
                msab_freeStatus(&top);
-               self->dead = 1;
+               self->dead = true;
                return(e);
        }
 
@@ -395,13 +395,13 @@ handleClient(void *data)
                        close_stream(fdin);
                        Mfprintf(stdout, "starting a proxy failed: %s\n", e);
                        msab_freeStatus(&top);
-                       self->dead = 1;
+                       self->dead = true;
                        return(e);
                }
        }
 
        msab_freeStatus(&top);
-       self->dead = 1;
+       self->dead = true;
        return(NO_ERR);
 }
 
@@ -624,7 +624,7 @@ acceptConnections(int sock, int usock)
                }
                data->sock = msgsock;
                data->isusock = isusock;
-               p->dead = 0;
+               p->dead = false;
                data->self = p;
                data->challenge[31] = '\0';
                generateSalt(data->challenge, 31);
diff --git a/tools/merovingian/daemon/controlrunner.c 
b/tools/merovingian/daemon/controlrunner.c
--- a/tools/merovingian/daemon/controlrunner.c
+++ b/tools/merovingian/daemon/controlrunner.c
@@ -355,9 +355,12 @@ static void ctl_handle_client(
                                dp = _mero_topdp->next; /* don't need the 
console/log */
                                while (dp != NULL) {
                                        if (dp->type == MERODB && 
strcmp(dp->dbname, q) == 0) {
+                                               if (dp->pid <= 0) {
+                                                       dp = NULL;
+                                                       /* unlock happens below 
*/
+                                                       break;
+                                               }
                                                if (strcmp(p, "stop") == 0) {
-                                                       pid_t pid = dp->pid;
-                                                       char *dbname = 
strdup(dp->dbname);
                                                        mtype type = dp->type;
                                                        
pthread_mutex_unlock(&_mero_topdp_lock);
                                                        /* Try to shutdown the 
profiler before the DB.
@@ -366,10 +369,12 @@ static void ctl_handle_client(
                                                         * other words: ignore 
any errors that shutdown_profiler
                                                         * may have encountered.
                                                         */
-                                                       
shutdown_profiler(dbname, &stats);
+                                                       
shutdown_profiler(dp->dbname, &stats);
                                                        if (stats != NULL)
                                                                
msab_freeStatus(&stats);
-                                                       terminateProcess(pid, 
dbname, type, 1);
+                                                       
pthread_mutex_lock(&dp->fork_lock);
+                                                       terminateProcess(dp, 
type);
+                                                       
pthread_mutex_unlock(&dp->fork_lock);
                                                        Mfprintf(_mero_ctlout, 
"%s: stopped "
                                                                        
"database '%s'\n", origin, q);
                                                } else {
diff --git a/tools/merovingian/daemon/forkmserver.c 
b/tools/merovingian/daemon/forkmserver.c
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to