The following pull request was submitted through Github.
It can be accessed and reviewed at: https://github.com/lxc/lxd/pull/4983

This e-mail was sent by the LXC bot, direct replies will not reach the author
unless they happen to be subscribed to this list.

=== Description (from pull-request) ===
This makes database queries timeout after 10 seconds if quorum is lost **after** the node had initially started fine (e.g. cluster was up but a majority of db nodes were shutdown), which is probably the most common case we want to address.

What it does **not** do is trying to make the node minimally operational (e.g. server responding to "lxc info" without hanging forever) in the case where the daemon is still going through its startup sequence and it's stuck waiting for other nodes to be upgraded, since it found that its own version is more recent. That information is of course present in the logs of the daemon tho, so it should be pretty obvious to figure what's wrong with the stuck client.

Given that we're going to trigger snap refreshes explicitly when we detect that one node has upgraded, probably it's not worth too much polishing the UX for that window of time. It desired, that can be done in a separate branch.

(partially) Fixes #4479.

Signed-off-by: Free Ekanayaka <[email protected]>
From 066ea3acbe4da5a7e1a292df203f933fd7a56fdd Mon Sep 17 00:00:00 2001
From: Free Ekanayaka <[email protected]>
Date: Wed, 29 Aug 2018 12:57:54 +0200
Subject: [PATCH] Make database queries timeout after 10s if cluster db is
 unavail

Signed-off-by: Free Ekanayaka <[email protected]>
---
 lxd/api.go                     |  4 +++-
 lxd/api_cluster.go             |  1 +
 lxd/cluster/heartbeat_test.go  |  2 +-
 lxd/cluster/membership_test.go |  6 ++++--
 lxd/daemon.go                  |  3 ++-
 lxd/db/db.go                   | 13 ++++++-------
 lxd/db/query/retry.go          |  2 +-
 lxd/db/testing.go              |  3 ++-
 lxd/response.go                |  6 +++++-
 9 files changed, 25 insertions(+), 15 deletions(-)

diff --git a/lxd/api.go b/lxd/api.go
index f6509de1c3..2ab2736697 100644
--- a/lxd/api.go
+++ b/lxd/api.go
@@ -63,7 +63,9 @@ func (s *lxdHttpServer) ServeHTTP(rw http.ResponseWriter, req 
*http.Request) {
                        return nil
                })
                if err != nil {
-                       http.Error(rw, err.Error(), 
http.StatusInternalServerError)
+                       response := SmartError(err)
+                       response.Render(rw)
+                       return
                }
        }
 
diff --git a/lxd/api_cluster.go b/lxd/api_cluster.go
index 785100ea4a..c585bca840 100644
--- a/lxd/api_cluster.go
+++ b/lxd/api_cluster.go
@@ -477,6 +477,7 @@ func clusterPutDisable(d *Daemon) Response {
        store := d.gateway.ServerStore()
        d.cluster, err = db.OpenCluster(
                "db.bin", store, address, "/unused/db/dir",
+               d.config.DqliteSetupTimeout,
                dqlite.WithDialFunc(d.gateway.DialFunc()),
                dqlite.WithContext(d.gateway.Context()),
        )
diff --git a/lxd/cluster/heartbeat_test.go b/lxd/cluster/heartbeat_test.go
index b6d698ad59..68c9883518 100644
--- a/lxd/cluster/heartbeat_test.go
+++ b/lxd/cluster/heartbeat_test.go
@@ -253,7 +253,7 @@ func (f *heartbeatFixture) node() (*state.State, 
*cluster.Gateway, string) {
        store := gateway.ServerStore()
        dial := gateway.DialFunc()
        state.Cluster, err = db.OpenCluster(
-               "db.bin", store, address, "/unused/db/dir", 
dqlite.WithDialFunc(dial))
+               "db.bin", store, address, "/unused/db/dir", 5*time.Second, 
dqlite.WithDialFunc(dial))
        require.NoError(f.t, err)
 
        f.gateways[len(f.gateways)] = gateway
diff --git a/lxd/cluster/membership_test.go b/lxd/cluster/membership_test.go
index 85707961b6..a954381e2d 100644
--- a/lxd/cluster/membership_test.go
+++ b/lxd/cluster/membership_test.go
@@ -6,6 +6,7 @@ import (
        "net/http"
        "path/filepath"
        "testing"
+       "time"
 
        "github.com/CanonicalLtd/go-dqlite"
        "github.com/lxc/lxd/lxd/cluster"
@@ -258,6 +259,7 @@ func TestJoin(t *testing.T) {
        var err error
        targetState.Cluster, err = db.OpenCluster(
                "db.bin", targetStore, targetAddress, "/unused/db/dir",
+               10*time.Second,
                dqlite.WithDialFunc(targetDialFunc))
        require.NoError(t, err)
 
@@ -294,7 +296,7 @@ func TestJoin(t *testing.T) {
        dialFunc := gateway.DialFunc()
 
        state.Cluster, err = db.OpenCluster(
-               "db.bin", store, address, "/unused/db/dir", 
dqlite.WithDialFunc(dialFunc))
+               "db.bin", store, address, "/unused/db/dir", 5*time.Second, 
dqlite.WithDialFunc(dialFunc))
        require.NoError(t, err)
 
        f := &membershipFixtures{t: t, state: state}
@@ -382,7 +384,7 @@ func FLAKY_TestPromote(t *testing.T) {
        store := targetGateway.ServerStore()
        dialFunc := targetGateway.DialFunc()
        targetState.Cluster, err = db.OpenCluster(
-               "db.bin", store, targetAddress, "/unused/db/dir", 
dqlite.WithDialFunc(dialFunc))
+               "db.bin", store, targetAddress, "/unused/db/dir", 
5*time.Second, dqlite.WithDialFunc(dialFunc))
        require.NoError(t, err)
        targetF := &membershipFixtures{t: t, state: targetState}
        targetF.NetworkAddress(targetAddress)
diff --git a/lxd/daemon.go b/lxd/daemon.go
index 4be1835898..55b7841033 100644
--- a/lxd/daemon.go
+++ b/lxd/daemon.go
@@ -475,9 +475,10 @@ func (d *Daemon) init() error {
                store := d.gateway.ServerStore()
                d.cluster, err = db.OpenCluster(
                        "db.bin", store, address, dir,
+                       d.config.DqliteSetupTimeout,
                        dqlite.WithDialFunc(d.gateway.DialFunc()),
                        dqlite.WithContext(d.gateway.Context()),
-                       
dqlite.WithConnectionTimeout(d.config.DqliteSetupTimeout),
+                       dqlite.WithConnectionTimeout(10*time.Second),
                        dqlite.WithLogFunc(cluster.DqliteLog),
                )
                if err == nil {
diff --git a/lxd/db/db.go b/lxd/db/db.go
index 6c6954191b..40e4036d65 100644
--- a/lxd/db/db.go
+++ b/lxd/db/db.go
@@ -8,7 +8,6 @@ import (
 
        "github.com/CanonicalLtd/go-dqlite"
        "github.com/pkg/errors"
-       "golang.org/x/net/context"
 
        "github.com/lxc/lxd/lxd/db/cluster"
        "github.com/lxc/lxd/lxd/db/node"
@@ -158,15 +157,15 @@ type Cluster struct {
 // database matches our version, and possibly trigger a schema update. If the
 // schema update can't be performed right now, because some nodes are still
 // behind, an Upgrading error is returned.
-func OpenCluster(name string, store dqlite.ServerStore, address, dir string, 
options ...dqlite.DriverOption) (*Cluster, error) {
+func OpenCluster(name string, store dqlite.ServerStore, address, dir string, 
timeout time.Duration, options ...dqlite.DriverOption) (*Cluster, error) {
        db, err := cluster.Open(name, store, options...)
        if err != nil {
                return nil, errors.Wrap(err, "failed to open database")
        }
 
-       // Test that the cluster database is operational. We wait up to 10
-       // minutes, in case there's no quorum of nodes online yet.
-       timeout := time.After(10 * time.Minute)
+       // Test that the cluster database is operational. We wait up to the
+       // given timeout , in case there's no quorum of nodes online yet.
+       timer := time.After(timeout)
        for i := 0; ; i++ {
                // Log initial attempts at debug level, but use warn
                // level after the 5'th attempt (about 10 seconds).
@@ -186,7 +185,7 @@ func OpenCluster(name string, store dqlite.ServerStore, 
address, dir string, opt
                }
 
                cause := errors.Cause(err)
-               if cause != context.DeadlineExceeded {
+               if cause != dqlite.ErrNoAvailableLeader {
                        return nil, err
                }
 
@@ -199,7 +198,7 @@ func OpenCluster(name string, store dqlite.ServerStore, 
address, dir string, opt
 
                time.Sleep(2 * time.Second)
                select {
-               case <-timeout:
+               case <-timer:
                        return nil, fmt.Errorf("failed to connect to cluster 
database")
                default:
                }
diff --git a/lxd/db/query/retry.go b/lxd/db/query/retry.go
index 0c78a8c7f7..1b288377a4 100644
--- a/lxd/db/query/retry.go
+++ b/lxd/db/query/retry.go
@@ -16,7 +16,7 @@ import (
 func Retry(f func() error) error {
        // TODO: the retry loop should be configurable.
        var err error
-       for i := 0; i < 20; i++ {
+       for i := 0; i < 5; i++ {
                err = f()
                if err != nil {
                        logger.Debugf("Database error: %#v", err)
diff --git a/lxd/db/testing.go b/lxd/db/testing.go
index 18a59e3031..850b474a0f 100644
--- a/lxd/db/testing.go
+++ b/lxd/db/testing.go
@@ -7,6 +7,7 @@ import (
        "net"
        "os"
        "testing"
+       "time"
 
        "github.com/CanonicalLtd/go-dqlite"
        "github.com/CanonicalLtd/raft-test"
@@ -63,7 +64,7 @@ func NewTestCluster(t *testing.T) (*Cluster, func()) {
        }
 
        cluster, err := OpenCluster(
-               "test.db", store, "1", "/unused/db/dir",
+               "test.db", store, "1", "/unused/db/dir", 5*time.Second,
                dqlite.WithLogFunc(log), dqlite.WithDialFunc(dial))
        require.NoError(t, err)
 
diff --git a/lxd/response.go b/lxd/response.go
index f4feb09769..dd56c0255b 100644
--- a/lxd/response.go
+++ b/lxd/response.go
@@ -11,7 +11,9 @@ import (
        "os"
        "time"
 
+       dqlite "github.com/CanonicalLtd/go-dqlite"
        "github.com/mattn/go-sqlite3"
+       "github.com/pkg/errors"
 
        lxd "github.com/lxc/lxd/client"
        "github.com/lxc/lxd/lxd/cluster"
@@ -509,7 +511,7 @@ func PreconditionFailed(err error) Response {
  * SmartError returns the right error message based on err.
  */
 func SmartError(err error) Response {
-       switch err {
+       switch errors.Cause(err) {
        case nil:
                return EmptySyncResponse
        case os.ErrNotExist:
@@ -524,6 +526,8 @@ func SmartError(err error) Response {
                return Conflict(nil)
        case sqlite3.ErrConstraintUnique:
                return Conflict(nil)
+       case dqlite.ErrNoAvailableLeader:
+               return Unavailable(err)
        default:
                return InternalError(err)
        }
_______________________________________________
lxc-devel mailing list
[email protected]
http://lists.linuxcontainers.org/listinfo/lxc-devel

Reply via email to