The following pull request was submitted through GitHub.
It can be accessed and reviewed at: https://github.com/lxc/lxd/pull/2191

This e-mail was sent by the LXC bot; direct replies will not reach the author
unless they happen to be subscribed to this list.

=== Description (from pull-request) ===

From bde987ddbf167f5e848d001b4e28d828d6b093b9 Mon Sep 17 00:00:00 2001
From: Tycho Andersen <tycho.ander...@canonical.com>
Date: Thu, 7 Jul 2016 22:28:32 +0000
Subject: [PATCH 1/2] make client.websocket a public API

We'll use this in the next patch.

Signed-off-by: Tycho Andersen <tycho.ander...@canonical.com>
---
 client.go | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/client.go b/client.go
index 3567127..3e0d791 100644
--- a/client.go
+++ b/client.go
@@ -482,7 +482,7 @@ func (c *Client) delete(base string, args interface{}, 
rtype ResponseType) (*Res
        return HoistResponse(resp, rtype)
 }
 
-func (c *Client) websocket(operation string, secret string) (*websocket.Conn, 
error) {
+func (c *Client) Websocket(operation string, secret string) (*websocket.Conn, 
error) {
        query := url.Values{"secret": []string{secret}}
        url := c.BaseWSURL + path.Join(operation, "websocket") + "?" + 
query.Encode()
        return WebsocketDial(c.websocketDialer, url)
@@ -1513,7 +1513,7 @@ func (c *Client) Exec(name string, cmd []string, env 
map[string]string,
        if controlHandler != nil {
                var control *websocket.Conn
                if wsControl, ok := fds["control"]; ok {
-                       control, err = c.websocket(resp.Operation, 
wsControl.(string))
+                       control, err = c.Websocket(resp.Operation, 
wsControl.(string))
                        if err != nil {
                                return -1, err
                        }
@@ -1522,7 +1522,7 @@ func (c *Client) Exec(name string, cmd []string, env 
map[string]string,
                        go controlHandler(c, control)
                }
 
-               conn, err := c.websocket(resp.Operation, fds["0"].(string))
+               conn, err := c.Websocket(resp.Operation, fds["0"].(string))
                if err != nil {
                        return -1, err
                }
@@ -1535,7 +1535,7 @@ func (c *Client) Exec(name string, cmd []string, env 
map[string]string,
                conns := make([]*websocket.Conn, 3)
                dones := make([]chan bool, 3)
 
-               conns[0], err = c.websocket(resp.Operation, 
fds[strconv.Itoa(0)].(string))
+               conns[0], err = c.Websocket(resp.Operation, 
fds[strconv.Itoa(0)].(string))
                if err != nil {
                        return -1, err
                }
@@ -1545,7 +1545,7 @@ func (c *Client) Exec(name string, cmd []string, env 
map[string]string,
 
                outputs := []io.WriteCloser{stdout, stderr}
                for i := 1; i < 3; i++ {
-                       conns[i], err = c.websocket(resp.Operation, 
fds[strconv.Itoa(i)].(string))
+                       conns[i], err = c.Websocket(resp.Operation, 
fds[strconv.Itoa(i)].(string))
                        if err != nil {
                                return -1, err
                        }

From b8078bc35703bdf09062adfc3491bbc70ba2aa14 Mon Sep 17 00:00:00 2001
From: Tycho Andersen <tycho.ander...@canonical.com>
Date: Thu, 7 Jul 2016 02:35:44 +0000
Subject: [PATCH 2/2] resume dumped container on failed restore

This commit implements "freezing" the dumped container until we are sure
that the restore was successful. Because of various issues (namely, TCP
socket repair mode and windowing, but also various other problems), we
can't simply implement some kind of --leave-frozen option in CRIU.

Instead, we have CRIU do a callback when the dump is done but the container
is still frozen, so that we can then try the restore and see if it succeeds.

Signed-off-by: Tycho Andersen <tycho.ander...@canonical.com>
---
 lxd/container.go     |   7 +++-
 lxd/container_lxc.go |  14 +++++--
 lxd/main.go          |  23 +++++++++++
 lxd/migrate.go       | 115 ++++++++++++++++++++++++++++++++++++++++++++++++---
 4 files changed, 147 insertions(+), 12 deletions(-)

diff --git a/lxd/container.go b/lxd/container.go
index dd1038c..8242de7 100644
--- a/lxd/container.go
+++ b/lxd/container.go
@@ -271,7 +271,10 @@ type container interface {
 
        // Snapshots & migration
        Restore(sourceContainer container) error
-       Migrate(cmd uint, stateDir string, function string, stop bool) error
+       /* actionScript here is a script called action.sh in the stateDir, to
+        * be passed to CRIU as --action-script
+        */
+       Migrate(cmd uint, stateDir string, function string, stop bool, 
actionScript bool) error
        Snapshots() ([]container, error)
 
        // Config handling
@@ -457,7 +460,7 @@ func containerCreateAsSnapshot(d *Daemon, args 
containerArgs, sourceContainer co
                 * after snapshotting will fail.
                 */
 
-               err = sourceContainer.Migrate(lxc.MIGRATE_DUMP, stateDir, 
"snapshot", false)
+               err = sourceContainer.Migrate(lxc.MIGRATE_DUMP, stateDir, 
"snapshot", false, false)
                if err != nil {
                        os.RemoveAll(sourceContainer.StatePath())
                        return nil, err
diff --git a/lxd/container_lxc.go b/lxd/container_lxc.go
index 8761e3e..2cb3521 100644
--- a/lxd/container_lxc.go
+++ b/lxd/container_lxc.go
@@ -1198,7 +1198,7 @@ func (c *containerLXC) Start(stateful bool) error {
                        return fmt.Errorf("Container has no existing state to 
restore.")
                }
 
-               err := c.Migrate(lxc.MIGRATE_RESTORE, c.StatePath(), 
"snapshot", false)
+               err := c.Migrate(lxc.MIGRATE_RESTORE, c.StatePath(), 
"snapshot", false, false)
                if err != nil && !c.IsRunning() {
                        return err
                }
@@ -1364,7 +1364,7 @@ func (c *containerLXC) Stop(stateful bool) error {
                }
 
                // Checkpoint
-               err = c.Migrate(lxc.MIGRATE_DUMP, stateDir, "snapshot", true)
+               err = c.Migrate(lxc.MIGRATE_DUMP, stateDir, "snapshot", true, 
false)
                if err != nil {
                        return err
                }
@@ -1740,7 +1740,7 @@ func (c *containerLXC) Restore(sourceContainer container) 
error {
        // If the container wasn't running but was stateful, should we restore
        // it as running?
        if shared.PathExists(c.StatePath()) {
-               if err := c.Migrate(lxc.MIGRATE_RESTORE, c.StatePath(), 
"snapshot", false); err != nil {
+               if err := c.Migrate(lxc.MIGRATE_RESTORE, c.StatePath(), 
"snapshot", false, false); err != nil {
                        return err
                }
 
@@ -2661,7 +2661,7 @@ func findCriu(host string) error {
        return nil
 }
 
-func (c *containerLXC) Migrate(cmd uint, stateDir string, function string, 
stop bool) error {
+func (c *containerLXC) Migrate(cmd uint, stateDir string, function string, 
stop bool, actionScript bool) error {
        if err := findCriu(function); err != nil {
                return err
        }
@@ -2747,11 +2747,17 @@ func (c *containerLXC) Migrate(cmd uint, stateDir 
string, function string, stop
                        return err
                }
 
+               script := ""
+               if actionScript {
+                       script = filepath.Join(stateDir, "action.sh")
+               }
+
                opts := lxc.MigrateOptions{
                        Stop:            stop,
                        Directory:       stateDir,
                        Verbose:         true,
                        PreservesInodes: preservesInodes,
+                       ActionScript:    script,
                }
 
                migrateErr = c.c.Migrate(cmd, opts)
diff --git a/lxd/main.go b/lxd/main.go
index 9da7563..4eb4372 100644
--- a/lxd/main.go
+++ b/lxd/main.go
@@ -158,6 +158,8 @@ func run() error {
                fmt.Printf("        Start a container\n")
                fmt.Printf("    callhook\n")
                fmt.Printf("        Call a container hook\n")
+               fmt.Printf("    migratedumpsuccess\n")
+               fmt.Printf("        Indicate that a migration dump was 
successful\n")
                fmt.Printf("    netcat\n")
                fmt.Printf("        Mirror a unix socket to stdin/stdout\n")
        }
@@ -237,6 +239,8 @@ func run() error {
                        os.Exit(ret)
                case "netcat":
                        return Netcat(os.Args[1:])
+               case "migratedumpsuccess":
+                       return cmdMigrateDumpSuccess(os.Args[1:])
                }
        }
 
@@ -1050,3 +1054,22 @@ func printnet() error {
 
        return nil
 }
+
+func cmdMigrateDumpSuccess(args []string) error {
+       if len(args) != 3 {
+               return fmt.Errorf("bad migrate dump success args %s", args)
+       }
+
+       c, err := lxd.NewClient(&lxd.DefaultConfig, "local")
+       if err != nil {
+               return err
+       }
+
+       conn, err := c.Websocket(args[1], args[2])
+       if err != nil {
+               return err
+       }
+       conn.Close()
+
+       return c.WaitForSuccess(args[1])
+}
diff --git a/lxd/migrate.go b/lxd/migrate.go
index 14e9c00..5377a61 100644
--- a/lxd/migrate.go
+++ b/lxd/migrate.go
@@ -11,6 +11,7 @@ import (
        "net/http"
        "net/url"
        "os"
+       "path/filepath"
        "strconv"
        "strings"
        "sync"
@@ -228,7 +229,28 @@ func (s *migrationSourceWs) Connect(op *operation, r 
*http.Request, w http.Respo
        return nil
 }
 
-func (s *migrationSourceWs) Do(op *operation) error {
+func writeActionScript(directory string, operation string, secret string) 
error {
+       script := fmt.Sprintf(`#!/bin/sh -e
+if [ "$CRTOOLS_SCRIPT_ACTION" = "post-dump" ]; then
+       %s migratedumpsuccess %s %s
+fi
+`, execPath, operation, secret)
+
+       f, err := os.Create(filepath.Join(directory, "action.sh"))
+       if err != nil {
+               return err
+       }
+       defer f.Close()
+
+       if err := f.Chmod(0500); err != nil {
+               return err
+       }
+
+       _, err = f.WriteString(script)
+       return err
+}
+
+func (s *migrationSourceWs) Do(migrateOp *operation) error {
        <-s.allConnected
 
        criuType := CRIUType_CRIU_RSYNC.Enum()
@@ -326,15 +348,98 @@ func (s *migrationSourceWs) Do(op *operation) error {
                        return abort(fmt.Errorf("Formats other than criu rsync 
not understood"))
                }
 
+               /* What happens below is slightly convoluted. Due to various
+                * complications with networking, there's no easy way for criu
+                * to exit and leave the container in a frozen state for us to
+                * somehow resume later.
+                *
+                * Instead, we use what criu calls an "action-script", which is
+                * basically a callback that lets us know when the dump is
+                * done. (Unfortunately, we can't pass arguments, just an
+                * executable path, so we write a custom action script with the
+                * real command we want to run.)
+                *
+                * This script then hangs until the migration operation either
+                * fails or finishes successfully, and exits 1 or 0, which
+                * causes criu to either leave the container running or kill it
+                * as we asked.
+                */
+               dumpDone := make(chan bool, 1)
+               actionScriptOpSecret, err := shared.RandomCryptoString()
+               if err != nil {
+                       return abort(err)
+               }
+
+               actionScriptOp, err := operationCreate(
+                       operationClassWebsocket,
+                       nil,
+                       nil,
+                       func(op *operation) error {
+                               _, err := migrateOp.WaitFinal(-1)
+                               if err != nil {
+                                       return err
+                               }
+
+                               if migrateOp.status != shared.Success {
+                                       return fmt.Errorf("restore failed: %s", 
op.status.String())
+                               }
+                               return nil
+                       },
+                       nil,
+                       func(op *operation, r *http.Request, w 
http.ResponseWriter) error {
+                               secret := r.FormValue("secret")
+                               if secret == "" {
+                                       return fmt.Errorf("missing secret")
+                               }
+
+                               if secret != actionScriptOpSecret {
+                                       return os.ErrPermission
+                               }
+
+                               c, err := shared.WebsocketUpgrader.Upgrade(w, 
r, nil)
+                               if err != nil {
+                                       return err
+                               }
+
+                               dumpDone <- true
+
+                               closeMsg := 
websocket.FormatCloseMessage(websocket.CloseNormalClosure, "")
+                               return c.WriteMessage(websocket.CloseMessage, 
closeMsg)
+                       },
+               )
+               if err != nil {
+                       return abort(err)
+               }
+
                checkpointDir, err := ioutil.TempDir("", "lxd_checkpoint_")
                if err != nil {
                        return abort(err)
                }
-               defer os.RemoveAll(checkpointDir)
 
-               err = s.container.Migrate(lxc.MIGRATE_DUMP, checkpointDir, 
"migration", true)
+               if err := writeActionScript(checkpointDir, actionScriptOp.url, 
actionScriptOpSecret); err != nil {
+                       os.RemoveAll(checkpointDir)
+                       return abort(err)
+               }
+
+               _, err = actionScriptOp.Run()
                if err != nil {
+                       os.RemoveAll(checkpointDir)
+                       return abort(err)
+               }
+
+               migrateDone := make(chan error, 1)
+               go func() {
+                       defer os.RemoveAll(checkpointDir)
+                       migrateDone <- s.container.Migrate(lxc.MIGRATE_DUMP, 
checkpointDir, "migration", true, true)
+               }()
+
+               select {
+               /* the checkpoint failed, let's just abort */
+               case err = <-migrateDone:
                        return abort(err)
+               /* the dump finished, let's continue on to the restore */
+               case <-dumpDone:
+                       shared.Debugf("Dump finished, continuing with 
restore...")
                }
 
                /*
@@ -361,8 +466,6 @@ func (s *migrationSourceWs) Do(op *operation) error {
                return err
        }
 
-       // TODO: should we add some config here about automatically restarting
-       // the container migrate failure? What about the failures above?
        if !*msg.Success {
                return fmt.Errorf(*msg.Message)
        }
@@ -559,7 +662,7 @@ func (c *migrationSink) do() error {
                }
 
                if c.live {
-                       err = c.container.Migrate(lxc.MIGRATE_RESTORE, 
imagesDir, "migration", false)
+                       err = c.container.Migrate(lxc.MIGRATE_RESTORE, 
imagesDir, "migration", false, false)
                        if err != nil {
                                restore <- err
                                return
_______________________________________________
lxc-devel mailing list
lxc-devel@lists.linuxcontainers.org
http://lists.linuxcontainers.org/listinfo/lxc-devel

Reply via email to